htmlentities 3.0.0 → 3.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,29 +1,32 @@
1
+ == 3.0.1 (2006-04-09)
2
+ * Improved documentation.
3
+
1
4
  == 3.0.0 (2006-04-08)
2
5
  * Changed licence to MIT due to confusion with previous 'Fair' licence (my
3
6
  intention was to be liberal, not obscure).
4
7
  * Moved basic functionality out of String class; for previous behaviour,
5
- require 'htmlentities/string'
6
- * Changed version numbering scheme
7
- * Now available as a Gem
8
+ require 'htmlentities/string'.
9
+ * Changed version numbering scheme.
10
+ * Now available as a Gem.
8
11
 
9
12
  == 2.2 (2005-11-07)
10
- * Important bug fixes -- thanks to Moonwolf
13
+ * Important bug fixes -- thanks to Moonwolf.
11
14
  * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
12
- * Decimal decoding edge cases addressed
13
- * Test cases added
15
+ * Decimal decoding edge cases addressed.
16
+ * Test cases added.
14
17
 
15
18
  == 2.1 (2005-10-31)
16
- * Removed some unnecessary code in basic entity encoding
19
+ * Removed some unnecessary code in basic entity encoding.
17
20
  * Improved handling of encoding: commands are now automatically sorted, so the
18
- user doesn't have to worry about their order
19
- * Now using setup.rb
20
- * Tests moved to separate file
21
+ user doesn't have to worry about their order.
22
+ * Now using setup.rb.
23
+ * Tests moved to separate file.
21
24
 
22
25
  == 2.0 (2005-08-23)
23
- * Added encoding to entities
24
- * Decoding interface unchanged
25
- * Fixed a bug with handling high codepoints
26
+ * Added encoding to entities.
27
+ * Decoding interface unchanged.
28
+ * Fixed a bug with handling high codepoints.
26
29
 
27
30
  == 1.0 (2005-08-03)
28
- * Initial release
29
- * Decoding only
31
+ * Initial release.
32
+ * Decoding only.
@@ -2,85 +2,84 @@
2
2
  # HTML entity encoding and decoding for Ruby
3
3
  #
4
4
 
5
- module HTMLEntities # :nodoc:
5
+ module HTMLEntities
6
6
 
7
7
  class InstructionError < RuntimeError
8
8
  end
9
9
 
10
- #
11
- # MAP is a hash of all the HTML entities I could discover, as taken
12
- # from the w3schools page on the subject:
13
- # http://www.w3schools.com/html/html_entitiesref.asp
14
- # The format is 'entity name' => codepoint where entity name is given
15
- # without the surrounding ampersand and semicolon.
16
- #
17
- MAP = {
18
- 'quot' => 34, 'apos' => 39, 'amp' => 38,
19
- 'lt' => 60, 'gt' => 62, 'nbsp' => 160,
20
- 'iexcl' => 161, 'curren' => 164, 'cent' => 162,
21
- 'pound' => 163, 'yen' => 165, 'brvbar' => 166,
22
- 'sect' => 167, 'uml' => 168, 'copy' => 169,
23
- 'ordf' => 170, 'laquo' => 171, 'not' => 172,
24
- 'shy' => 173, 'reg' => 174, 'trade' => 8482,
25
- 'macr' => 175, 'deg' => 176, 'plusmn' => 177,
26
- 'sup2' => 178, 'sup3' => 179, 'acute' => 180,
27
- 'micro' => 181, 'para' => 182, 'middot' => 183,
28
- 'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
29
- 'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
30
- 'frac34' => 190, 'iquest' => 191, 'times' => 215,
31
- 'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
32
- 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
33
- 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
34
- 'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
35
- 'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
36
- 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
37
- 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
38
- 'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
39
- 'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
40
- 'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
41
- 'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
42
- 'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
43
- 'auml' => 228, 'aring' => 229, 'aelig' => 230,
44
- 'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
45
- 'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
46
- 'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
47
- 'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
48
- 'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
49
- 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
50
- 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
51
- 'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
52
- 'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
53
- 'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
54
- 'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
55
- 'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
56
- 'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
57
- 'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
58
- 'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
59
- 'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
60
- 'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
61
- 'rsaquo' => 8250, 'euro' => 8364
62
- }
10
+ module Data #:nodoc:
63
11
 
64
- MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
65
- MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
66
-
67
- # Precompile the regexp
68
- NAMED_ENTITY_REGEXP =
69
- /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
12
+ #
13
+ # MAP is a hash of all the HTML entities I could discover, as taken
14
+ # from the w3schools page on the subject:
15
+ # http://www.w3schools.com/html/html_entitiesref.asp
16
+ # The format is 'entity name' => codepoint where entity name is given
17
+ # without the surrounding ampersand and semicolon.
18
+ #
19
+ MAP = {
20
+ 'quot' => 34, 'apos' => 39, 'amp' => 38,
21
+ 'lt' => 60, 'gt' => 62, 'nbsp' => 160,
22
+ 'iexcl' => 161, 'curren' => 164, 'cent' => 162,
23
+ 'pound' => 163, 'yen' => 165, 'brvbar' => 166,
24
+ 'sect' => 167, 'uml' => 168, 'copy' => 169,
25
+ 'ordf' => 170, 'laquo' => 171, 'not' => 172,
26
+ 'shy' => 173, 'reg' => 174, 'trade' => 8482,
27
+ 'macr' => 175, 'deg' => 176, 'plusmn' => 177,
28
+ 'sup2' => 178, 'sup3' => 179, 'acute' => 180,
29
+ 'micro' => 181, 'para' => 182, 'middot' => 183,
30
+ 'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
31
+ 'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
32
+ 'frac34' => 190, 'iquest' => 191, 'times' => 215,
33
+ 'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
34
+ 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
35
+ 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
36
+ 'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
37
+ 'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
38
+ 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
39
+ 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
40
+ 'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
41
+ 'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
42
+ 'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
43
+ 'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
44
+ 'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
45
+ 'auml' => 228, 'aring' => 229, 'aelig' => 230,
46
+ 'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
47
+ 'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
48
+ 'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
49
+ 'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
50
+ 'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
51
+ 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
52
+ 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
53
+ 'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
54
+ 'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
55
+ 'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
56
+ 'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
57
+ 'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
58
+ 'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
59
+ 'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
60
+ 'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
61
+ 'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
62
+ 'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
63
+ 'rsaquo' => 8250, 'euro' => 8364
64
+ }
70
65
 
71
- # Reverse map for converting characters to named entities
72
- REVERSE_MAP = MAP.invert
66
+ MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
67
+ MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
68
+ NAMED_ENTITY_REGEXP = /&([a-z]{#{MIN_LENGTH},#{MAX_LENGTH}});/i
69
+ REVERSE_MAP = MAP.invert
73
70
 
74
- BASIC_ENTITY_REGEXP = /[<>'"&]/
71
+ BASIC_ENTITY_REGEXP = /[<>'"&]/
75
72
 
76
- UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
73
+ UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
77
74
 
78
- ENCODE_ENTITIES_COMMAND_ORDER = {
79
- :basic => 0,
80
- :named => 1,
81
- :decimal => 2,
82
- :hexadecimal => 3
83
- }
75
+ ENCODE_ENTITIES_COMMAND_ORDER = {
76
+ :basic => 0,
77
+ :named => 1,
78
+ :decimal => 2,
79
+ :hexadecimal => 3
80
+ }
81
+
82
+ end
84
83
 
85
84
  #
86
85
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
@@ -91,8 +90,8 @@ module HTMLEntities # :nodoc:
91
90
  # Unknown named entities are not converted
92
91
  #
93
92
  def decode_entities(string)
94
- return string.gsub(NAMED_ENTITY_REGEXP) {
95
- (cp = MAP[$1]) ? [cp].pack('U') : $&
93
+ return string.gsub(Data::NAMED_ENTITY_REGEXP) {
94
+ (cp = Data::MAP[$1]) ? [cp].pack('U') : $&
96
95
  }.gsub(/&#([0-9]{1,7});|&#x([0-9a-f]{1,6});/i) {
97
96
  $1 ? [$1.to_i].pack('U') : [$2.to_i(16)].pack('U')
98
97
  }
@@ -129,7 +128,7 @@ module HTMLEntities # :nodoc:
129
128
  instructions = [:basic]
130
129
  else
131
130
  instructions = instructions.sort_by { |instruction|
132
- ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
131
+ Data::ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
133
132
  (raise InstructionError, "unknown encode_entities command `#{instruction.inspect}'")
134
133
  }
135
134
  end
@@ -137,23 +136,23 @@ module HTMLEntities # :nodoc:
137
136
  case instruction
138
137
  when :basic
139
138
  # Handled as basic ASCII
140
- output = (output || string).gsub(BASIC_ENTITY_REGEXP) {
139
+ output = (output || string).gsub(Data::BASIC_ENTITY_REGEXP) {
141
140
  # It's safe to use the simpler [0] here because we know
142
141
  # that the basic entities are ASCII.
143
- '&' << REVERSE_MAP[$&[0]] << ';'
142
+ '&' << Data::REVERSE_MAP[$&[0]] << ';'
144
143
  }
145
144
  when :named
146
145
  # Test everything except printable ASCII
147
- output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
146
+ output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
148
147
  cp = $&.unpack('U')[0]
149
- (e = REVERSE_MAP[cp]) ? "&#{e};" : $&
148
+ (e = Data::REVERSE_MAP[cp]) ? "&#{e};" : $&
150
149
  }
151
150
  when :decimal
152
- output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
151
+ output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
153
152
  "&##{$&.unpack('U')[0]};"
154
153
  }
155
154
  when :hexadecimal
156
- output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
155
+ output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
157
156
  "&#x#{$&.unpack('U')[0].to_s(16)};"
158
157
  }
159
158
  end
@@ -1,15 +1,24 @@
1
1
  require 'htmlentities'
2
2
 
3
3
  #
4
- # This library extends the String class with methods to allow encoding and decoding of
4
+ # This file extends the String class with methods to allow encoding and decoding of
5
5
  # HTML/XML entities from/to their corresponding UTF-8 codepoints.
6
6
  #
7
7
  class String
8
8
 
9
+ #
10
+ # Decode XML and HTML 4.01 entities in a string into their UTF-8
11
+ # equivalents.
12
+ #
9
13
  def decode_entities
10
14
  return HTMLEntities.decode_entities(self)
11
15
  end
12
16
 
17
+ #
18
+ # Encode codepoints in a string into their corresponding entities. See
19
+ # the documentation of HTMLEntities.encode_entities for a list of possible
20
+ # instructions.
21
+ #
13
22
  def encode_entities(*instructions)
14
23
  return HTMLEntities.encode_entities(self, *instructions)
15
24
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: htmlentities
5
5
  version: !ruby/object:Gem::Version
6
- version: 3.0.0
7
- date: 2006-04-08 00:00:00 +01:00
6
+ version: 3.0.1
7
+ date: 2006-04-09 00:00:00 +01:00
8
8
  summary: A module for encoding and decoding of HTML/XML entities from/to their corresponding UTF-8 codepoints. Optional String class extension for same.
9
9
  require_paths:
10
10
  - lib