htmlentities 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,29 +1,32 @@
1
+ == 3.0.1 (2006-04-09)
2
+ * Improved documentation.
3
+
1
4
  == 3.0.0 (2006-04-08)
2
5
  * Changed licence to MIT due to confusion with previous 'Fair' licence (my
3
6
  intention was to be liberal, not obscure).
4
7
  * Moved basic functionality out of String class; for previous behaviour,
5
- require 'htmlentities/string'
6
- * Changed version numbering scheme
7
- * Now available as a Gem
8
+ require 'htmlentities/string'.
9
+ * Changed version numbering scheme.
10
+ * Now available as a Gem.
8
11
 
9
12
  == 2.2 (2005-11-07)
10
- * Important bug fixes -- thanks to Moonwolf
13
+ * Important bug fixes -- thanks to Moonwolf.
11
14
  * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
12
- * Decimal decoding edge cases addressed
13
- * Test cases added
15
+ * Decimal decoding edge cases addressed.
16
+ * Test cases added.
14
17
 
15
18
  == 2.1 (2005-10-31)
16
- * Removed some unnecessary code in basic entity encoding
19
+ * Removed some unnecessary code in basic entity encoding.
17
20
  * Improved handling of encoding: commands are now automatically sorted, so the
18
- user doesn't have to worry about their order
19
- * Now using setup.rb
20
- * Tests moved to separate file
21
+ user doesn't have to worry about their order.
22
+ * Now using setup.rb.
23
+ * Tests moved to separate file.
21
24
 
22
25
  == 2.0 (2005-08-23)
23
- * Added encoding to entities
24
- * Decoding interface unchanged
25
- * Fixed a bug with handling high codepoints
26
+ * Added encoding to entities.
27
+ * Decoding interface unchanged.
28
+ * Fixed a bug with handling high codepoints.
26
29
 
27
30
  == 1.0 (2005-08-03)
28
- * Initial release
29
- * Decoding only
31
+ * Initial release.
32
+ * Decoding only.
@@ -2,85 +2,84 @@
2
2
  # HTML entity encoding and decoding for Ruby
3
3
  #
4
4
 
5
- module HTMLEntities # :nodoc:
5
+ module HTMLEntities
6
6
 
7
7
  class InstructionError < RuntimeError
8
8
  end
9
9
 
10
- #
11
- # MAP is a hash of all the HTML entities I could discover, as taken
12
- # from the w3schools page on the subject:
13
- # http://www.w3schools.com/html/html_entitiesref.asp
14
- # The format is 'entity name' => codepoint where entity name is given
15
- # without the surrounding ampersand and semicolon.
16
- #
17
- MAP = {
18
- 'quot' => 34, 'apos' => 39, 'amp' => 38,
19
- 'lt' => 60, 'gt' => 62, 'nbsp' => 160,
20
- 'iexcl' => 161, 'curren' => 164, 'cent' => 162,
21
- 'pound' => 163, 'yen' => 165, 'brvbar' => 166,
22
- 'sect' => 167, 'uml' => 168, 'copy' => 169,
23
- 'ordf' => 170, 'laquo' => 171, 'not' => 172,
24
- 'shy' => 173, 'reg' => 174, 'trade' => 8482,
25
- 'macr' => 175, 'deg' => 176, 'plusmn' => 177,
26
- 'sup2' => 178, 'sup3' => 179, 'acute' => 180,
27
- 'micro' => 181, 'para' => 182, 'middot' => 183,
28
- 'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
29
- 'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
30
- 'frac34' => 190, 'iquest' => 191, 'times' => 215,
31
- 'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
32
- 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
33
- 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
34
- 'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
35
- 'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
36
- 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
37
- 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
38
- 'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
39
- 'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
40
- 'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
41
- 'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
42
- 'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
43
- 'auml' => 228, 'aring' => 229, 'aelig' => 230,
44
- 'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
45
- 'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
46
- 'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
47
- 'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
48
- 'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
49
- 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
50
- 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
51
- 'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
52
- 'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
53
- 'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
54
- 'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
55
- 'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
56
- 'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
57
- 'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
58
- 'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
59
- 'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
60
- 'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
61
- 'rsaquo' => 8250, 'euro' => 8364
62
- }
10
+ module Data #:nodoc:
63
11
 
64
- MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
65
- MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
66
-
67
- # Precompile the regexp
68
- NAMED_ENTITY_REGEXP =
69
- /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
12
+ #
13
+ # MAP is a hash of all the HTML entities I could discover, as taken
14
+ # from the w3schools page on the subject:
15
+ # http://www.w3schools.com/html/html_entitiesref.asp
16
+ # The format is 'entity name' => codepoint where entity name is given
17
+ # without the surrounding ampersand and semicolon.
18
+ #
19
+ MAP = {
20
+ 'quot' => 34, 'apos' => 39, 'amp' => 38,
21
+ 'lt' => 60, 'gt' => 62, 'nbsp' => 160,
22
+ 'iexcl' => 161, 'curren' => 164, 'cent' => 162,
23
+ 'pound' => 163, 'yen' => 165, 'brvbar' => 166,
24
+ 'sect' => 167, 'uml' => 168, 'copy' => 169,
25
+ 'ordf' => 170, 'laquo' => 171, 'not' => 172,
26
+ 'shy' => 173, 'reg' => 174, 'trade' => 8482,
27
+ 'macr' => 175, 'deg' => 176, 'plusmn' => 177,
28
+ 'sup2' => 178, 'sup3' => 179, 'acute' => 180,
29
+ 'micro' => 181, 'para' => 182, 'middot' => 183,
30
+ 'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
31
+ 'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
32
+ 'frac34' => 190, 'iquest' => 191, 'times' => 215,
33
+ 'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
34
+ 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
35
+ 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
36
+ 'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
37
+ 'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
38
+ 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
39
+ 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
40
+ 'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
41
+ 'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
42
+ 'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
43
+ 'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
44
+ 'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
45
+ 'auml' => 228, 'aring' => 229, 'aelig' => 230,
46
+ 'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
47
+ 'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
48
+ 'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
49
+ 'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
50
+ 'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
51
+ 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
52
+ 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
53
+ 'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
54
+ 'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
55
+ 'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
56
+ 'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
57
+ 'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
58
+ 'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
59
+ 'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
60
+ 'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
61
+ 'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
62
+ 'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
63
+ 'rsaquo' => 8250, 'euro' => 8364
64
+ }
70
65
 
71
- # Reverse map for converting characters to named entities
72
- REVERSE_MAP = MAP.invert
66
+ MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
67
+ MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
68
+ NAMED_ENTITY_REGEXP = /&([a-z]{#{MIN_LENGTH},#{MAX_LENGTH}});/i
69
+ REVERSE_MAP = MAP.invert
73
70
 
74
- BASIC_ENTITY_REGEXP = /[<>'"&]/
71
+ BASIC_ENTITY_REGEXP = /[<>'"&]/
75
72
 
76
- UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
73
+ UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
77
74
 
78
- ENCODE_ENTITIES_COMMAND_ORDER = {
79
- :basic => 0,
80
- :named => 1,
81
- :decimal => 2,
82
- :hexadecimal => 3
83
- }
75
+ ENCODE_ENTITIES_COMMAND_ORDER = {
76
+ :basic => 0,
77
+ :named => 1,
78
+ :decimal => 2,
79
+ :hexadecimal => 3
80
+ }
81
+
82
+ end
84
83
 
85
84
  #
86
85
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
@@ -91,8 +90,8 @@ module HTMLEntities # :nodoc:
91
90
  # Unknown named entities are not converted
92
91
  #
93
92
  def decode_entities(string)
94
- return string.gsub(NAMED_ENTITY_REGEXP) {
95
- (cp = MAP[$1]) ? [cp].pack('U') : $&
93
+ return string.gsub(Data::NAMED_ENTITY_REGEXP) {
94
+ (cp = Data::MAP[$1]) ? [cp].pack('U') : $&
96
95
  }.gsub(/&#([0-9]{1,7});|&#x([0-9a-f]{1,6});/i) {
97
96
  $1 ? [$1.to_i].pack('U') : [$2.to_i(16)].pack('U')
98
97
  }
@@ -129,7 +128,7 @@ module HTMLEntities # :nodoc:
129
128
  instructions = [:basic]
130
129
  else
131
130
  instructions = instructions.sort_by { |instruction|
132
- ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
131
+ Data::ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
133
132
  (raise InstructionError, "unknown encode_entities command `#{instruction.inspect}'")
134
133
  }
135
134
  end
@@ -137,23 +136,23 @@ module HTMLEntities # :nodoc:
137
136
  case instruction
138
137
  when :basic
139
138
  # Handled as basic ASCII
140
- output = (output || string).gsub(BASIC_ENTITY_REGEXP) {
139
+ output = (output || string).gsub(Data::BASIC_ENTITY_REGEXP) {
141
140
  # It's safe to use the simpler [0] here because we know
142
141
  # that the basic entities are ASCII.
143
- '&' << REVERSE_MAP[$&[0]] << ';'
142
+ '&' << Data::REVERSE_MAP[$&[0]] << ';'
144
143
  }
145
144
  when :named
146
145
  # Test everything except printable ASCII
147
- output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
146
+ output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
148
147
  cp = $&.unpack('U')[0]
149
- (e = REVERSE_MAP[cp]) ? "&#{e};" : $&
148
+ (e = Data::REVERSE_MAP[cp]) ? "&#{e};" : $&
150
149
  }
151
150
  when :decimal
152
- output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
151
+ output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
153
152
  "&##{$&.unpack('U')[0]};"
154
153
  }
155
154
  when :hexadecimal
156
- output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
155
+ output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
157
156
  "&#x#{$&.unpack('U')[0].to_s(16)};"
158
157
  }
159
158
  end
@@ -1,15 +1,24 @@
1
1
  require 'htmlentities'
2
2
 
3
3
  #
4
- # This library extends the String class with methods to allow encoding and decoding of
4
+ # This file extends the String class with methods to allow encoding and decoding of
5
5
  # HTML/XML entities from/to their corresponding UTF-8 codepoints.
6
6
  #
7
7
  class String
8
8
 
9
+ #
10
+ # Decode XML and HTML 4.01 entities in a string into their UTF-8
11
+ # equivalents.
12
+ #
9
13
  def decode_entities
10
14
  return HTMLEntities.decode_entities(self)
11
15
  end
12
16
 
17
+ #
18
+ # Encode codepoints in a string into their corresponding entities. See
19
+ # the documentation of HTMLEntities.encode_entities for a list of possible
20
+ # instructions.
21
+ #
13
22
  def encode_entities(*instructions)
14
23
  return HTMLEntities.encode_entities(self, *instructions)
15
24
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: htmlentities
5
5
  version: !ruby/object:Gem::Version
6
- version: 3.0.0
7
- date: 2006-04-08 00:00:00 +01:00
6
+ version: 3.0.1
7
+ date: 2006-04-09 00:00:00 +01:00
8
8
  summary: A module for encoding and decoding of HTML/XML entities from/to their corresponding UTF-8 codepoints. Optional String class extension for same.
9
9
  require_paths:
10
10
  - lib