htmlentities 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +18 -15
- data/lib/htmlentities.rb +78 -79
- data/lib/htmlentities/string.rb +10 -1
- metadata +2 -2
data/CHANGES
CHANGED
@@ -1,29 +1,32 @@
|
|
1
|
+
== 3.0.1 (2005-04-08)
|
2
|
+
* Improved documentation.
|
3
|
+
|
1
4
|
== 3.0.0 (2005-04-08)
|
2
5
|
* Changed licence to MIT due to confusion with previous 'Fair' licence (my
|
3
6
|
intention was to be liberal, not obscure).
|
4
7
|
* Moved basic functionality out of String class; for previous behaviour,
|
5
|
-
require 'htmlentities/string'
|
6
|
-
* Changed version numbering scheme
|
7
|
-
* Now available as a Gem
|
8
|
+
require 'htmlentities/string'.
|
9
|
+
* Changed version numbering scheme.
|
10
|
+
* Now available as a Gem.
|
8
11
|
|
9
12
|
== 2.2 (2005-11-07)
|
10
|
-
* Important bug fixes -- thanks to Moonwolf
|
13
|
+
* Important bug fixes -- thanks to Moonwolf.
|
11
14
|
* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
|
12
|
-
* Decimal decoding edge cases addressed
|
13
|
-
* Test cases added
|
15
|
+
* Decimal decoding edge cases addressed.
|
16
|
+
* Test cases added.
|
14
17
|
|
15
18
|
== 2.1 (2005-10-31)
|
16
|
-
* Removed some unnecessary code in basic entity encoding
|
19
|
+
* Removed some unnecessary code in basic entity encoding.
|
17
20
|
* Improved handling of encoding: commands are now automatically sorted, so the
|
18
|
-
user doesn't have to worry about their order
|
19
|
-
* Now using setup.rb
|
20
|
-
* Tests moved to separate file
|
21
|
+
user doesn't have to worry about their order.
|
22
|
+
* Now using setup.rb.
|
23
|
+
* Tests moved to separate file.
|
21
24
|
|
22
25
|
== 2.0 (2005-08-23)
|
23
|
-
* Added encoding to entities
|
24
|
-
* Decoding interface unchanged
|
25
|
-
* Fixed a bug with handling high codepoints
|
26
|
+
* Added encoding to entities.
|
27
|
+
* Decoding interface unchanged.
|
28
|
+
* Fixed a bug with handling high codepoints.
|
26
29
|
|
27
30
|
== 1.0 (2005-08-03)
|
28
|
-
* Initial release
|
29
|
-
* Decoding only
|
31
|
+
* Initial release.
|
32
|
+
* Decoding only.
|
data/lib/htmlentities.rb
CHANGED
@@ -2,85 +2,84 @@
|
|
2
2
|
# HTML entity encoding and decoding for Ruby
|
3
3
|
#
|
4
4
|
|
5
|
-
module HTMLEntities # :nodoc:
|
5
|
+
module HTMLEntities
|
6
6
|
|
7
7
|
class InstructionError < RuntimeError
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
# MAP is a hash of all the HTML entities I could discover, as taken
|
12
|
-
# from the w3schools page on the subject:
|
13
|
-
# http://www.w3schools.com/html/html_entitiesref.asp
|
14
|
-
# The format is 'entity name' => codepoint where entity name is given
|
15
|
-
# without the surrounding ampersand and semicolon.
|
16
|
-
#
|
17
|
-
MAP = {
|
18
|
-
'quot' => 34, 'apos' => 39, 'amp' => 38,
|
19
|
-
'lt' => 60, 'gt' => 62, 'nbsp' => 160,
|
20
|
-
'iexcl' => 161, 'curren' => 164, 'cent' => 162,
|
21
|
-
'pound' => 163, 'yen' => 165, 'brvbar' => 166,
|
22
|
-
'sect' => 167, 'uml' => 168, 'copy' => 169,
|
23
|
-
'ordf' => 170, 'laquo' => 171, 'not' => 172,
|
24
|
-
'shy' => 173, 'reg' => 174, 'trade' => 8482,
|
25
|
-
'macr' => 175, 'deg' => 176, 'plusmn' => 177,
|
26
|
-
'sup2' => 178, 'sup3' => 179, 'acute' => 180,
|
27
|
-
'micro' => 181, 'para' => 182, 'middot' => 183,
|
28
|
-
'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
|
29
|
-
'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
|
30
|
-
'frac34' => 190, 'iquest' => 191, 'times' => 215,
|
31
|
-
'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
|
32
|
-
'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
|
33
|
-
'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
|
34
|
-
'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
|
35
|
-
'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
|
36
|
-
'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
|
37
|
-
'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
|
38
|
-
'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
|
39
|
-
'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
|
40
|
-
'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
|
41
|
-
'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
|
42
|
-
'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
|
43
|
-
'auml' => 228, 'aring' => 229, 'aelig' => 230,
|
44
|
-
'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
|
45
|
-
'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
|
46
|
-
'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
|
47
|
-
'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
|
48
|
-
'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
|
49
|
-
'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
|
50
|
-
'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
|
51
|
-
'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
|
52
|
-
'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
|
53
|
-
'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
|
54
|
-
'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
|
55
|
-
'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
|
56
|
-
'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
|
57
|
-
'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
|
58
|
-
'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
|
59
|
-
'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
|
60
|
-
'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
|
61
|
-
'rsaquo' => 8250, 'euro' => 8364
|
62
|
-
}
|
10
|
+
module Data #:nodoc:
|
63
11
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
12
|
+
#
|
13
|
+
# MAP is a hash of all the HTML entities I could discover, as taken
|
14
|
+
# from the w3schools page on the subject:
|
15
|
+
# http://www.w3schools.com/html/html_entitiesref.asp
|
16
|
+
# The format is 'entity name' => codepoint where entity name is given
|
17
|
+
# without the surrounding ampersand and semicolon.
|
18
|
+
#
|
19
|
+
MAP = {
|
20
|
+
'quot' => 34, 'apos' => 39, 'amp' => 38,
|
21
|
+
'lt' => 60, 'gt' => 62, 'nbsp' => 160,
|
22
|
+
'iexcl' => 161, 'curren' => 164, 'cent' => 162,
|
23
|
+
'pound' => 163, 'yen' => 165, 'brvbar' => 166,
|
24
|
+
'sect' => 167, 'uml' => 168, 'copy' => 169,
|
25
|
+
'ordf' => 170, 'laquo' => 171, 'not' => 172,
|
26
|
+
'shy' => 173, 'reg' => 174, 'trade' => 8482,
|
27
|
+
'macr' => 175, 'deg' => 176, 'plusmn' => 177,
|
28
|
+
'sup2' => 178, 'sup3' => 179, 'acute' => 180,
|
29
|
+
'micro' => 181, 'para' => 182, 'middot' => 183,
|
30
|
+
'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
|
31
|
+
'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
|
32
|
+
'frac34' => 190, 'iquest' => 191, 'times' => 215,
|
33
|
+
'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
|
34
|
+
'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
|
35
|
+
'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
|
36
|
+
'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
|
37
|
+
'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
|
38
|
+
'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
|
39
|
+
'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
|
40
|
+
'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
|
41
|
+
'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
|
42
|
+
'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
|
43
|
+
'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
|
44
|
+
'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
|
45
|
+
'auml' => 228, 'aring' => 229, 'aelig' => 230,
|
46
|
+
'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
|
47
|
+
'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
|
48
|
+
'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
|
49
|
+
'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
|
50
|
+
'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
|
51
|
+
'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
|
52
|
+
'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
|
53
|
+
'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
|
54
|
+
'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
|
55
|
+
'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
|
56
|
+
'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
|
57
|
+
'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
|
58
|
+
'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
|
59
|
+
'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
|
60
|
+
'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
|
61
|
+
'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
|
62
|
+
'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
|
63
|
+
'rsaquo' => 8250, 'euro' => 8364
|
64
|
+
}
|
70
65
|
|
71
|
-
|
72
|
-
|
66
|
+
MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
|
67
|
+
MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
|
68
|
+
NAMED_ENTITY_REGEXP = /&([a-z]{#{MIN_LENGTH},#{MAX_LENGTH}});/i
|
69
|
+
REVERSE_MAP = MAP.invert
|
73
70
|
|
74
|
-
|
71
|
+
BASIC_ENTITY_REGEXP = /[<>'"&]/
|
75
72
|
|
76
|
-
|
73
|
+
UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
|
77
74
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
75
|
+
ENCODE_ENTITIES_COMMAND_ORDER = {
|
76
|
+
:basic => 0,
|
77
|
+
:named => 1,
|
78
|
+
:decimal => 2,
|
79
|
+
:hexadecimal => 3
|
80
|
+
}
|
81
|
+
|
82
|
+
end
|
84
83
|
|
85
84
|
#
|
86
85
|
# Decode XML and HTML 4.01 entities in a string into their UTF-8
|
@@ -91,8 +90,8 @@ module HTMLEntities # :nodoc:
|
|
91
90
|
# Unknown named entities are not converted
|
92
91
|
#
|
93
92
|
def decode_entities(string)
|
94
|
-
return string.gsub(NAMED_ENTITY_REGEXP) {
|
95
|
-
(cp = MAP[$1]) ? [cp].pack('U') : $&
|
93
|
+
return string.gsub(Data::NAMED_ENTITY_REGEXP) {
|
94
|
+
(cp = Data::MAP[$1]) ? [cp].pack('U') : $&
|
96
95
|
}.gsub(/&#([0-9]{1,7});|&#x([0-9a-f]{1,6});/i) {
|
97
96
|
$1 ? [$1.to_i].pack('U') : [$2.to_i(16)].pack('U')
|
98
97
|
}
|
@@ -129,7 +128,7 @@ module HTMLEntities # :nodoc:
|
|
129
128
|
instructions = [:basic]
|
130
129
|
else
|
131
130
|
instructions = instructions.sort_by { |instruction|
|
132
|
-
ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
|
131
|
+
Data::ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
|
133
132
|
(raise InstructionError, "unknown encode_entities command `#{instruction.inspect}'")
|
134
133
|
}
|
135
134
|
end
|
@@ -137,23 +136,23 @@ module HTMLEntities # :nodoc:
|
|
137
136
|
case instruction
|
138
137
|
when :basic
|
139
138
|
# Handled as basic ASCII
|
140
|
-
output = (output || string).gsub(BASIC_ENTITY_REGEXP) {
|
139
|
+
output = (output || string).gsub(Data::BASIC_ENTITY_REGEXP) {
|
141
140
|
# It's safe to use the simpler [0] here because we know
|
142
141
|
# that the basic entities are ASCII.
|
143
|
-
'&' << REVERSE_MAP[$&[0]] << ';'
|
142
|
+
'&' << Data::REVERSE_MAP[$&[0]] << ';'
|
144
143
|
}
|
145
144
|
when :named
|
146
145
|
# Test everything except printable ASCII
|
147
|
-
output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
|
146
|
+
output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
|
148
147
|
cp = $&.unpack('U')[0]
|
149
|
-
(e = REVERSE_MAP[cp]) ? "&#{e};" : $&
|
148
|
+
(e = Data::REVERSE_MAP[cp]) ? "&#{e};" : $&
|
150
149
|
}
|
151
150
|
when :decimal
|
152
|
-
output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
|
151
|
+
output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
|
153
152
|
"&##{$&.unpack('U')[0]};"
|
154
153
|
}
|
155
154
|
when :hexadecimal
|
156
|
-
output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
|
155
|
+
output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
|
157
156
|
"&#x#{$&.unpack('U')[0].to_s(16)};"
|
158
157
|
}
|
159
158
|
end
|
data/lib/htmlentities/string.rb
CHANGED
@@ -1,15 +1,24 @@
|
|
1
1
|
require 'htmlentities'
|
2
2
|
|
3
3
|
#
|
4
|
-
# This
|
4
|
+
# This file extends the String class with methods to allow encoding and decoding of
|
5
5
|
# HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
6
6
|
#
|
7
7
|
class String
|
8
8
|
|
9
|
+
#
|
10
|
+
# Decode XML and HTML 4.01 entities in a string into their UTF-8
|
11
|
+
# equivalents.
|
12
|
+
#
|
9
13
|
def decode_entities
|
10
14
|
return HTMLEntities.decode_entities(self)
|
11
15
|
end
|
12
16
|
|
17
|
+
#
|
18
|
+
# Encode codepoints in a string into their corresponding entities. See
|
19
|
+
# the documentation of HTMLEntities.encode_entities for a list of possible
|
20
|
+
# instructions.
|
21
|
+
#
|
13
22
|
def encode_entities(*instructions)
|
14
23
|
return HTMLEntities.encode_entities(self, *instructions)
|
15
24
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: htmlentities
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 3.0.0
|
7
|
-
date: 2006-04-
|
6
|
+
version: 3.0.1
|
7
|
+
date: 2006-04-09 00:00:00 +01:00
|
8
8
|
summary: A module for encoding and decoding of HTML/XML entities from/to their corresponding UTF-8 codepoints. Optional String class extension for same.
|
9
9
|
require_paths:
|
10
10
|
- lib
|