htmlentities 3.0.0 → 3.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +18 -15
- data/lib/htmlentities.rb +78 -79
- data/lib/htmlentities/string.rb +10 -1
- metadata +2 -2
data/CHANGES
CHANGED
@@ -1,29 +1,32 @@
|
|
1
|
+
== 3.0.1 (2005-04-08)
|
2
|
+
* Improved documentation.
|
3
|
+
|
1
4
|
== 3.0.0 (2005-04-08)
|
2
5
|
* Changed licence to MIT due to confusion with previous 'Fair' licence (my
|
3
6
|
intention was to be liberal, not obscure).
|
4
7
|
* Moved basic functionality out of String class; for previous behaviour,
|
5
|
-
require 'htmlentities/string'
|
6
|
-
* Changed version numbering scheme
|
7
|
-
* Now available as a Gem
|
8
|
+
require 'htmlentities/string'.
|
9
|
+
* Changed version numbering scheme.
|
10
|
+
* Now available as a Gem.
|
8
11
|
|
9
12
|
== 2.2 (2005-11-07)
|
10
|
-
* Important bug fixes -- thanks to Moonwolf
|
13
|
+
* Important bug fixes -- thanks to Moonwolf.
|
11
14
|
* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
|
12
|
-
* Decimal decoding edge cases addressed
|
13
|
-
* Test cases added
|
15
|
+
* Decimal decoding edge cases addressed.
|
16
|
+
* Test cases added.
|
14
17
|
|
15
18
|
== 2.1 (2005-10-31)
|
16
|
-
* Removed some unnecessary code in basic entity encoding
|
19
|
+
* Removed some unnecessary code in basic entity encoding.
|
17
20
|
* Improved handling of encoding: commands are now automatically sorted, so the
|
18
|
-
user doesn't have to worry about their order
|
19
|
-
* Now using setup.rb
|
20
|
-
* Tests moved to separate file
|
21
|
+
user doesn't have to worry about their order.
|
22
|
+
* Now using setup.rb.
|
23
|
+
* Tests moved to separate file.
|
21
24
|
|
22
25
|
== 2.0 (2005-08-23)
|
23
|
-
* Added encoding to entities
|
24
|
-
* Decoding interface unchanged
|
25
|
-
* Fixed a bug with handling high codepoints
|
26
|
+
* Added encoding to entities.
|
27
|
+
* Decoding interface unchanged.
|
28
|
+
* Fixed a bug with handling high codepoints.
|
26
29
|
|
27
30
|
== 1.0 (2005-08-03)
|
28
|
-
* Initial release
|
29
|
-
* Decoding only
|
31
|
+
* Initial release.
|
32
|
+
* Decoding only.
|
data/lib/htmlentities.rb
CHANGED
@@ -2,85 +2,84 @@
|
|
2
2
|
# HTML entity encoding and decoding for Ruby
|
3
3
|
#
|
4
4
|
|
5
|
-
module HTMLEntities # :nodoc:
|
5
|
+
module HTMLEntities
|
6
6
|
|
7
7
|
class InstructionError < RuntimeError
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
# MAP is a hash of all the HTML entities I could discover, as taken
|
12
|
-
# from the w3schools page on the subject:
|
13
|
-
# http://www.w3schools.com/html/html_entitiesref.asp
|
14
|
-
# The format is 'entity name' => codepoint where entity name is given
|
15
|
-
# without the surrounding ampersand and semicolon.
|
16
|
-
#
|
17
|
-
MAP = {
|
18
|
-
'quot' => 34, 'apos' => 39, 'amp' => 38,
|
19
|
-
'lt' => 60, 'gt' => 62, 'nbsp' => 160,
|
20
|
-
'iexcl' => 161, 'curren' => 164, 'cent' => 162,
|
21
|
-
'pound' => 163, 'yen' => 165, 'brvbar' => 166,
|
22
|
-
'sect' => 167, 'uml' => 168, 'copy' => 169,
|
23
|
-
'ordf' => 170, 'laquo' => 171, 'not' => 172,
|
24
|
-
'shy' => 173, 'reg' => 174, 'trade' => 8482,
|
25
|
-
'macr' => 175, 'deg' => 176, 'plusmn' => 177,
|
26
|
-
'sup2' => 178, 'sup3' => 179, 'acute' => 180,
|
27
|
-
'micro' => 181, 'para' => 182, 'middot' => 183,
|
28
|
-
'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
|
29
|
-
'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
|
30
|
-
'frac34' => 190, 'iquest' => 191, 'times' => 215,
|
31
|
-
'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
|
32
|
-
'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
|
33
|
-
'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
|
34
|
-
'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
|
35
|
-
'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
|
36
|
-
'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
|
37
|
-
'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
|
38
|
-
'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
|
39
|
-
'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
|
40
|
-
'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
|
41
|
-
'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
|
42
|
-
'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
|
43
|
-
'auml' => 228, 'aring' => 229, 'aelig' => 230,
|
44
|
-
'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
|
45
|
-
'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
|
46
|
-
'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
|
47
|
-
'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
|
48
|
-
'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
|
49
|
-
'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
|
50
|
-
'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
|
51
|
-
'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
|
52
|
-
'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
|
53
|
-
'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
|
54
|
-
'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
|
55
|
-
'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
|
56
|
-
'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
|
57
|
-
'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
|
58
|
-
'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
|
59
|
-
'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
|
60
|
-
'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
|
61
|
-
'rsaquo' => 8250, 'euro' => 8364
|
62
|
-
}
|
10
|
+
module Data #:nodoc:
|
63
11
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
12
|
+
#
|
13
|
+
# MAP is a hash of all the HTML entities I could discover, as taken
|
14
|
+
# from the w3schools page on the subject:
|
15
|
+
# http://www.w3schools.com/html/html_entitiesref.asp
|
16
|
+
# The format is 'entity name' => codepoint where entity name is given
|
17
|
+
# without the surrounding ampersand and semicolon.
|
18
|
+
#
|
19
|
+
MAP = {
|
20
|
+
'quot' => 34, 'apos' => 39, 'amp' => 38,
|
21
|
+
'lt' => 60, 'gt' => 62, 'nbsp' => 160,
|
22
|
+
'iexcl' => 161, 'curren' => 164, 'cent' => 162,
|
23
|
+
'pound' => 163, 'yen' => 165, 'brvbar' => 166,
|
24
|
+
'sect' => 167, 'uml' => 168, 'copy' => 169,
|
25
|
+
'ordf' => 170, 'laquo' => 171, 'not' => 172,
|
26
|
+
'shy' => 173, 'reg' => 174, 'trade' => 8482,
|
27
|
+
'macr' => 175, 'deg' => 176, 'plusmn' => 177,
|
28
|
+
'sup2' => 178, 'sup3' => 179, 'acute' => 180,
|
29
|
+
'micro' => 181, 'para' => 182, 'middot' => 183,
|
30
|
+
'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
|
31
|
+
'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
|
32
|
+
'frac34' => 190, 'iquest' => 191, 'times' => 215,
|
33
|
+
'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
|
34
|
+
'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
|
35
|
+
'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
|
36
|
+
'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
|
37
|
+
'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
|
38
|
+
'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
|
39
|
+
'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
|
40
|
+
'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
|
41
|
+
'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
|
42
|
+
'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
|
43
|
+
'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
|
44
|
+
'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
|
45
|
+
'auml' => 228, 'aring' => 229, 'aelig' => 230,
|
46
|
+
'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
|
47
|
+
'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
|
48
|
+
'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
|
49
|
+
'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
|
50
|
+
'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
|
51
|
+
'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
|
52
|
+
'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
|
53
|
+
'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
|
54
|
+
'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
|
55
|
+
'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
|
56
|
+
'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
|
57
|
+
'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
|
58
|
+
'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
|
59
|
+
'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
|
60
|
+
'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
|
61
|
+
'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
|
62
|
+
'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
|
63
|
+
'rsaquo' => 8250, 'euro' => 8364
|
64
|
+
}
|
70
65
|
|
71
|
-
|
72
|
-
|
66
|
+
MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
|
67
|
+
MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
|
68
|
+
NAMED_ENTITY_REGEXP = /&([a-z]{#{MIN_LENGTH},#{MAX_LENGTH}});/i
|
69
|
+
REVERSE_MAP = MAP.invert
|
73
70
|
|
74
|
-
|
71
|
+
BASIC_ENTITY_REGEXP = /[<>'"&]/
|
75
72
|
|
76
|
-
|
73
|
+
UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
|
77
74
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
75
|
+
ENCODE_ENTITIES_COMMAND_ORDER = {
|
76
|
+
:basic => 0,
|
77
|
+
:named => 1,
|
78
|
+
:decimal => 2,
|
79
|
+
:hexadecimal => 3
|
80
|
+
}
|
81
|
+
|
82
|
+
end
|
84
83
|
|
85
84
|
#
|
86
85
|
# Decode XML and HTML 4.01 entities in a string into their UTF-8
|
@@ -91,8 +90,8 @@ module HTMLEntities # :nodoc:
|
|
91
90
|
# Unknown named entities are not converted
|
92
91
|
#
|
93
92
|
def decode_entities(string)
|
94
|
-
return string.gsub(NAMED_ENTITY_REGEXP) {
|
95
|
-
(cp = MAP[$1]) ? [cp].pack('U') : $&
|
93
|
+
return string.gsub(Data::NAMED_ENTITY_REGEXP) {
|
94
|
+
(cp = Data::MAP[$1]) ? [cp].pack('U') : $&
|
96
95
|
}.gsub(/&#([0-9]{1,7});|&#x([0-9a-f]{1,6});/i) {
|
97
96
|
$1 ? [$1.to_i].pack('U') : [$2.to_i(16)].pack('U')
|
98
97
|
}
|
@@ -129,7 +128,7 @@ module HTMLEntities # :nodoc:
|
|
129
128
|
instructions = [:basic]
|
130
129
|
else
|
131
130
|
instructions = instructions.sort_by { |instruction|
|
132
|
-
ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
|
131
|
+
Data::ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
|
133
132
|
(raise InstructionError, "unknown encode_entities command `#{instruction.inspect}'")
|
134
133
|
}
|
135
134
|
end
|
@@ -137,23 +136,23 @@ module HTMLEntities # :nodoc:
|
|
137
136
|
case instruction
|
138
137
|
when :basic
|
139
138
|
# Handled as basic ASCII
|
140
|
-
output = (output || string).gsub(BASIC_ENTITY_REGEXP) {
|
139
|
+
output = (output || string).gsub(Data::BASIC_ENTITY_REGEXP) {
|
141
140
|
# It's safe to use the simpler [0] here because we know
|
142
141
|
# that the basic entities are ASCII.
|
143
|
-
'&' << REVERSE_MAP[$&[0]] << ';'
|
142
|
+
'&' << Data::REVERSE_MAP[$&[0]] << ';'
|
144
143
|
}
|
145
144
|
when :named
|
146
145
|
# Test everything except printable ASCII
|
147
|
-
output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
|
146
|
+
output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
|
148
147
|
cp = $&.unpack('U')[0]
|
149
|
-
(e = REVERSE_MAP[cp]) ? "&#{e};" : $&
|
148
|
+
(e = Data::REVERSE_MAP[cp]) ? "&#{e};" : $&
|
150
149
|
}
|
151
150
|
when :decimal
|
152
|
-
output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
|
151
|
+
output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
|
153
152
|
"&##{$&.unpack('U')[0]};"
|
154
153
|
}
|
155
154
|
when :hexadecimal
|
156
|
-
output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
|
155
|
+
output = (output || string).gsub(Data::UTF8_NON_ASCII_REGEXP) {
|
157
156
|
"&#x#{$&.unpack('U')[0].to_s(16)};"
|
158
157
|
}
|
159
158
|
end
|
data/lib/htmlentities/string.rb
CHANGED
@@ -1,15 +1,24 @@
|
|
1
1
|
require 'htmlentities'
|
2
2
|
|
3
3
|
#
|
4
|
-
# This
|
4
|
+
# This file extends the String class with methods to allow encoding and decoding of
|
5
5
|
# HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
6
6
|
#
|
7
7
|
class String
|
8
8
|
|
9
|
+
#
|
10
|
+
# Decode XML and HTML 4.01 entities in a string into their UTF-8
|
11
|
+
# equivalents.
|
12
|
+
#
|
9
13
|
def decode_entities
|
10
14
|
return HTMLEntities.decode_entities(self)
|
11
15
|
end
|
12
16
|
|
17
|
+
#
|
18
|
+
# Encode codepoints in a string into their corresponding entities. See
|
19
|
+
# the documentation of HTMLEntities.encode_entities for a list of possible
|
20
|
+
# instructions.
|
21
|
+
#
|
13
22
|
def encode_entities(*instructions)
|
14
23
|
return HTMLEntities.encode_entities(self, *instructions)
|
15
24
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: htmlentities
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 3.0.0
|
7
|
-
date: 2006-04-
|
6
|
+
version: 3.0.1
|
7
|
+
date: 2006-04-09 00:00:00 +01:00
|
8
8
|
summary: A module for encoding and decoding of HTML/XML entities from/to their corresponding UTF-8 codepoints. Optional String class extension for same.
|
9
9
|
require_paths:
|
10
10
|
- lib
|