htmlentities 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES ADDED
@@ -0,0 +1,29 @@
1
+ == 3.0.0 (2006-04-08)
2
+ * Changed licence to MIT due to confusion with previous 'Fair' licence (my
3
+ intention was to be liberal, not obscure).
4
+ * Moved basic functionality out of String class; for previous behaviour,
5
+ require 'htmlentities/string'
6
+ * Changed version numbering scheme
7
+ * Now available as a Gem
8
+
9
+ == 2.2 (2005-11-07)
10
+ * Important bug fixes -- thanks to Moonwolf
11
+ * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
12
+ * Decimal decoding edge cases addressed
13
+ * Test cases added
14
+
15
+ == 2.1 (2005-10-31)
16
+ * Removed some unnecessary code in basic entity encoding
17
+ * Improved handling of encoding: commands are now automatically sorted, so the
18
+ user doesn't have to worry about their order
19
+ * Now using setup.rb
20
+ * Tests moved to separate file
21
+
22
+ == 2.0 (2005-08-23)
23
+ * Added encoding to entities
24
+ * Decoding interface unchanged
25
+ * Fixed a bug with handling high codepoints
26
+
27
+ == 1.0 (2005-08-03)
28
+ * Initial release
29
+ * Decoding only
data/COPYING ADDED
@@ -0,0 +1,21 @@
1
+ == Licence (MIT)
2
+
3
+ Copyright (c) 2005-2006 Paul Battley
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README ADDED
@@ -0,0 +1,23 @@
1
+ == HTMLEntities
2
+
3
+ HTML entity encoding and decoding for Ruby
4
+
5
+ The HTMLEntities module facilitates encoding and decoding of
6
+ HTML/XML entities from/to their corresponding UTF-8 codepoints.
7
+
8
+ To install (requires root/admin privileges):
9
+
10
+ ruby setup.rb
11
+
12
+ Alternatively, you can just use the gem.
13
+
14
+ == Licence
15
+
16
+ This code is free to use under the terms of the MIT licence. If you'd like to
17
+ negotiate a different licence for a specific use, just contact me -- I'll
18
+ almost certainly permit it.
19
+
20
+ == Contact
21
+
22
+ Comments are welcome. Send an email to pbattley@gmail.com.
23
+
@@ -0,0 +1,167 @@
1
#
# HTML entity encoding and decoding for Ruby
#

module HTMLEntities # :nodoc:

  # Raised by encode_entities when it is handed an instruction that is not
  # one of the keys of ENCODE_ENTITIES_COMMAND_ORDER.
  class InstructionError < RuntimeError
  end

  #
  # MAP is a hash of all the HTML entities I could discover, as taken
  # from the w3schools page on the subject:
  # http://www.w3schools.com/html/html_entitiesref.asp
  # The format is 'entity name' => codepoint where entity name is given
  # without the surrounding ampersand and semicolon.
  #
  MAP = {
    'quot'   => 34,   'apos'   => 39,   'amp'    => 38,
    'lt'     => 60,   'gt'     => 62,   'nbsp'   => 160,
    'iexcl'  => 161,  'curren' => 164,  'cent'   => 162,
    'pound'  => 163,  'yen'    => 165,  'brvbar' => 166,
    'sect'   => 167,  'uml'    => 168,  'copy'   => 169,
    'ordf'   => 170,  'laquo'  => 171,  'not'    => 172,
    'shy'    => 173,  'reg'    => 174,  'trade'  => 8482,
    'macr'   => 175,  'deg'    => 176,  'plusmn' => 177,
    'sup2'   => 178,  'sup3'   => 179,  'acute'  => 180,
    'micro'  => 181,  'para'   => 182,  'middot' => 183,
    'cedil'  => 184,  'sup1'   => 185,  'ordm'   => 186,
    'raquo'  => 187,  'frac14' => 188,  'frac12' => 189,
    'frac34' => 190,  'iquest' => 191,  'times'  => 215,
    'divide' => 247,  'Agrave' => 192,  'Aacute' => 193,
    'Acirc'  => 194,  'Atilde' => 195,  'Auml'   => 196,
    'Aring'  => 197,  'AElig'  => 198,  'Ccedil' => 199,
    'Egrave' => 200,  'Eacute' => 201,  'Ecirc'  => 202,
    'Euml'   => 203,  'Igrave' => 204,  'Iacute' => 205,
    'Icirc'  => 206,  'Iuml'   => 207,  'ETH'    => 208,
    'Ntilde' => 209,  'Ograve' => 210,  'Oacute' => 211,
    'Ocirc'  => 212,  'Otilde' => 213,  'Ouml'   => 214,
    'Oslash' => 216,  'Ugrave' => 217,  'Uacute' => 218,
    'Ucirc'  => 219,  'Uuml'   => 220,  'Yacute' => 221,
    'THORN'  => 222,  'szlig'  => 223,  'agrave' => 224,
    'aacute' => 225,  'acirc'  => 226,  'atilde' => 227,
    'auml'   => 228,  'aring'  => 229,  'aelig'  => 230,
    'ccedil' => 231,  'egrave' => 232,  'eacute' => 233,
    'ecirc'  => 234,  'euml'   => 235,  'igrave' => 236,
    'iacute' => 237,  'icirc'  => 238,  'iuml'   => 239,
    'eth'    => 240,  'ntilde' => 241,  'ograve' => 242,
    'oacute' => 243,  'ocirc'  => 244,  'otilde' => 245,
    'ouml'   => 246,  'oslash' => 248,  'ugrave' => 249,
    'uacute' => 250,  'ucirc'  => 251,  'uuml'   => 252,
    'yacute' => 253,  'thorn'  => 254,  'yuml'   => 255,
    'OElig'  => 338,  'oelig'  => 339,  'Scaron' => 352,
    'scaron' => 353,  'Yuml'   => 376,  'circ'   => 710,
    'tilde'  => 732,  'ensp'   => 8194, 'emsp'   => 8195,
    'thinsp' => 8201, 'zwnj'   => 8204, 'zwj'    => 8205,
    'lrm'    => 8206, 'rlm'    => 8207, 'ndash'  => 8211,
    'mdash'  => 8212, 'lsquo'  => 8216, 'rsquo'  => 8217,
    'sbquo'  => 8218, 'ldquo'  => 8220, 'rdquo'  => 8221,
    'bdquo'  => 8222, 'dagger' => 8224, 'Dagger' => 8225,
    'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
    'rsaquo' => 8250, 'euro'   => 8364
  }

  # Shortest and longest entity names in MAP; used to bound the name part of
  # the entity regexps below.
  MIN_LENGTH = MAP.keys.map { |a| a.length }.min
  MAX_LENGTH = MAP.keys.map { |a| a.length }.max

  # Precompiled regexp for a named entity. Digits are allowed in the name
  # because MAP contains names such as 'frac12' and 'sup2'.
  NAMED_ENTITY_REGEXP =
    /&([a-z0-9]{#{MIN_LENGTH},#{MAX_LENGTH}});/i

  # Precompiled regexp matching any entity form in one alternation:
  # group 1 is a name, group 2 a decimal number, group 3 a hexadecimal number.
  ENTITY_REGEXP =
    /&(?:([a-z0-9]{#{MIN_LENGTH},#{MAX_LENGTH}})|#([0-9]{1,7})|#x([0-9a-f]{1,6}));/i

  # Reverse map for converting characters to named entities
  REVERSE_MAP = MAP.invert

  BASIC_ENTITY_REGEXP = /[<>'"&]/

  # Matches one ASCII control character or one non-ASCII UTF-8 character.
  # Expressed as a negated ASCII class under the /u flag rather than as raw
  # byte ranges, which are not valid in a UTF-8 regexp on Ruby 1.9 and later.
  UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[^\x00-\x7f]/u

  # Execution order of the encode_entities instructions (lowest first).
  ENCODE_ENTITIES_COMMAND_ORDER = {
    :basic       => 0,
    :named       => 1,
    :decimal     => 2,
    :hexadecimal => 3
  }

  #
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
  # equivalents.  Obviously, if your string is not already in UTF-8, you'd
  # better convert it before using this method, or the output will be mixed
  # up.
  #
  # Unknown named entities are not converted.
  #
  # All entity forms are handled in a single pass over the input, so text
  # produced by decoding one entity is never decoded a second time
  # (for example '&amp;#38;' becomes '&#38;', not '&').
  #
  # Returns a new string; the argument is not modified.
  #
  def decode_entities(string)
    string.gsub(ENTITY_REGEXP) {
      whole, name, decimal, hex = $&, $1, $2, $3
      if name
        # Lookup is case-sensitive even though the regexp is not, so a
        # wrongly-cased or unknown name is left exactly as it was.
        (codepoint = MAP[name]) ? [codepoint].pack('U') : whole
      elsif decimal
        [decimal.to_i].pack('U')
      else
        [hex.to_i(16)].pack('U')
      end
    }
  end

  #
  # Encode codepoints into their corresponding entities.  Various operations
  # are possible, and may be specified in order:
  #
  # :basic :: Convert the five XML entities ('"<>&)
  # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
  # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
  # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;)
  #
  # You can specify the commands in any order, but they will be executed in
  # the order listed above to ensure that entity ampersands are not
  # clobbered and that named entities are replaced before numeric ones.
  #
  # If no instructions are specified, :basic will be used.
  #
  # Examples:
  #   encode_entities(str) - XML-safe
  #   encode_entities(str, :basic, :decimal) - XML-safe and 7-bit clean
  #   encode_entities(str, :basic, :named, :decimal) - 7-bit clean, with all
  #   non-ASCII characters replaced with their named entity where possible, and
  #   decimal equivalents otherwise.
  #
  # Raises InstructionError if any instruction is not one of the four above.
  #
  # Note: It is the program's responsibility to ensure that the string
  # contains valid UTF-8 before calling this method.
  #
  def encode_entities(string, *instructions)
    instructions = [:basic] if instructions.empty?

    # Validate before sorting so that an unknown command is always reported.
    instructions.each do |instruction|
      unless ENCODE_ENTITIES_COMMAND_ORDER.key?(instruction)
        raise InstructionError, "unknown encode_entities command `#{instruction.inspect}'"
      end
    end
    ordered = instructions.sort_by { |instruction|
      ENCODE_ENTITIES_COMMAND_ORDER[instruction]
    }

    output = string
    ordered.each do |instruction|
      case instruction
      when :basic
        # The basic entities are ASCII, so each match is a single codepoint
        # that is guaranteed to be present in REVERSE_MAP.
        output = output.gsub(BASIC_ENTITY_REGEXP) {
          name = REVERSE_MAP[$&.unpack('U')[0]]
          "&#{name};"
        }
      when :named
        # Test everything except printable ASCII; characters without a named
        # entity are left for a later numeric pass (if one was requested).
        output = output.gsub(UTF8_NON_ASCII_REGEXP) {
          name = REVERSE_MAP[$&.unpack('U')[0]]
          name ? "&#{name};" : $&
        }
      when :decimal
        output = output.gsub(UTF8_NON_ASCII_REGEXP) {
          "&##{$&.unpack('U')[0]};"
        }
      when :hexadecimal
        output = output.gsub(UTF8_NON_ASCII_REGEXP) {
          "&#x#{$&.unpack('U')[0].to_s(16)};"
        }
      end
    end
    output
  end

  # Make the methods above callable directly on the module
  # (HTMLEntities.decode_entities) as well as via include.
  extend self

end
167
+
@@ -0,0 +1,17 @@
1
+ require 'htmlentities'
2
+
3
#
# Reopens String so that entity conversion can be called directly on a
# string. Both methods simply hand the receiver to the HTMLEntities module.
#
class String

  # Result of HTMLEntities.decode_entities applied to this string.
  def decode_entities
    HTMLEntities.decode_entities(self)
  end

  # Result of HTMLEntities.encode_entities applied to this string; the
  # instructions are forwarded unchanged.
  def encode_entities(*instructions)
    HTMLEntities.encode_entities(self, *instructions)
  end

end
data/test/all.rb ADDED
@@ -0,0 +1,3 @@
1
# Load every *_test.rb file that sits next to this script.
test_pattern = File.dirname(__FILE__) + '/*_test.rb'
Dir.glob(test_pattern).each { |test_file| require test_file }
@@ -0,0 +1,129 @@
1
+ $: << File.dirname(__FILE__) + '/../lib/'
2
+ require 'htmlentities'
3
+ require 'test/unit'
4
+
5
+ $KCODE = 'u'
6
+
7
# Unit tests for HTMLEntities.decode_entities and HTMLEntities.encode_entities.
#
# Most cases are written as tables: a decoding row is [expected, input] and
# an encoding row is [expected, input, *instructions].
class TestHTMLEntities < Test::Unit::TestCase

  def test_basic_decoding
    check_decoding([
      ['&', '&amp;'],
      ['<', '&lt;'],
      ['"', '&quot;']
    ])
  end

  def test_basic_encoding
    check_encoding([
      ['&amp;', '&', :basic],
      ['&quot;', '"'],
      ['&lt;', '<', :basic],
      ['&lt;', '<']
    ])
  end

  def test_extended_decoding
    check_decoding([
      ['±', '&plusmn;'],
      ['ð', '&eth;'],
      ['Œ', '&OElig;'],
      ['œ', '&oelig;']
    ])
  end

  def test_extended_encoding
    check_encoding([
      ['&plusmn;', '±', :named],
      ['&eth;', 'ð', :named],
      ['&OElig;', 'Œ', :named],
      ['&oelig;', 'œ', :named]
    ])
  end

  def test_decimal_decoding
    check_decoding([
      ['“', '&#8220;'],
      ['…', '&#8230;'],
      [' ', '&#32;']
    ])
  end

  def test_decimal_encoding
    check_encoding([
      ['&#8220;', '“', :decimal],
      ['&#8230;', '…', :decimal]
    ])
  end

  def test_hexadecimal_decoding
    check_decoding([
      ['−', '&#x2212;'],
      ['—', '&#x2014;'],
      ['`', '&#x0060;'],
      ['`', '&#x60;']
    ])
  end

  def test_hexadecimal_encoding
    check_encoding([
      ['&#x2212;', '−', :hexadecimal],
      ['&#x2014;', '—', :hexadecimal]
    ])
  end

  def test_mixed_decoding
    # Just a random headline - something with accented letters was needed.
    headline = 'Le tabac pourrait bientôt être banni dans tous les lieux publics en France'
    encoded_headline = 'Le tabac pourrait bient&ocirc;t &#234;tre banni dans tous les lieux publics en France'
    check_decoding([
      [headline, encoded_headline],
      ['"bientôt" & 文字', '&quot;bient&ocirc;t&quot; &amp; &#25991;&#x5b57;']
    ])
  end

  def test_mixed_encoding
    plain = '"bientôt" & 文字'
    check_encoding([
      ['&quot;bient&ocirc;t&quot; &amp; &#x6587;&#x5b57;', plain, :basic, :named, :hexadecimal],
      ['&quot;bient&ocirc;t&quot; &amp; &#25991;&#23383;', plain, :basic, :named, :decimal]
    ])
  end

  # Same expectations as test_mixed_encoding, but with the instructions
  # supplied out of order.
  def test_mixed_encoding_with_sort
    plain = '"bientôt" & 文字'
    check_encoding([
      ['&quot;bient&ocirc;t&quot; &amp; &#x6587;&#x5b57;', plain, :named, :hexadecimal, :basic],
      ['&quot;bient&ocirc;t&quot; &amp; &#25991;&#23383;', plain, :decimal, :named, :basic]
    ])
  end

  def test_detect_illegal_encoding_command
    assert_raise(HTMLEntities::InstructionError) do
      HTMLEntities.encode_entities('foo', :bar, :baz)
    end
  end

  def test_edge_case_decoding
    check_decoding([
      ['', ''],
      ['&bogus;', '&bogus;'],
      ['&amp;', '&amp;amp;']
    ])
  end

  def test_edge_case_encoding
    check_encoding([
      ['`', '`'],
      [' ', ' '],
      ['&amp;amp;', '&amp;'],
      ['&amp;amp;', '&amp;']
    ])
  end

  # Faults found and patched by Moonwolf
  def test_moonwolf_decoding
    check_decoding([
      ["\x2", '&#2;'],
      ["\xf", '&#xf;']
    ])
  end

  private

  # Assert that decoding +input+ yields +expected+.
  def assert_decode(expected, input)
    actual = HTMLEntities.decode_entities(input)
    assert_equal(expected, actual)
  end

  # Assert that encoding +input+ with the given instructions yields +expected+.
  def assert_encode(expected, input, *args)
    actual = HTMLEntities.encode_entities(input, *args)
    assert_equal(expected, actual)
  end

  # Run assert_decode over each [expected, input] row, in order.
  def check_decoding(rows)
    rows.each { |expected, input| assert_decode(expected, input) }
  end

  # Run assert_encode over each [expected, input, *instructions] row, in order.
  def check_encoding(rows)
    rows.each { |expected, input, *args| assert_encode(expected, input, *args) }
  end

end
129
+
@@ -0,0 +1,24 @@
1
+ $: << File.dirname(__FILE__) + '/../lib/'
2
+ require 'htmlentities/string'
3
+ require 'test/unit'
4
+
5
+ $KCODE = 'u'
6
+
7
# Checks that the String extension methods give the expected results when
# called directly on string literals.
class TestHTMLEntities < Test::Unit::TestCase

  def test_string_responds_correctly_to_decode_entities
    decoded = '&plusmn;'.decode_entities
    assert_equal('±', decoded)
  end

  def test_string_responds_correctly_to_encode_entities_with_no_parameters
    encoded = '"'.encode_entities
    assert_equal('&quot;', encoded)
  end

  def test_string_responds_correctly_to_encode_entities_with_multiple_parameters
    encoded = '"bientôt" & 文字'.encode_entities(:basic, :named, :hexadecimal)
    assert_equal('&quot;bient&ocirc;t&quot; &amp; &#x6587;&#x5b57;', encoded)
  end

end
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: htmlentities
5
+ version: !ruby/object:Gem::Version
6
+ version: 3.0.0
7
+ date: 2006-04-08 00:00:00 +01:00
8
+ summary: A module for encoding and decoding of HTML/XML entities from/to their corresponding UTF-8 codepoints. Optional String class extension for same.
9
+ require_paths:
10
+ - lib
11
+ email: pbattley@gmail.com
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Paul Battley
30
+ files:
31
+ - lib/htmlentities.rb
32
+ - lib/htmlentities/string.rb
33
+ - test/all.rb
34
+ - test/entities_test.rb
35
+ - test/string_test.rb
36
+ - README
37
+ - CHANGES
38
+ - COPYING
39
+ test_files:
40
+ - test/all.rb
41
+ rdoc_options: []
42
+
43
+ extra_rdoc_files:
44
+ - README
45
+ - CHANGES
46
+ - COPYING
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ requirements: []
52
+
53
+ dependencies: []
54
+