htmlentities 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES ADDED
@@ -0,0 +1,29 @@
1
+ == 3.0.0 (2006-04-08)
2
+ * Changed licence to MIT due to confusion with previous 'Fair' licence (my
3
+ intention was to be liberal, not obscure).
4
+ * Moved basic functionality out of String class; for previous behaviour,
5
+ require 'htmlentities/string'
6
+ * Changed version numbering scheme
7
+ * Now available as a Gem
8
+
9
+ == 2.2 (2005-11-07)
10
+ * Important bug fixes -- thanks to Moonwolf
11
+ * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
12
+ * Decimal decoding edge cases addressed
13
+ * Test cases added
14
+
15
+ == 2.1 (2005-10-31)
16
+ * Removed some unnecessary code in basic entity encoding
17
+ * Improved handling of encoding: commands are now automatically sorted, so the
18
+ user doesn't have to worry about their order
19
+ * Now using setup.rb
20
+ * Tests moved to separate file
21
+
22
+ == 2.0 (2005-08-23)
23
+ * Added encoding to entities
24
+ * Decoding interface unchanged
25
+ * Fixed a bug with handling high codepoints
26
+
27
+ == 1.0 (2005-08-03)
28
+ * Initial release
29
+ * Decoding only
data/COPYING ADDED
@@ -0,0 +1,21 @@
1
+ == Licence (MIT)
2
+
3
+ Copyright (c) 2005-2006 Paul Battley
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README ADDED
@@ -0,0 +1,23 @@
1
+ == HTMLEntities
2
+
3
+ HTML entity encoding and decoding for Ruby
4
+
5
+ The HTMLEntities module facilitates encoding and decoding of
6
+ HTML/XML entities from/to their corresponding UTF-8 codepoints.
7
+
8
+ To install (requires root/admin privileges):
9
+
10
+ ruby setup.rb
11
+
12
+ Alternatively, you can just use the gem.
13
+
14
+ == Licence
15
+
16
+ This code is free to use under the terms of the MIT licence. If you'd like to
17
+ negotiate a different licence for a specific use, just contact me -- I'll
18
+ almost certainly permit it.
19
+
20
+ == Contact
21
+
22
+ Comments are welcome. Send an email to pbattley@gmail.com.
23
+
@@ -0,0 +1,167 @@
1
+ #
2
+ # HTML entity encoding and decoding for Ruby
3
+ #
4
+
5
+ module HTMLEntities # :nodoc:
6
+
7
+ class InstructionError < RuntimeError
8
+ end
9
+
10
+ #
11
+ # MAP is a hash of all the HTML entities I could discover, as taken
12
+ # from the w3schools page on the subject:
13
+ # http://www.w3schools.com/html/html_entitiesref.asp
14
+ # The format is 'entity name' => codepoint where entity name is given
15
+ # without the surrounding ampersand and semicolon.
16
+ #
17
+ MAP = {
18
+ 'quot' => 34, 'apos' => 39, 'amp' => 38,
19
+ 'lt' => 60, 'gt' => 62, 'nbsp' => 160,
20
+ 'iexcl' => 161, 'curren' => 164, 'cent' => 162,
21
+ 'pound' => 163, 'yen' => 165, 'brvbar' => 166,
22
+ 'sect' => 167, 'uml' => 168, 'copy' => 169,
23
+ 'ordf' => 170, 'laquo' => 171, 'not' => 172,
24
+ 'shy' => 173, 'reg' => 174, 'trade' => 8482,
25
+ 'macr' => 175, 'deg' => 176, 'plusmn' => 177,
26
+ 'sup2' => 178, 'sup3' => 179, 'acute' => 180,
27
+ 'micro' => 181, 'para' => 182, 'middot' => 183,
28
+ 'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
29
+ 'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
30
+ 'frac34' => 190, 'iquest' => 191, 'times' => 215,
31
+ 'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
32
+ 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
33
+ 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
34
+ 'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
35
+ 'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
36
+ 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
37
+ 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
38
+ 'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
39
+ 'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
40
+ 'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
41
+ 'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
42
+ 'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
43
+ 'auml' => 228, 'aring' => 229, 'aelig' => 230,
44
+ 'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
45
+ 'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
46
+ 'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
47
+ 'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
48
+ 'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
49
+ 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
50
+ 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
51
+ 'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
52
+ 'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
53
+ 'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
54
+ 'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
55
+ 'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
56
+ 'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
57
+ 'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
58
+ 'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
59
+ 'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
60
+ 'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
61
+ 'rsaquo' => 8250, 'euro' => 8364
62
+ }
63
+
64
+ MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
65
+ MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
66
+
67
+ # Precompile the regexp
68
+ NAMED_ENTITY_REGEXP =
69
+ /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
70
+
71
+ # Reverse map for converting characters to named entities
72
+ REVERSE_MAP = MAP.invert
73
+
74
+ BASIC_ENTITY_REGEXP = /[<>'"&]/
75
+
76
+ UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
77
+
78
+ ENCODE_ENTITIES_COMMAND_ORDER = {
79
+ :basic => 0,
80
+ :named => 1,
81
+ :decimal => 2,
82
+ :hexadecimal => 3
83
+ }
84
+
85
+ #
86
+ # Decode XML and HTML 4.01 entities in a string into their UTF-8
87
+ # equivalents. Obviously, if your string is not already in UTF-8, you'd
88
+ # better convert it before using this method, or the output will be mixed
89
+ # up.
90
+ #
91
+ # Unknown named entities are not converted
92
+ #
93
+ def decode_entities(string)
94
+ return string.gsub(NAMED_ENTITY_REGEXP) {
95
+ (cp = MAP[$1]) ? [cp].pack('U') : $&
96
+ }.gsub(/&#([0-9]{1,7});|&#x([0-9a-f]{1,6});/i) {
97
+ $1 ? [$1.to_i].pack('U') : [$2.to_i(16)].pack('U')
98
+ }
99
+ end
100
+
101
+ #
102
+ # Encode codepoints into their corresponding entities. Various operations
103
+ # are possible, and several may be combined in a single call:
104
+ #
105
+ # :basic :: Convert the five XML entities ('"<>&)
106
+ # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
107
+ # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
108
+ # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;)
109
+ #
110
+ # You can specify the commands in any order, but they will be executed in
111
+ # the order listed above to ensure that entity ampersands are not
112
+ # clobbered and that named entities are replaced before numeric ones.
113
+ #
114
+ # If no instructions are specified, :basic will be used.
115
+ #
116
+ # Examples:
117
+ # encode_entities(str) - XML-safe
118
+ # encode_entities(str, :basic, :decimal) - XML-safe and 7-bit clean
119
+ # encode_entities(str, :basic, :named, :decimal) - 7-bit clean, with all
120
+ # non-ASCII characters replaced with their named entity where possible, and
121
+ # decimal equivalents otherwise.
122
+ #
123
+ # Note: It is the program's responsibility to ensure that the string
124
+ # contains valid UTF-8 before calling this method.
125
+ #
126
+ def encode_entities(string, *instructions)
127
+ output = nil
128
+ if (instructions.empty?)
129
+ instructions = [:basic]
130
+ else
131
+ instructions = instructions.sort_by { |instruction|
132
+ ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
133
+ (raise InstructionError, "unknown encode_entities command `#{instruction.inspect}'")
134
+ }
135
+ end
136
+ instructions.each do |instruction|
137
+ case instruction
138
+ when :basic
139
+ # Handled as basic ASCII
140
+ output = (output || string).gsub(BASIC_ENTITY_REGEXP) {
141
+ # It's safe to use the simpler [0] here because we know
142
+ # that the basic entities are ASCII.
143
+ '&' << REVERSE_MAP[$&[0]] << ';'
144
+ }
145
+ when :named
146
+ # Test everything except printable ASCII
147
+ output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
148
+ cp = $&.unpack('U')[0]
149
+ (e = REVERSE_MAP[cp]) ? "&#{e};" : $&
150
+ }
151
+ when :decimal
152
+ output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
153
+ "&##{$&.unpack('U')[0]};"
154
+ }
155
+ when :hexadecimal
156
+ output = (output || string).gsub(UTF8_NON_ASCII_REGEXP) {
157
+ "&#x#{$&.unpack('U')[0].to_s(16)};"
158
+ }
159
+ end
160
+ end
161
+ return output
162
+ end
163
+
164
+ extend self
165
+
166
+ end
167
+
@@ -0,0 +1,17 @@
1
+ require 'htmlentities'
2
+
3
+ #
4
+ # This library extends the String class with methods to allow encoding and decoding of
5
+ # HTML/XML entities from/to their corresponding UTF-8 codepoints.
6
+ #
7
+ class String
8
+
9
+ def decode_entities
10
+ return HTMLEntities.decode_entities(self)
11
+ end
12
+
13
+ def encode_entities(*instructions)
14
+ return HTMLEntities.encode_entities(self, *instructions)
15
+ end
16
+
17
+ end
data/test/all.rb ADDED
@@ -0,0 +1,3 @@
1
+ Dir[File.dirname(__FILE__)+'/*_test.rb'].each do |test|
2
+ require test
3
+ end
@@ -0,0 +1,129 @@
1
+ $: << File.dirname(__FILE__) + '/../lib/'
2
+ require 'htmlentities'
3
+ require 'test/unit'
4
+
5
+ $KCODE = 'u'
6
+
7
+ class TestHTMLEntities < Test::Unit::TestCase
8
+
9
+ def test_basic_decoding
10
+ assert_decode('&', '&amp;')
11
+ assert_decode('<', '&lt;')
12
+ assert_decode('"', '&quot;')
13
+ end
14
+
15
+ def test_basic_encoding
16
+ assert_encode('&amp;', '&', :basic)
17
+ assert_encode('&quot;', '"')
18
+ assert_encode('&lt;', '<', :basic)
19
+ assert_encode('&lt;', '<')
20
+ end
21
+
22
+ def test_extended_decoding
23
+ assert_decode('±', '&plusmn;')
24
+ assert_decode('ð', '&eth;')
25
+ assert_decode('Œ', '&OElig;')
26
+ assert_decode('œ', '&oelig;')
27
+ end
28
+
29
+ def test_extended_encoding
30
+ assert_encode('&plusmn;', '±', :named)
31
+ assert_encode('&eth;', 'ð', :named)
32
+ assert_encode('&OElig;', 'Œ', :named)
33
+ assert_encode('&oelig;', 'œ', :named)
34
+ end
35
+
36
+ def test_decimal_decoding
37
+ assert_decode('“', '&#8220;')
38
+ assert_decode('…', '&#8230;')
39
+ assert_decode(' ', '&#32;')
40
+ end
41
+
42
+ def test_decimal_encoding
43
+ assert_encode('&#8220;', '“', :decimal)
44
+ assert_encode('&#8230;', '…', :decimal)
45
+ end
46
+
47
+ def test_hexadecimal_decoding
48
+ assert_decode('−', '&#x2212;')
49
+ assert_decode('—', '&#x2014;')
50
+ assert_decode('`', '&#x0060;')
51
+ assert_decode('`', '&#x60;')
52
+ end
53
+
54
+ def test_hexadecimal_encoding
55
+ assert_encode('&#x2212;', '−', :hexadecimal)
56
+ assert_encode('&#x2014;', '—', :hexadecimal)
57
+ end
58
+
59
+ def test_mixed_decoding
60
+ # Just a random headline - I needed something with accented letters.
61
+ assert_decode(
62
+ 'Le tabac pourrait bientôt être banni dans tous les lieux publics en France',
63
+ 'Le tabac pourrait bient&ocirc;t &#234;tre banni dans tous les lieux publics en France'
64
+ )
65
+ assert_decode(
66
+ '"bientôt" & 文字',
67
+ '&quot;bient&ocirc;t&quot; &amp; &#25991;&#x5b57;'
68
+ )
69
+ end
70
+
71
+ def test_mixed_encoding
72
+ assert_encode(
73
+ '&quot;bient&ocirc;t&quot; &amp; &#x6587;&#x5b57;',
74
+ '"bientôt" & 文字', :basic, :named, :hexadecimal
75
+ )
76
+ assert_encode(
77
+ '&quot;bient&ocirc;t&quot; &amp; &#25991;&#23383;',
78
+ '"bientôt" & 文字', :basic, :named, :decimal
79
+ )
80
+ end
81
+
82
+ def test_mixed_encoding_with_sort
83
+ assert_encode(
84
+ '&quot;bient&ocirc;t&quot; &amp; &#x6587;&#x5b57;',
85
+ '"bientôt" & 文字', :named, :hexadecimal, :basic
86
+ )
87
+ assert_encode(
88
+ '&quot;bient&ocirc;t&quot; &amp; &#25991;&#23383;',
89
+ '"bientôt" & 文字', :decimal, :named, :basic
90
+ )
91
+ end
92
+
93
+ def test_detect_illegal_encoding_command
94
+ assert_raise(HTMLEntities::InstructionError) {
95
+ HTMLEntities.encode_entities('foo', :bar, :baz)
96
+ }
97
+ end
98
+
99
+ def test_edge_case_decoding
100
+ assert_decode('', '')
101
+ assert_decode('&bogus;', '&bogus;')
102
+ assert_decode('&amp;', '&amp;amp;')
103
+ end
104
+
105
+ def test_edge_case_encoding
106
+ assert_encode('`', '`')
107
+ assert_encode(' ', ' ')
108
+ assert_encode('&amp;amp;', '&amp;')
109
+ assert_encode('&amp;amp;', '&amp;')
110
+ end
111
+
112
+ # Faults found and patched by Moonwolf
113
+ def test_moonwolf_decoding
114
+ assert_decode("\x2", '&#2;')
115
+ assert_decode("\xf", '&#xf;')
116
+ end
117
+
118
+ private
119
+
120
+ def assert_decode(expected, input)
121
+ assert_equal(expected, HTMLEntities.decode_entities(input))
122
+ end
123
+
124
+ def assert_encode(expected, input, *args)
125
+ assert_equal(expected, HTMLEntities.encode_entities(input, *args))
126
+ end
127
+
128
+ end
129
+
@@ -0,0 +1,24 @@
1
+ $: << File.dirname(__FILE__) + '/../lib/'
2
+ require 'htmlentities/string'
3
+ require 'test/unit'
4
+
5
+ $KCODE = 'u'
6
+
7
+ class TestHTMLEntities < Test::Unit::TestCase
8
+
9
+ def test_string_responds_correctly_to_decode_entities
10
+ assert_equal('±', '&plusmn;'.decode_entities)
11
+ end
12
+
13
+ def test_string_responds_correctly_to_encode_entities_with_no_parameters
14
+ assert_equal('&quot;', '"'.encode_entities)
15
+ end
16
+
17
+ def test_string_responds_correctly_to_encode_entities_with_multiple_parameters
18
+ assert_equal(
19
+ '&quot;bient&ocirc;t&quot; &amp; &#x6587;&#x5b57;',
20
+ '"bientôt" & 文字'.encode_entities(:basic, :named, :hexadecimal)
21
+ )
22
+ end
23
+
24
+ end
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: htmlentities
5
+ version: !ruby/object:Gem::Version
6
+ version: 3.0.0
7
+ date: 2006-04-08 00:00:00 +01:00
8
+ summary: A module for encoding and decoding of HTML/XML entities from/to their corresponding UTF-8 codepoints. Optional String class extension for same.
9
+ require_paths:
10
+ - lib
11
+ email: pbattley@gmail.com
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Paul Battley
30
+ files:
31
+ - lib/htmlentities.rb
32
+ - lib/htmlentities/string.rb
33
+ - test/all.rb
34
+ - test/entities_test.rb
35
+ - test/string_test.rb
36
+ - README
37
+ - CHANGES
38
+ - COPYING
39
+ test_files:
40
+ - test/all.rb
41
+ rdoc_options: []
42
+
43
+ extra_rdoc_files:
44
+ - README
45
+ - CHANGES
46
+ - COPYING
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ requirements: []
52
+
53
+ dependencies: []
54
+