spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74)
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
data/History.txt ADDED
@@ -0,0 +1,10 @@
1
+ == 0.10.0 2007-10-08
2
+ * proof-of-concept validator
3
+ * easier to localize error reporting
4
+ * many unit tests
5
+
6
+ == 0.1.0 / 2007-08-07
7
+
8
+ * 1 major enhancement
9
+ * Birthday!
10
+
data/Manifest.txt ADDED
@@ -0,0 +1,73 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README
4
+ Rakefile.rb
5
+ bin/html5
6
+ lib/html5.rb
7
+ lib/html5/cli.rb
8
+ lib/html5/constants.rb
9
+ lib/html5/filters/base.rb
10
+ lib/html5/filters/inject_meta_charset.rb
11
+ lib/html5/filters/iso639codes.rb
12
+ lib/html5/filters/optionaltags.rb
13
+ lib/html5/filters/rfc2046.rb
14
+ lib/html5/filters/rfc3987.rb
15
+ lib/html5/filters/sanitizer.rb
16
+ lib/html5/filters/validator.rb
17
+ lib/html5/filters/whitespace.rb
18
+ lib/html5/html5parser.rb
19
+ lib/html5/html5parser/after_after_body_phase.rb
20
+ lib/html5/html5parser/after_after_frameset_phase.rb
21
+ lib/html5/html5parser/after_body_phase.rb
22
+ lib/html5/html5parser/after_frameset_phase.rb
23
+ lib/html5/html5parser/after_head_phase.rb
24
+ lib/html5/html5parser/before_head_phase.rb
25
+ lib/html5/html5parser/before_html_phase.rb
26
+ lib/html5/html5parser/in_body_phase.rb
27
+ lib/html5/html5parser/in_caption_phase.rb
28
+ lib/html5/html5parser/in_cell_phase.rb
29
+ lib/html5/html5parser/in_column_group_phase.rb
30
+ lib/html5/html5parser/in_foreign_content_phase.rb
31
+ lib/html5/html5parser/in_frameset_phase.rb
32
+ lib/html5/html5parser/in_head_phase.rb
33
+ lib/html5/html5parser/in_row_phase.rb
34
+ lib/html5/html5parser/in_select_phase.rb
35
+ lib/html5/html5parser/in_select_table_phase.rb
36
+ lib/html5/html5parser/in_table_body_phase.rb
37
+ lib/html5/html5parser/in_table_phase.rb
38
+ lib/html5/html5parser/initial_phase.rb
39
+ lib/html5/html5parser/phase.rb
40
+ lib/html5/inputstream.rb
41
+ lib/html5/liberalxmlparser.rb
42
+ lib/html5/sanitizer.rb
43
+ lib/html5/serializer.rb
44
+ lib/html5/serializer/htmlserializer.rb
45
+ lib/html5/serializer/xhtmlserializer.rb
46
+ lib/html5/sniffer.rb
47
+ lib/html5/tokenizer.rb
48
+ lib/html5/treebuilders.rb
49
+ lib/html5/treebuilders/base.rb
50
+ lib/html5/treebuilders/hpricot.rb
51
+ lib/html5/treebuilders/rexml.rb
52
+ lib/html5/treebuilders/simpletree.rb
53
+ lib/html5/treewalkers.rb
54
+ lib/html5/treewalkers/base.rb
55
+ lib/html5/treewalkers/hpricot.rb
56
+ lib/html5/treewalkers/rexml.rb
57
+ lib/html5/treewalkers/simpletree.rb
58
+ lib/html5/version.rb
59
+ test/preamble.rb
60
+ test/test_cli.rb
61
+ test/test_encoding.rb
62
+ test/test_input_stream.rb
63
+ test/test_lxp.rb
64
+ test/test_parser.rb
65
+ test/test_sanitizer.rb
66
+ test/test_serializer.rb
67
+ test/test_sniffer.rb
68
+ test/test_stream.rb
69
+ test/test_tokenizer.rb
70
+ test/test_treewalkers.rb
71
+ test/test_validator.rb
72
+ test/tokenizer_test_parser.rb
73
+ test19.rb
data/README ADDED
@@ -0,0 +1,45 @@
1
+ html5
2
+ by Ryan King, et al
3
+ http://code.google.com/p/html5lib
4
+
5
+ == DESCRIPTION:
6
+
7
+ A ruby implementation of the parsing algorithm in HTML5.
8
+
9
+
10
+ == FEATURES/PROBLEMS:
11
+
12
+
13
+
14
+ == SYNOPSIS:
15
+
16
+ TODO
17
+
18
+ == REQUIREMENTS:
19
+
20
+ * chardet, only tested with 0.9.0
21
+
22
+ == INSTALL:
23
+
24
+ * sudo gem install html5
25
+
26
+ == LICENSE:
27
+
28
+ Copyright (c) 2006-2007 The Authors
29
+
30
+ Contributors:
31
+ James Graham - jg307@cam.ac.uk
32
+ Anne van Kesteren - annevankesteren@gmail.com
33
+ Lachlan Hunt - lachlan.hunt@lachy.id.au
34
+ Matt McDonald - kanashii@kanashii.ca
35
+ Sam Ruby - rubys@intertwingly.net
36
+ Ian Hickson (Google) - ian@hixie.ch
37
+ Thomas Broyer - t.broyer@ltgt.net
38
+ Jacques Distler - distler@golem.ph.utexas.edu
39
+ Ryan King - ryan@theryanking.com
40
+
41
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
42
+
43
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
44
+
45
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'rake'
2
+ require 'hoe'
3
+ require 'lib/html5/version'
4
+
5
+ Hoe.new("spk-html5", HTML5::VERSION) do |p|
6
+ p.name = "spk-html5"
7
+ p.description = p.paragraphs_of('README', 2).join("\n\n")
8
+ p.summary = "HTML5 parser/tokenizer."
9
+
10
+ p.author = ['Ryan King'] # TODO: add more names
11
+ p.email = 'ryan@theryanking.com'
12
+ p.url = 'http://code.google.com/p/html5lib'
13
+ p.need_zip = true
14
+
15
+ p.extra_deps << ['rchardet', '>= 1.3']
16
+ p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
17
+ end
18
+
19
+ require 'rcov/rcovtask'
20
+
21
+ namespace :test do
22
+ namespace :coverage do
23
+ desc "Delete aggregate coverage data."
24
+ task(:clean) { rm_f "coverage.data" }
25
+ end
26
+ desc 'Aggregate code coverage for unit, functional and integration tests'
27
+ Rcov::RcovTask.new(:coverage => "test:coverage:clean") do |t|
28
+ t.libs << "test"
29
+ t.test_files = FileList["test/test_*.rb"]
30
+ t.output_dir = "test/coverage/"
31
+ t.verbose = true
32
+ end
33
+ end
data/bin/html5 ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << 'lib'
4
+
5
+ require 'html5/cli'
6
+
7
+ HTML5::CLI.run
data/lib/html5.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'html5/html5parser'
2
+ require 'html5/version'
3
+
4
+ module HTML5
5
+
6
+ def self.parse(stream, options={})
7
+ HTMLParser.parse(stream, options)
8
+ end
9
+
10
+ def self.parse_fragment(stream, options={})
11
+ HTMLParser.parse_fragment(stream, options)
12
+ end
13
+ end
data/lib/html5/cli.rb ADDED
@@ -0,0 +1,248 @@
1
+ $:.unshift File.dirname(__FILE__), 'lib'
2
+ require 'html5'
3
+ require 'ostruct'
4
+ require 'optparse'
5
+
6
+ module HTML5::CLI
7
+
8
+ def self.parse_opts argv
9
+ options = OpenStruct.new
10
+ options.profile = false
11
+ options.time = false
12
+ options.output = :html
13
+ options.treebuilder = 'simpletree'
14
+ options.error = false
15
+ options.encoding = false
16
+ options.parsemethod = :parse
17
+ options.serializer = {
18
+ :encoding => 'utf-8',
19
+ :omit_optional_tags => false,
20
+ :inject_meta_charset => false
21
+ }
22
+
23
+ opts = OptionParser.new do |opts|
24
+ opts.separator ""
25
+ opts.separator "Parse Options:"
26
+
27
+ opts.on("-b", "--treebuilder NAME") do |treebuilder|
28
+ options.treebuilder = treebuilder
29
+ end
30
+
31
+ opts.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
32
+ options.parsemethod = :parse_fragment
33
+ options.container = container if container
34
+ end
35
+
36
+ opts.separator ""
37
+ opts.separator "Filter Options:"
38
+
39
+ opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
40
+ options.serializer[:inject_meta_charset] = inject
41
+ end
42
+
43
+ opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
44
+ options.serializer[:strip_whitespace] = strip
45
+ end
46
+
47
+ opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
48
+ options.serializer[:sanitize] = sanitize
49
+ end
50
+
51
+ opts.separator ""
52
+ opts.separator "Output Options:"
53
+
54
+ opts.on("--tree", "output as debug tree") do |tree|
55
+ options.output = :tree
56
+ end
57
+
58
+ opts.on("-x", "--xml", "output as xml") do |xml|
59
+ options.output = :xml
60
+ options.treebuilder = "rexml"
61
+ end
62
+
63
+ opts.on("--[no-]html", "Output as html") do |html|
64
+ options.output = (html ? :html : nil)
65
+ end
66
+
67
+ opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
68
+ options.output = :hilite
69
+ end
70
+
71
+ opts.on("-e", "--error", "Print a list of parse errors") do |error|
72
+ options.error = error
73
+ end
74
+
75
+ opts.separator ""
76
+ opts.separator "Serialization Options:"
77
+
78
+ opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
79
+ options.serializer[:omit_optional_tags] = omit
80
+ end
81
+
82
+ opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
83
+ options.serializer[:quote_attr_values] = quote
84
+ end
85
+
86
+ opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
87
+ options.serializer[:use_best_quote_char] = best
88
+ end
89
+
90
+ opts.on("--quote-char C", "Use specified quote character") do |c|
91
+ options.serializer[:quote_char] = c
92
+ end
93
+
94
+ opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
95
+ options.serializer[:minimize_boolean_attributes] = min
96
+ end
97
+
98
+ opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
99
+ options.serializer[:use_trailing_solidus] = slash
100
+ end
101
+
102
+ opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
103
+ options.serializer[:escape_lt_in_attrs] = lt
104
+ end
105
+
106
+ opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
107
+ options.serializer[:escape_rcdata] = rcdata
108
+ end
109
+
110
+ opts.separator ""
111
+ opts.separator "Other Options:"
112
+
113
+ opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
114
+ options.profile = profile
115
+ end
116
+
117
+ opts.on("-t", "--[no-]time", "Time the run") do |time|
118
+ options.time = time
119
+ end
120
+
121
+ opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
122
+ options.encoding = encoding
123
+ end
124
+
125
+ opts.on_tail("-h", "--help", "Show this message") do
126
+ puts opts
127
+ exit
128
+ end
129
+
130
+
131
+ end
132
+ opts.parse!(argv)
133
+ options
134
+ end
135
+
136
+ def self.open_input f
137
+ if f
138
+ begin
139
+ if f[0..6] == 'http://'
140
+ require 'open-uri'
141
+ f = URI.parse(f).open
142
+ encoding = f.charset
143
+ elsif f == '-'
144
+ f = $stdin
145
+ else
146
+ f = open(f)
147
+ end
148
+ rescue
149
+ end
150
+ else
151
+ $stderr.write("No filename provided. Use -h for help\n")
152
+ exit(1)
153
+ end
154
+ f
155
+ end
156
+
157
+ def self.parse(opts, args)
158
+ encoding = nil
159
+
160
+ f = open_input args.last
161
+
162
+ require 'html5/treebuilders'
163
+ treebuilder = HTML5::TreeBuilders[opts.treebuilder]
164
+
165
+ if opts.output == :xml
166
+ require 'html5/liberalxmlparser'
167
+ p = HTML5::XMLParser.new(:tree=>treebuilder)
168
+ else
169
+ require 'html5/html5parser'
170
+ p = HTML5::HTMLParser.new(:tree=>treebuilder)
171
+ end
172
+
173
+ if opts.parsemethod == :parse
174
+ args = [f, encoding]
175
+ else
176
+ args = [f, (opts.container || 'div'), encoding]
177
+ end
178
+
179
+ if opts.profile
180
+ require 'profiler'
181
+ Profiler__::start_profile
182
+ p.send(opts.parsemethod, *args)
183
+ Profiler__::stop_profile
184
+ Profiler__::print_profile($stderr)
185
+ elsif opts.time
186
+ require 'time' # TODO: switch to benchmark
187
+ t0 = Time.new
188
+ document = p.send(opts.parsemethod, *args)
189
+ t1 = Time.new
190
+ print_output(p, document, opts)
191
+ t2 = Time.new
192
+ puts "\n\nRun took: #{t1-t0}s (plus #{t2-t1}s to print the output)"
193
+ else
194
+ document = p.send(opts.parsemethod, *args)
195
+ print_output(p, document, opts)
196
+ end
197
+ end
198
+
199
+ def self.print_output(parser, document, opts)
200
+ puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding
201
+
202
+ case opts.output
203
+ when :xml
204
+ print document
205
+ when :html
206
+ require 'html5/treewalkers'
207
+ tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
208
+ require 'html5/serializer'
209
+ puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
210
+ when :hilite
211
+ print document.hilite
212
+ when :tree
213
+ document = [document] unless document.respond_to?(:each)
214
+ document.each {|fragment| puts parser.tree.testSerializer(fragment)}
215
+ end
216
+
217
+ if opts.error
218
+ errList=[]
219
+ for pos, errorcode, datavars in parser.errors
220
+ formatstring = HTML5::E[errorcode] || 'Unknown error "%(errorcode)"'
221
+ message = PythonicTemplate.new(formatstring).to_s(datavars)
222
+ errList << "Line #{pos[0]} Col #{pos[1]} " + message
223
+ end
224
+ $stdout.write("\nParse errors:\n" + errList.join("\n")+"\n")
225
+ end
226
+ end
227
+
228
+ class PythonicTemplate
229
+ # convert Python format string into a Ruby string, ready to eval
230
+ def initialize format
231
+ @format = format.dup
232
+ @format.gsub!('"', '\\"')
233
+ @format.gsub!(/%\((\w+)\)/, '#{@_\1}')
234
+ @format = '"' + @format + '"'
235
+ end
236
+
237
+ # evaluate string
238
+ def to_s(vars=nil)
239
+ vars.each {|var,value| eval "@_#{var}=#{value.dump}"} if vars
240
+ eval @format
241
+ end
242
+ end
243
+
244
+ def self.run
245
+ options = parse_opts ARGV
246
+ parse options, ARGV
247
+ end
248
+ end
data/lib/html5/constants.rb ADDED
@@ -0,0 +1,1061 @@
1
+ module HTML5
2
+
3
+ class EOF < Exception; end
4
+
5
+ def self._(str); str end
6
+
7
+ CONTENT_MODEL_FLAGS = [
8
+ :PCDATA,
9
+ :RCDATA,
10
+ :CDATA,
11
+ :PLAINTEXT
12
+ ]
13
+
14
+ SCOPING_ELEMENTS = %w[
15
+ applet
16
+ button
17
+ caption
18
+ html
19
+ marquee
20
+ object
21
+ table
22
+ td
23
+ th
24
+ ]
25
+
26
+ FORMATTING_ELEMENTS = %w[
27
+ a
28
+ b
29
+ big
30
+ em
31
+ font
32
+ i
33
+ nobr
34
+ s
35
+ small
36
+ strike
37
+ strong
38
+ tt
39
+ u
40
+ ]
41
+
42
+ SPECIAL_ELEMENTS = %w[
43
+ address
44
+ area
45
+ base
46
+ basefont
47
+ bgsound
48
+ blockquote
49
+ body
50
+ br
51
+ center
52
+ col
53
+ colgroup
54
+ dd
55
+ dir
56
+ div
57
+ dl
58
+ dt
59
+ embed
60
+ fieldset
61
+ form
62
+ frame
63
+ frameset
64
+ h1
65
+ h2
66
+ h3
67
+ h4
68
+ h5
69
+ h6
70
+ head
71
+ hr
72
+ iframe
73
+ image
74
+ img
75
+ input
76
+ isindex
77
+ li
78
+ link
79
+ listing
80
+ menu
81
+ meta
82
+ noembed
83
+ noframes
84
+ noscript
85
+ ol
86
+ optgroup
87
+ option
88
+ p
89
+ param
90
+ plaintext
91
+ pre
92
+ script
93
+ select
94
+ spacer
95
+ style
96
+ tbody
97
+ textarea
98
+ tfoot
99
+ thead
100
+ title
101
+ tr
102
+ ul
103
+ wbr
104
+ ]
105
+
106
+ SPACE_CHARACTERS = %W[
107
+ \t
108
+ \n
109
+ \x0B
110
+ \x0C
111
+ \x20
112
+ \r
113
+ ]
114
+
115
+ TABLE_INSERT_MODE_ELEMENTS = %w[
116
+ table
117
+ tbody
118
+ tfoot
119
+ thead
120
+ tr
121
+ ]
122
+
123
+ ASCII_LOWERCASE = ('a'..'z').to_a.join('')
124
+ ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
125
+ ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
126
+ DIGITS = '0'..'9'
127
+ HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
128
+
129
+ # Heading elements need to be ordered
130
+ HEADING_ELEMENTS = %w[
131
+ h1
132
+ h2
133
+ h3
134
+ h4
135
+ h5
136
+ h6
137
+ ]
138
+
139
+ # XXX What about event-source and command?
140
+ VOID_ELEMENTS = %w[
141
+ base
142
+ link
143
+ meta
144
+ hr
145
+ br
146
+ img
147
+ embed
148
+ param
149
+ area
150
+ col
151
+ input
152
+ ]
153
+
154
+ CDATA_ELEMENTS = %w[title textarea]
155
+
156
+ RCDATA_ELEMENTS = %w[
157
+ style
158
+ script
159
+ xmp
160
+ iframe
161
+ noembed
162
+ noframes
163
+ noscript
164
+ ]
165
+
166
+ BOOLEAN_ATTRIBUTES = {
167
+ :global => %w[irrelevant],
168
+ 'style' => %w[scoped],
169
+ 'img' => %w[ismap],
170
+ 'audio' => %w[autoplay controls],
171
+ 'video' => %w[autoplay controls],
172
+ 'script' => %w[defer async],
173
+ 'details' => %w[open],
174
+ 'datagrid' => %w[multiple disabled],
175
+ 'command' => %w[hidden disabled checked default],
176
+ 'menu' => %w[autosubmit],
177
+ 'fieldset' => %w[disabled readonly],
178
+ 'option' => %w[disabled readonly selected],
179
+ 'optgroup' => %w[disabled readonly],
180
+ 'button' => %w[disabled autofocus],
181
+ 'input' => %w[disabled readonly required autofocus checked ismap],
182
+ 'select' => %w[disabled readonly autofocus multiple],
183
+ 'output' => %w[disabled readonly]
184
+
185
+ }
186
+
187
+ # entitiesWindows1252 has to be _ordered_ and needs to have an index.
188
+ ENTITIES_WINDOWS1252 = [
189
+ 8364, # 0x80 0x20AC EURO SIGN
190
+ 65533, # 0x81 UNDEFINED
191
+ 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
192
+ 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
193
+ 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
194
+ 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
195
+ 8224, # 0x86 0x2020 DAGGER
196
+ 8225, # 0x87 0x2021 DOUBLE DAGGER
197
+ 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
198
+ 8240, # 0x89 0x2030 PER MILLE SIGN
199
+ 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
200
+ 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
201
+ 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
202
+ 65533, # 0x8D UNDEFINED
203
+ 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
204
+ 65533, # 0x8F UNDEFINED
205
+ 65533, # 0x90 UNDEFINED
206
+ 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
207
+ 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
208
+ 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
209
+ 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
210
+ 8226, # 0x95 0x2022 BULLET
211
+ 8211, # 0x96 0x2013 EN DASH
212
+ 8212, # 0x97 0x2014 EM DASH
213
+ 732, # 0x98 0x02DC SMALL TILDE
214
+ 8482, # 0x99 0x2122 TRADE MARK SIGN
215
+ 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
216
+ 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
217
+ 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
218
+ 65533, # 0x9D UNDEFINED
219
+ 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
220
+ 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
221
+ ]
222
+
223
+ # ENTITIES was generated from Python using the following code:
224
+ #
225
+ # import constants
226
+ # entities = constants.entities.items()
227
+ # entities.sort()
228
+ # list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
229
+ # repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
230
+ # for entity, value in entities]
231
+ # print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
232
+
233
+ ENTITIES = {
234
+ 'AElig' => "\xc3\x86",
235
+ 'AElig;' => "\xc3\x86",
236
+ 'AMP' => '&',
237
+ 'AMP;' => '&',
238
+ 'Aacute' => "\xc3\x81",
239
+ 'Aacute;' => "\xc3\x81",
240
+ 'Acirc' => "\xc3\x82",
241
+ 'Acirc;' => "\xc3\x82",
242
+ 'Agrave' => "\xc3\x80",
243
+ 'Agrave;' => "\xc3\x80",
244
+ 'Alpha;' => "\xce\x91",
245
+ 'Aring' => "\xc3\x85",
246
+ 'Aring;' => "\xc3\x85",
247
+ 'Atilde' => "\xc3\x83",
248
+ 'Atilde;' => "\xc3\x83",
249
+ 'Auml' => "\xc3\x84",
250
+ 'Auml;' => "\xc3\x84",
251
+ 'Beta;' => "\xce\x92",
252
+ 'COPY' => "\xc2\xa9",
253
+ 'COPY;' => "\xc2\xa9",
254
+ 'Ccedil' => "\xc3\x87",
255
+ 'Ccedil;' => "\xc3\x87",
256
+ 'Chi;' => "\xce\xa7",
257
+ 'Dagger;' => "\xe2\x80\xa1",
258
+ 'Delta;' => "\xce\x94",
259
+ 'ETH' => "\xc3\x90",
260
+ 'ETH;' => "\xc3\x90",
261
+ 'Eacute' => "\xc3\x89",
262
+ 'Eacute;' => "\xc3\x89",
263
+ 'Ecirc' => "\xc3\x8a",
264
+ 'Ecirc;' => "\xc3\x8a",
265
+ 'Egrave' => "\xc3\x88",
266
+ 'Egrave;' => "\xc3\x88",
267
+ 'Epsilon;' => "\xce\x95",
268
+ 'Eta;' => "\xce\x97",
269
+ 'Euml' => "\xc3\x8b",
270
+ 'Euml;' => "\xc3\x8b",
271
+ 'GT' => '>',
272
+ 'GT;' => '>',
273
+ 'Gamma;' => "\xce\x93",
274
+ 'Iacute' => "\xc3\x8d",
275
+ 'Iacute;' => "\xc3\x8d",
276
+ 'Icirc' => "\xc3\x8e",
277
+ 'Icirc;' => "\xc3\x8e",
278
+ 'Igrave' => "\xc3\x8c",
279
+ 'Igrave;' => "\xc3\x8c",
280
+ 'Iota;' => "\xce\x99",
281
+ 'Iuml' => "\xc3\x8f",
282
+ 'Iuml;' => "\xc3\x8f",
283
+ 'Kappa;' => "\xce\x9a",
284
+ 'LT' => '<',
285
+ 'LT;' => '<',
286
+ 'Lambda;' => "\xce\x9b",
287
+ 'Mu;' => "\xce\x9c",
288
+ 'Ntilde' => "\xc3\x91",
289
+ 'Ntilde;' => "\xc3\x91",
290
+ 'Nu;' => "\xce\x9d",
291
+ 'OElig;' => "\xc5\x92",
292
+ 'Oacute' => "\xc3\x93",
293
+ 'Oacute;' => "\xc3\x93",
294
+ 'Ocirc' => "\xc3\x94",
295
+ 'Ocirc;' => "\xc3\x94",
296
+ 'Ograve' => "\xc3\x92",
297
+ 'Ograve;' => "\xc3\x92",
298
+ 'Omega;' => "\xce\xa9",
299
+ 'Omicron;' => "\xce\x9f",
300
+ 'Oslash' => "\xc3\x98",
301
+ 'Oslash;' => "\xc3\x98",
302
+ 'Otilde' => "\xc3\x95",
303
+ 'Otilde;' => "\xc3\x95",
304
+ 'Ouml' => "\xc3\x96",
305
+ 'Ouml;' => "\xc3\x96",
306
+ 'Phi;' => "\xce\xa6",
307
+ 'Pi;' => "\xce\xa0",
308
+ 'Prime;' => "\xe2\x80\xb3",
309
+ 'Psi;' => "\xce\xa8",
310
+ 'QUOT' => '"',
311
+ 'QUOT;' => '"',
312
+ 'REG' => "\xc2\xae",
313
+ 'REG;' => "\xc2\xae",
314
+ 'Rho;' => "\xce\xa1",
315
+ 'Scaron;' => "\xc5\xa0",
316
+ 'Sigma;' => "\xce\xa3",
317
+ 'THORN' => "\xc3\x9e",
318
+ 'THORN;' => "\xc3\x9e",
319
+ 'TRADE;' => "\xe2\x84\xa2",
320
+ 'Tau;' => "\xce\xa4",
321
+ 'Theta;' => "\xce\x98",
322
+ 'Uacute' => "\xc3\x9a",
323
+ 'Uacute;' => "\xc3\x9a",
324
+ 'Ucirc' => "\xc3\x9b",
325
+ 'Ucirc;' => "\xc3\x9b",
326
+ 'Ugrave' => "\xc3\x99",
327
+ 'Ugrave;' => "\xc3\x99",
328
+ 'Upsilon;' => "\xce\xa5",
329
+ 'Uuml' => "\xc3\x9c",
330
+ 'Uuml;' => "\xc3\x9c",
331
+ 'Xi;' => "\xce\x9e",
332
+ 'Yacute' => "\xc3\x9d",
333
+ 'Yacute;' => "\xc3\x9d",
334
+ 'Yuml;' => "\xc5\xb8",
335
+ 'Zeta;' => "\xce\x96",
336
+ 'aacute' => "\xc3\xa1",
337
+ 'aacute;' => "\xc3\xa1",
338
+ 'acirc' => "\xc3\xa2",
339
+ 'acirc;' => "\xc3\xa2",
340
+ 'acute' => "\xc2\xb4",
341
+ 'acute;' => "\xc2\xb4",
342
+ 'aelig' => "\xc3\xa6",
343
+ 'aelig;' => "\xc3\xa6",
344
+ 'agrave' => "\xc3\xa0",
345
+ 'agrave;' => "\xc3\xa0",
346
+ 'alefsym;' => "\xe2\x84\xb5",
347
+ 'alpha;' => "\xce\xb1",
348
+ 'amp' => '&',
349
+ 'amp;' => '&',
350
+ 'and;' => "\xe2\x88\xa7",
351
+ 'ang;' => "\xe2\x88\xa0",
352
+ 'apos;' => "'",
353
+ 'aring' => "\xc3\xa5",
354
+ 'aring;' => "\xc3\xa5",
355
+ 'asymp;' => "\xe2\x89\x88",
356
+ 'atilde' => "\xc3\xa3",
357
+ 'atilde;' => "\xc3\xa3",
358
+ 'auml' => "\xc3\xa4",
359
+ 'auml;' => "\xc3\xa4",
360
+ 'bdquo;' => "\xe2\x80\x9e",
361
+ 'beta;' => "\xce\xb2",
362
+ 'brvbar' => "\xc2\xa6",
363
+ 'brvbar;' => "\xc2\xa6",
364
+ 'bull;' => "\xe2\x80\xa2",
365
+ 'cap;' => "\xe2\x88\xa9",
366
+ 'ccedil' => "\xc3\xa7",
367
+ 'ccedil;' => "\xc3\xa7",
368
+ 'cedil' => "\xc2\xb8",
369
+ 'cedil;' => "\xc2\xb8",
370
+ 'cent' => "\xc2\xa2",
371
+ 'cent;' => "\xc2\xa2",
372
+ 'chi;' => "\xcf\x87",
373
+ 'circ;' => "\xcb\x86",
374
+ 'clubs;' => "\xe2\x99\xa3",
375
+ 'cong;' => "\xe2\x89\x85",
376
+ 'copy' => "\xc2\xa9",
377
+ 'copy;' => "\xc2\xa9",
378
+ 'crarr;' => "\xe2\x86\xb5",
379
+ 'cup;' => "\xe2\x88\xaa",
380
+ 'curren' => "\xc2\xa4",
381
+ 'curren;' => "\xc2\xa4",
382
+ 'dArr;' => "\xe2\x87\x93",
383
+ 'dagger;' => "\xe2\x80\xa0",
384
+ 'darr;' => "\xe2\x86\x93",
385
+ 'deg' => "\xc2\xb0",
386
+ 'deg;' => "\xc2\xb0",
387
+ 'delta;' => "\xce\xb4",
388
+ 'diams;' => "\xe2\x99\xa6",
389
+ 'divide' => "\xc3\xb7",
390
+ 'divide;' => "\xc3\xb7",
391
+ 'eacute' => "\xc3\xa9",
392
+ 'eacute;' => "\xc3\xa9",
393
+ 'ecirc' => "\xc3\xaa",
394
+ 'ecirc;' => "\xc3\xaa",
395
+ 'egrave' => "\xc3\xa8",
396
+ 'egrave;' => "\xc3\xa8",
397
+ 'empty;' => "\xe2\x88\x85",
398
+ 'emsp;' => "\xe2\x80\x83",
399
+ 'ensp;' => "\xe2\x80\x82",
400
+ 'epsilon;' => "\xce\xb5",
401
+ 'equiv;' => "\xe2\x89\xa1",
402
+ 'eta;' => "\xce\xb7",
403
+ 'eth' => "\xc3\xb0",
404
+ 'eth;' => "\xc3\xb0",
405
+ 'euml' => "\xc3\xab",
406
+ 'euml;' => "\xc3\xab",
407
+ 'euro;' => "\xe2\x82\xac",
408
+ 'exist;' => "\xe2\x88\x83",
409
+ 'fnof;' => "\xc6\x92",
410
+ 'forall;' => "\xe2\x88\x80",
411
+ 'frac12' => "\xc2\xbd",
412
+ 'frac12;' => "\xc2\xbd",
413
+ 'frac14' => "\xc2\xbc",
414
+ 'frac14;' => "\xc2\xbc",
415
+ 'frac34' => "\xc2\xbe",
416
+ 'frac34;' => "\xc2\xbe",
417
+ 'frasl;' => "\xe2\x81\x84",
418
+ 'gamma;' => "\xce\xb3",
419
+ 'ge;' => "\xe2\x89\xa5",
420
+ 'gt' => '>',
421
+ 'gt;' => '>',
422
+ 'hArr;' => "\xe2\x87\x94",
423
+ 'harr;' => "\xe2\x86\x94",
424
+ 'hearts;' => "\xe2\x99\xa5",
425
+ 'hellip;' => "\xe2\x80\xa6",
426
+ 'iacute' => "\xc3\xad",
427
+ 'iacute;' => "\xc3\xad",
428
+ 'icirc' => "\xc3\xae",
429
+ 'icirc;' => "\xc3\xae",
430
+ 'iexcl' => "\xc2\xa1",
431
+ 'iexcl;' => "\xc2\xa1",
432
+ 'igrave' => "\xc3\xac",
433
+ 'igrave;' => "\xc3\xac",
434
+ 'image;' => "\xe2\x84\x91",
435
+ 'infin;' => "\xe2\x88\x9e",
436
+ 'int;' => "\xe2\x88\xab",
437
+ 'iota;' => "\xce\xb9",
438
+ 'iquest' => "\xc2\xbf",
439
+ 'iquest;' => "\xc2\xbf",
440
+ 'isin;' => "\xe2\x88\x88",
441
+ 'iuml' => "\xc3\xaf",
442
+ 'iuml;' => "\xc3\xaf",
443
+ 'kappa;' => "\xce\xba",
444
+ 'lArr;' => "\xe2\x87\x90",
445
+ 'lambda;' => "\xce\xbb",
446
+ 'lang;' => "\xe2\x9f\xa8",
447
+ 'laquo' => "\xc2\xab",
448
+ 'laquo;' => "\xc2\xab",
449
+ 'larr;' => "\xe2\x86\x90",
450
+ 'lceil;' => "\xe2\x8c\x88",
451
+ 'ldquo;' => "\xe2\x80\x9c",
452
+ 'le;' => "\xe2\x89\xa4",
453
+ 'lfloor;' => "\xe2\x8c\x8a",
454
+ 'lowast;' => "\xe2\x88\x97",
455
+ 'loz;' => "\xe2\x97\x8a",
456
+ 'lrm;' => "\xe2\x80\x8e",
457
+ 'lsaquo;' => "\xe2\x80\xb9",
458
+ 'lsquo;' => "\xe2\x80\x98",
459
+ 'lt' => '<',
460
+ 'lt;' => '<',
461
+ 'macr' => "\xc2\xaf",
462
+ 'macr;' => "\xc2\xaf",
463
+ 'mdash;' => "\xe2\x80\x94",
464
+ 'micro' => "\xc2\xb5",
465
+ 'micro;' => "\xc2\xb5",
466
+ 'middot' => "\xc2\xb7",
467
+ 'middot;' => "\xc2\xb7",
468
+ 'minus;' => "\xe2\x88\x92",
469
+ 'mu;' => "\xce\xbc",
470
+ 'nabla;' => "\xe2\x88\x87",
471
+ 'nbsp' => "\xc2\xa0",
472
+ 'nbsp;' => "\xc2\xa0",
473
+ 'ndash;' => "\xe2\x80\x93",
474
+ 'ne;' => "\xe2\x89\xa0",
475
+ 'ni;' => "\xe2\x88\x8b",
476
+ 'not' => "\xc2\xac",
477
+ 'not;' => "\xc2\xac",
478
+ 'notin;' => "\xe2\x88\x89",
479
+ 'nsub;' => "\xe2\x8a\x84",
480
+ 'ntilde' => "\xc3\xb1",
481
+ 'ntilde;' => "\xc3\xb1",
482
+ 'nu;' => "\xce\xbd",
483
+ 'oacute' => "\xc3\xb3",
484
+ 'oacute;' => "\xc3\xb3",
485
+ 'ocirc' => "\xc3\xb4",
486
+ 'ocirc;' => "\xc3\xb4",
487
+ 'oelig;' => "\xc5\x93",
488
+ 'ograve' => "\xc3\xb2",
489
+ 'ograve;' => "\xc3\xb2",
490
+ 'oline;' => "\xe2\x80\xbe",
491
+ 'omega;' => "\xcf\x89",
492
+ 'omicron;' => "\xce\xbf",
493
+ 'oplus;' => "\xe2\x8a\x95",
494
+ 'or;' => "\xe2\x88\xa8",
495
+ 'ordf' => "\xc2\xaa",
496
+ 'ordf;' => "\xc2\xaa",
497
+ 'ordm' => "\xc2\xba",
498
+ 'ordm;' => "\xc2\xba",
499
+ 'oslash' => "\xc3\xb8",
500
+ 'oslash;' => "\xc3\xb8",
501
+ 'otilde' => "\xc3\xb5",
502
+ 'otilde;' => "\xc3\xb5",
503
+ 'otimes;' => "\xe2\x8a\x97",
504
+ 'ouml' => "\xc3\xb6",
505
+ 'ouml;' => "\xc3\xb6",
506
+ 'para' => "\xc2\xb6",
507
+ 'para;' => "\xc2\xb6",
508
+ 'part;' => "\xe2\x88\x82",
509
+ 'permil;' => "\xe2\x80\xb0",
510
+ 'perp;' => "\xe2\x8a\xa5",
511
+ 'phi;' => "\xcf\x86",
512
+ 'pi;' => "\xcf\x80",
513
+ 'piv;' => "\xcf\x96",
514
+ 'plusmn' => "\xc2\xb1",
515
+ 'plusmn;' => "\xc2\xb1",
516
+ 'pound' => "\xc2\xa3",
517
+ 'pound;' => "\xc2\xa3",
518
+ 'prime;' => "\xe2\x80\xb2",
519
+ 'prod;' => "\xe2\x88\x8f",
520
+ 'prop;' => "\xe2\x88\x9d",
521
+ 'psi;' => "\xcf\x88",
522
+ 'quot' => '"',
523
+ 'quot;' => '"',
524
+ 'rArr;' => "\xe2\x87\x92",
525
+ 'radic;' => "\xe2\x88\x9a",
526
+ 'rang;' => "\xe2\x9f\xa9",
527
+ 'raquo' => "\xc2\xbb",
528
+ 'raquo;' => "\xc2\xbb",
529
+ 'rarr;' => "\xe2\x86\x92",
530
+ 'rceil;' => "\xe2\x8c\x89",
531
+ 'rdquo;' => "\xe2\x80\x9d",
532
+ 'real;' => "\xe2\x84\x9c",
533
+ 'reg' => "\xc2\xae",
534
+ 'reg;' => "\xc2\xae",
535
+ 'rfloor;' => "\xe2\x8c\x8b",
536
+ 'rho;' => "\xcf\x81",
537
+ 'rlm;' => "\xe2\x80\x8f",
538
+ 'rsaquo;' => "\xe2\x80\xba",
539
+ 'rsquo;' => "\xe2\x80\x99",
540
+ 'sbquo;' => "\xe2\x80\x9a",
541
+ 'scaron;' => "\xc5\xa1",
542
+ 'sdot;' => "\xe2\x8b\x85",
543
+ 'sect' => "\xc2\xa7",
544
+ 'sect;' => "\xc2\xa7",
545
+ 'shy' => "\xc2\xad",
546
+ 'shy;' => "\xc2\xad",
547
+ 'sigma;' => "\xcf\x83",
548
+ 'sigmaf;' => "\xcf\x82",
549
+ 'sim;' => "\xe2\x88\xbc",
550
+ 'spades;' => "\xe2\x99\xa0",
551
+ 'sub;' => "\xe2\x8a\x82",
552
+ 'sube;' => "\xe2\x8a\x86",
553
+ 'sum;' => "\xe2\x88\x91",
554
+ 'sup1' => "\xc2\xb9",
555
+ 'sup1;' => "\xc2\xb9",
556
+ 'sup2' => "\xc2\xb2",
557
+ 'sup2;' => "\xc2\xb2",
558
+ 'sup3' => "\xc2\xb3",
559
+ 'sup3;' => "\xc2\xb3",
560
+ 'sup;' => "\xe2\x8a\x83",
561
+ 'supe;' => "\xe2\x8a\x87",
562
+ 'szlig' => "\xc3\x9f",
563
+ 'szlig;' => "\xc3\x9f",
564
+ 'tau;' => "\xcf\x84",
565
+ 'there4;' => "\xe2\x88\xb4",
566
+ 'theta;' => "\xce\xb8",
567
+ 'thetasym;' => "\xcf\x91",
568
+ 'thinsp;' => "\xe2\x80\x89",
569
+ 'thorn' => "\xc3\xbe",
570
+ 'thorn;' => "\xc3\xbe",
571
+ 'tilde;' => "\xcb\x9c",
572
+ 'times' => "\xc3\x97",
573
+ 'times;' => "\xc3\x97",
574
+ 'trade;' => "\xe2\x84\xa2",
575
+ 'uArr;' => "\xe2\x87\x91",
576
+ 'uacute' => "\xc3\xba",
577
+ 'uacute;' => "\xc3\xba",
578
+ 'uarr;' => "\xe2\x86\x91",
579
+ 'ucirc' => "\xc3\xbb",
580
+ 'ucirc;' => "\xc3\xbb",
581
+ 'ugrave' => "\xc3\xb9",
582
+ 'ugrave;' => "\xc3\xb9",
583
+ 'uml' => "\xc2\xa8",
584
+ 'uml;' => "\xc2\xa8",
585
+ 'upsih;' => "\xcf\x92",
586
+ 'upsilon;' => "\xcf\x85",
587
+ 'uuml' => "\xc3\xbc",
588
+ 'uuml;' => "\xc3\xbc",
589
+ 'weierp;' => "\xe2\x84\x98",
590
+ 'xi;' => "\xce\xbe",
591
+ 'yacute' => "\xc3\xbd",
592
+ 'yacute;' => "\xc3\xbd",
593
+ 'yen' => "\xc2\xa5",
594
+ 'yen;' => "\xc2\xa5",
595
+ 'yuml' => "\xc3\xbf",
596
+ 'yuml;' => "\xc3\xbf",
597
+ 'zeta;' => "\xce\xb6",
598
+ 'zwj;' => "\xe2\x80\x8d",
599
+ 'zwnj;' => "\xe2\x80\x8c"
600
+ }
601
+
602
# Character-encoding labels known to the library, all lower case.
# Each physical line below holds one family of aliases; element order is
# unchanged from the one-label-per-line layout, so index-based access and
# iteration order behave exactly as before.
# NOTE(review): presumably matched against normalised (down-cased) charset
# names from <meta> / HTTP headers -- confirm against the callers.
ENCODINGS = %w[
  ansi_x3.4-1968 iso-ir-6 ansi_x3.4-1986 iso_646.irv:1991 ascii iso646-us us-ascii us ibm367 cp367 csascii
  ks_c_5601-1987 korean iso-2022-kr csiso2022kr euc-kr
  iso-2022-jp csiso2022jp iso-2022-jp-2
  iso-ir-58 chinese csiso58gb231280
  iso_8859-1:1987 iso-ir-100 iso_8859-1 iso-8859-1 latin1 l1 ibm819 cp819 csisolatin1
  iso_8859-2:1987 iso-ir-101 iso_8859-2 iso-8859-2 latin2 l2 csisolatin2
  iso_8859-3:1988 iso-ir-109 iso_8859-3 iso-8859-3 latin3 l3 csisolatin3
  iso_8859-4:1988 iso-ir-110 iso_8859-4 iso-8859-4 latin4 l4 csisolatin4
  iso_8859-6:1987 iso-ir-127 iso_8859-6 iso-8859-6 ecma-114 asmo-708 arabic csisolatinarabic
  iso_8859-7:1987 iso-ir-126 iso_8859-7 iso-8859-7 elot_928 ecma-118 greek greek8 csisolatingreek
  iso_8859-8:1988 iso-ir-138 iso_8859-8 iso-8859-8 hebrew csisolatinhebrew
  iso_8859-5:1988 iso-ir-144 iso_8859-5 iso-8859-5 cyrillic csisolatincyrillic
  iso_8859-9:1989 iso-ir-148 iso_8859-9 iso-8859-9 latin5 l5 csisolatin5
  iso-8859-10 iso-ir-157 l6 iso_8859-10:1992 csisolatin6 latin6
  hp-roman8 roman8 r8
  ibm037 cp037 csibm037
  ibm424 cp424 csibm424
  ibm437 cp437 437 cspc8codepage437
  ibm500 cp500 csibm500
  ibm775 cp775 cspc775baltic
  ibm850 cp850 850 cspc850multilingual
  ibm852 cp852 852 cspcp852
  ibm855 cp855 855 csibm855
  ibm857 cp857 857 csibm857
  ibm860 cp860 860 csibm860
  ibm861 cp861 861 cp-is csibm861
  ibm862 cp862 862 cspc862latinhebrew
  ibm863 cp863 863 csibm863
  ibm864 cp864 csibm864
  ibm865 cp865 865 csibm865
  ibm866 cp866 866 csibm866
  ibm869 cp869 869 cp-gr csibm869
  ibm1026 cp1026 csibm1026
  koi8-r cskoi8r koi8-u
  big5-hkscs
  ptcp154 csptcp154 pt154 cp154
  utf-7 utf-16be utf-16le utf-16 utf-8
  iso-8859-13
  iso-8859-14 iso-ir-199 iso_8859-14:1998 iso_8859-14 latin8 iso-celtic l8
  iso-8859-15 iso_8859-15
  iso-8859-16 iso-ir-226 iso_8859-16:2001 iso_8859-16 latin10 l10
  gbk cp936 ms936
  gb18030
  shift_jis ms_kanji csshiftjis
  euc-jp
  gb2312
  big5 csbig5
  windows-1250 windows-1251 windows-1252 windows-1253 windows-1254
  windows-1255 windows-1256 windows-1257 windows-1258
  tis-620
  hz-gb-2312
]
820
+
821
# Parse-error catalogue: maps a symbolic error code (String) to the
# human-readable message template for that error. Every template is passed
# through `_` (defined elsewhere in this file).
#
# Templates carry named placeholders written `%(key)` (e.g. `%(name)`,
# `%(gotName)`, `%(data)`); two numeric-entity messages additionally carry
# the suffix `08x` after `%(charAsInt)`.
# NOTE(review): the substitution itself happens in the caller, which is not
# visible here -- confirm how (or whether) the `08x` suffix is interpreted.
E = {
  # --- input stream / entities ------------------------------------------
  "null-character" =>
    _("Null character in input stream, replaced with U+FFFD."),
  "incorrectly-placed-solidus" =>
    _("Solidus (/) incorrectly placed in tag."),
  "incorrect-cr-newline-entity" =>
    _("Incorrect CR newline entity, replaced with LF."),
  "illegal-windows-1252-entity" =>
    _("Entity used with illegal number (windows-1252 reference)."),
  "cant-convert-numeric-entity" =>
    _("Numeric entity couldn't be converted to character (codepoint U+%(charAsInt)08x)."),
  # Message previously read "codepoint=> " -- a hash-rocket had leaked into
  # the prose where a colon was intended.
  "illegal-codepoint-for-numeric-entity" =>
    _("Numeric entity represents an illegal codepoint: U+%(charAsInt)08x."),
  "numeric-entity-without-semicolon" =>
    _("Numeric entity didn't end with ';'."),
  "expected-numeric-entity-but-got-eof" =>
    _("Numeric entity expected. Got end of file instead."),
  "expected-numeric-entity" =>
    _("Numeric entity expected but none found."),
  "named-entity-without-semicolon" =>
    _("Named entity didn't end with ';'."),
  "expected-named-entity" =>
    _("Named entity expected. Got none."),

  # --- tags and attributes ----------------------------------------------
  "attributes-in-end-tag" =>
    _("End tag contains unexpected attributes."),
  "expected-tag-name-but-got-right-bracket" =>
    _("Expected tag name. Got '>' instead."),
  "expected-tag-name-but-got-question-mark" =>
    _("Expected tag name. Got '?' instead. (HTML doesn't support processing instructions.)"),
  "expected-tag-name" =>
    _("Expected tag name. Got something else instead"),
  "expected-closing-tag-but-got-right-bracket" =>
    _("Expected closing tag. Got '>' instead. Ignoring '</>'."),
  "expected-closing-tag-but-got-eof" =>
    _("Expected closing tag. Unexpected end of file."),
  "expected-closing-tag-but-got-char" =>
    _("Expected closing tag. Unexpected character '%(data)' found."),
  "eof-in-tag-name" =>
    _("Unexpected end of file in the tag name."),
  "expected-attribute-name-but-got-eof" =>
    _("Unexpected end of file. Expected attribute name instead."),
  "eof-in-attribute-name" =>
    _("Unexpected end of file in attribute name."),
  "duplicate-attribute" =>
    _("Dropped duplicate attribute on tag."),
  "expected-end-of-tag-name-but-got-eof" =>
    _("Unexpected end of file. Expected = or end of tag."),
  "expected-attribute-value-but-got-eof" =>
    _("Unexpected end of file. Expected attribute value."),
  "eof-in-attribute-value-double-quote" =>
    _("Unexpected end of file in attribute value (\")."),
  "eof-in-attribute-value-single-quote" =>
    _("Unexpected end of file in attribute value (')."),
  "eof-in-attribute-value-no-quotes" =>
    _("Unexpected end of file in attribute value."),

  # --- comments ------------------------------------------------------------
  "expected-dashes-or-doctype" =>
    _("Expected '--' or 'DOCTYPE'. Not found."),
  "incorrect-comment" =>
    _("Incorrect comment."),
  "eof-in-comment" =>
    _("Unexpected end of file in comment."),
  "eof-in-comment-end-dash" =>
    _("Unexpected end of file in comment (-)"),
  "unexpected-dash-after-double-dash-in-comment" =>
    _("Unexpected '-' after '--' found in comment."),
  "eof-in-comment-double-dash" =>
    _("Unexpected end of file in comment (--)."),
  "unexpected-char-in-comment" =>
    _("Unexpected character in comment found."),

  # --- DOCTYPE ---------------------------------------------------------------
  "need-space-after-doctype" =>
    _("No space after literal string 'DOCTYPE'."),
  "expected-doctype-name-but-got-right-bracket" =>
    _("Unexpected > character. Expected DOCTYPE name."),
  "expected-doctype-name-but-got-eof" =>
    _("Unexpected end of file. Expected DOCTYPE name."),
  "eof-in-doctype-name" =>
    _("Unexpected end of file in DOCTYPE name."),
  "eof-in-doctype" =>
    _("Unexpected end of file in DOCTYPE."),
  "expected-space-or-right-bracket-in-doctype" =>
    _("Expected space or '>'. Got '%(data)'"),
  "unexpected-end-of-doctype" =>
    _("Unexpected end of DOCTYPE."),
  "unexpected-char-in-doctype" =>
    _("Unexpected character in DOCTYPE."),
  "eof-in-bogus-doctype" =>
    _("Unexpected end of file in bogus doctype."),
  "eof-in-innerhtml" =>
    _("Unexpected EOF in inner html mode."),
  "unexpected-doctype" =>
    _("Unexpected DOCTYPE. Ignored."),
  "non-html-root" =>
    _("html needs to be the first start tag."),
  "expected-doctype-but-got-eof" =>
    _("Unexpected End of file. Expected DOCTYPE."),
  "unknown-doctype" =>
    _("Erroneous DOCTYPE."),
  "expected-doctype-but-got-chars" =>
    _("Unexpected non-space characters. Expected DOCTYPE."),
  "expected-doctype-but-got-start-tag" =>
    _("Unexpected start tag (%(name)). Expected DOCTYPE."),
  "expected-doctype-but-got-end-tag" =>
    _("Unexpected end tag (%(name)). Expected DOCTYPE."),

  # --- tree construction: general -----------------------------------------
  "end-tag-after-implied-root" =>
    _("Unexpected end tag (%(name)) after the (implied) root element."),
  "expected-named-closing-tag-but-got-eof" =>
    _("Unexpected end of file. Expected end tag (%(name))."),
  "two-heads-are-not-better-than-one" =>
    _("Unexpected start tag head in existing head. Ignored."),
  "unexpected-end-tag" =>
    _("Unexpected end tag (%(name)). Ignored."),
  "unexpected-start-tag-out-of-my-head" =>
    _("Unexpected start tag (%(name)) that can be in head. Moved."),
  "unexpected-start-tag" =>
    _("Unexpected start tag (%(name))."),
  "missing-end-tag" =>
    _("Missing end tag (%(name))."),
  "missing-end-tags" =>
    _("Missing end tags (%(name))."),
  "unexpected-start-tag-implies-end-tag" =>
    _("Unexpected start tag (%(startName)) implies end tag (%(endName))."),
  "unexpected-start-tag-treated-as" =>
    _("Unexpected start tag (%(originalName)). Treated as %(newName)."),
  "deprecated-tag" =>
    _("Unexpected start tag %(name). Don't use it!"),
  "unexpected-start-tag-ignored" =>
    _("Unexpected start tag %(name). Ignored."),
  "expected-one-end-tag-but-got-another" =>
    _("Unexpected end tag (%(gotName)). Missing end tag (%(expectedName))."),
  "end-tag-too-early" =>
    _("End tag (%(name)) seen too early. Expected other end tag."),
  "end-tag-too-early-named" =>
    _("Unexpected end tag (%(gotName)). Expected end tag (%(expectedName))."),
  "end-tag-too-early-ignored" =>
    _("End tag (%(name)) seen too early. Ignored."),
  "adoption-agency-1.1" =>
    _("End tag (%(name)) violates step 1, paragraph 1 of the adoption agency algorithm."),
  "adoption-agency-1.2" =>
    _("End tag (%(name)) violates step 1, paragraph 2 of the adoption agency algorithm."),
  "adoption-agency-1.3" =>
    _("End tag (%(name)) violates step 1, paragraph 3 of the adoption agency algorithm."),
  "unexpected-end-tag-treated-as" =>
    _("Unexpected end tag (%(originalName)). Treated as %(newName)."),
  "no-end-tag" =>
    _("This element (%(name)) has no end tag."),

  # --- tree construction: tables -------------------------------------------
  "unexpected-implied-end-tag-in-table" =>
    _("Unexpected implied end tag (%(name)) in the table phase."),
  "unexpected-implied-end-tag-in-table-body" =>
    _("Unexpected implied end tag (%(name)) in the table body phase."),
  "unexpected-char-implies-table-voodoo" =>
    _("Unexpected non-space characters in table context caused voodoo mode."),
  # NOTE(review): "unpexted" is a misspelling that is part of the existing
  # key; it is kept so current lookups keep working, and the correctly
  # spelled key is provided as an alias. Confirm which spelling the parser
  # phases actually report, then retire the other.
  "unpexted-hidden-input-in-table" =>
    _("Unexpected input with type hidden in table context."),
  "unexpected-hidden-input-in-table" =>
    _("Unexpected input with type hidden in table context."),
  "unexpected-start-tag-implies-table-voodoo" =>
    _("Unexpected start tag (%(name)) in table context caused voodoo mode."),
  "unexpected-end-tag-implies-table-voodoo" =>
    _("Unexpected end tag (%(name)) in table context caused voodoo mode."),
  "unexpected-cell-in-table-body" =>
    _("Unexpected table cell start tag (%(name)) in the table body phase."),
  "unexpected-cell-end-tag" =>
    _("Got table cell end tag (%(name)) while required end tags are missing."),
  "unexpected-end-tag-in-table-body" =>
    _("Unexpected end tag (%(name)) in the table body phase. Ignored."),
  "unexpected-implied-end-tag-in-table-row" =>
    _("Unexpected implied end tag (%(name)) in the table row phase."),
  "unexpected-end-tag-in-table-row" =>
    _("Unexpected end tag (%(name)) in the table row phase. Ignored."),

  # --- tree construction: select ---------------------------------------------
  "unexpected-select-in-select" =>
    _("Unexpected select start tag in the select phase treated as select end tag."),
  "unexpected-input-in-select" =>
    _("Unexpected input start tag in the select phase."),
  "unexpected-start-tag-in-select" =>
    _("Unexpected start tag token (%(name)) in the select phase. Ignored."),
  "unexpected-end-tag-in-select" =>
    _("Unexpected end tag (%(name)) in the select phase. Ignored."),
  # The next two templates previously used `%(name)s`; every other template
  # in this table uses the bare `%(name)` form, so the trailing `s` has been
  # dropped to match.
  "unexpected-table-element-start-tag-in-select-in-table" =>
    _("Unexpected table element start tag (%(name)) in the select in table phase."),
  "unexpected-table-element-end-tag-in-select-in-table" =>
    _("Unexpected table element end tag (%(name)) in the select in table phase."),

  # --- tree construction: after body / frameset / EOF ------------------------
  "unexpected-char-after-body" =>
    _("Unexpected non-space characters in the after body phase."),
  "unexpected-start-tag-after-body" =>
    _("Unexpected start tag token (%(name)) in the after body phase."),
  "unexpected-end-tag-after-body" =>
    _("Unexpected end tag token (%(name)) in the after body phase."),
  # Spelling corrected (was "Unepxected").
  "unexpected-char-in-frameset" =>
    _("Unexpected characters in the frameset phase. Characters ignored."),
  "unexpected-start-tag-in-frameset" =>
    _("Unexpected start tag token (%(name)) in the frameset phase. Ignored."),
  "unexpected-frameset-in-frameset-innerhtml" =>
    _("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."),
  "unexpected-end-tag-in-frameset" =>
    _("Unexpected end tag token (%(name)) in the frameset phase. Ignored."),
  "unexpected-char-after-frameset" =>
    _("Unexpected non-space characters in the after frameset phase. Ignored."),
  "unexpected-start-tag-after-frameset" =>
    _("Unexpected start tag (%(name)) in the after frameset phase. Ignored."),
  "unexpected-end-tag-after-frameset" =>
    _("Unexpected end tag (%(name)) in the after frameset phase. Ignored."),
  "expected-eof-but-got-char" =>
    _("Unexpected non-space characters. Expected end of file."),
  "expected-eof-but-got-start-tag" =>
    _("Unexpected start tag (%(name)). Expected end of file."),
  "expected-eof-but-got-end-tag" =>
    _("Unexpected end tag (%(name)). Expected end of file."),

  # --- miscellaneous ---------------------------------------------------------
  "unexpected-end-table-in-caption" =>
    _("Unexpected end table tag in caption. Generates implied end caption."),
  "end-html-in-innerhtml" =>
    _("Unexpected html end tag in inner html mode."),
  "expected-self-closing-tag" =>
    _("Expected a > after the /."),
  "self-closing-end-tag" =>
    _("Self closing end tag."),
  "eof-in-table" =>
    _("Unexpected end of file. Expected table content."),
  "html-in-foreign-content" =>
    _("HTML start tag \"%(name)\" in a foreign namespace context."),
  "unexpected-start-tag-in-table" =>
    _("Unexpected %(name). Expected table content."),
}
1060
+
1061
+ end