spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,36 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
module HTML5
  module Filters
    # Token filter that collapses whitespace in the wrapped token stream,
    # except inside elements listed in SPACE_PRESERVE_ELEMENTS.
    class WhitespaceFilter < Base

      # Names of elements whose descendants keep their whitespace untouched.
      SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS

      # Matches one or more consecutive characters from SPACE_CHARACTERS.
      SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m

      # Iterates over the wrapped object's tokens (via __getobj__) and yields
      # each one after whitespace normalisation.
      #
      # `preserve` counts how many start tags deep we are inside a
      # whitespace-preserving element; while it is non-zero tokens are passed
      # through unchanged.  Outside such elements:
      # * a non-empty :SpaceCharacters token has its data replaced by " ";
      # * every whitespace run in a :Characters token is replaced by " ".
      #
      # Tokens are modified in place before being yielded.
      def each
        preserve = 0
        __getobj__.each do |token|
          case token[:type]
          when :StartTag
            # Once inside a preserving element, every nested start tag
            # deepens the counter so the matching end tags unwind it.
            if preserve > 0 || SPACE_PRESERVE_ELEMENTS.include?(token[:name])
              preserve += 1
            end

          when :EndTag
            preserve -= 1 if preserve > 0

          when :SpaceCharacters
            # Do not turn an empty token into a space that was never there.
            if preserve == 0 && token[:data] && !token[:data].empty?
              token[:data] = " "
            end

          when :Characters
            # gsub, not sub: every whitespace run must be collapsed, not
            # only the first one in the token.
            token[:data] = token[:data].gsub(SPACES, ' ') if preserve == 0
          end

          yield token
        end
      end
    end
  end
end
36
+
@@ -0,0 +1,247 @@
1
+ require 'html5/constants'
2
+ require 'html5/tokenizer'
3
+ require 'html5/treebuilders/rexml'
4
+
5
+ Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
6
+ require 'html5/html5parser/' + File.basename(path)
7
+ end
8
+
9
module HTML5

  # Error in parsed document. Raised by HTMLParser#parse_error when the
  # parser was created with :strict => true.
  # NOTE(review): both classes derive from Exception, so a bare `rescue`
  # will not catch them -- confirm whether StandardError was intended.
  class ParseError < Exception; end
  class AssertionError < Exception; end

  # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
  #
  class HTMLParser

    attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table, :secondary_phase

    attr_reader :phases, :tokenizer, :tree, :errors

    # Convenience entry point: builds a parser from +options+ and parses
    # +stream+ as a full document.
    #
    # The :encoding option is handed to #parse rather than to the
    # constructor. The caller's hash is copied first so it is not modified.
    def self.parse(stream, options = {})
      options = options.dup
      encoding = options.delete(:encoding)
      new(options).parse(stream, encoding)
    end

    # Convenience entry point: builds a parser from +options+ and parses
    # +stream+ as a fragment.
    #
    # :container (default 'div') and :encoding are handed to #parse_fragment
    # rather than to the constructor. The caller's hash is copied first so it
    # is not modified.
    def self.parse_fragment(stream, options = {})
      options = options.dup
      container = options.delete(:container) || 'div'
      encoding = options.delete(:encoding)
      new(options).parse_fragment(stream, container, encoding)
    end

    # Phase names; each maps to the HTML5 constant "<Name>Phase" (first
    # letter upcased) in #initialize.
    @@phases = %w( initial beforeHtml beforeHead inHead afterHead inBody inTable inCaption
      inColumnGroup inTableBody inRow inCell inSelect inSelectInTable afterBody inFrameset
      afterFrameset afterAfterBody afterAfterFrameset inForeignContent)

    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
    # HTML5::TreeBuilders[treeType]
    #
    # Every key in +options+ is stored as an instance variable of the same
    # name, overriding the defaults set below.
    def initialize(options = {})
      @strict = false
      @errors = []

      @tokenizer = HTMLTokenizer
      @tree = TreeBuilders::REXML::TreeBuilder

      options.each {|name, value| instance_variable_set("@#{name}", value) }
      @lowercase_attr_name = nil unless instance_variable_defined?("@lowercase_attr_name")
      @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")

      @tree = @tree.new

      @phases = @@phases.inject({}) do |phases, phase_name|
        phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
        phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
        phases
      end
    end

    # Shared driver behind #parse and #parse_fragment.
    #
    # Resets the tree and error list, instantiates the tokenizer on +stream+,
    # selects the starting phase, then feeds every normalised token to the
    # current phase. After the last token the current phase's process_eof
    # is called.
    #
    # inner_html - true when parsing a fragment
    # container  - element name used to pick the tokenizer's content model
    #              flag in the fragment case
    def _parse(stream, inner_html, encoding, container = 'div')
      @tree.reset
      @first_start_tag = false
      @errors = []

      # After a previous run @tokenizer holds an instance; go back to its
      # class so a fresh tokenizer can be created for this stream.
      @tokenizer = @tokenizer.class unless Class === @tokenizer
      @tokenizer = @tokenizer.new(stream, :encoding => encoding,
        :parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)

      if inner_html
        case @inner_html = container.downcase
        when 'title', 'textarea'
          @tokenizer.content_model_flag = :RCDATA
        when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
          @tokenizer.content_model_flag = :CDATA
        when 'plaintext'
          @tokenizer.content_model_flag = :PLAINTEXT
        else
          # content_model_flag already is PCDATA
          @tokenizer.content_model_flag = :PCDATA
        end

        @phase = @phases[:beforeHtml]
        @phase.insert_html_element
        reset_insertion_mode
      else
        @inner_html = false
        @phase = @phases[:initial]
      end

      # We only seem to have InBodyPhase testcases where the following is
      # relevant ... need others too
      @last_phase = nil

      @tokenizer.each do |token|
        token = normalize_token(token)

        method = 'process%s' % token[:type]

        case token[:type]
        when :Characters, :SpaceCharacters, :Comment
          @phase.send method, token[:data]
        when :StartTag
          @phase.send method, token[:name], token[:data], token[:self_closing]
        when :EndTag
          @phase.send method, token[:name]
        when :Doctype
          @phase.send method, token[:name], token[:publicId],
            token[:systemId], token[:correct]
        else
          # Any other token type is recorded as a parse error.
          parse_error(token[:data], token[:datavars])
        end
      end

      # When the loop finishes it's EOF
      @phase.process_eof
    end

    # Parse a HTML document into a well-formed tree
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    #
    # Returns the tree builder's document.
    def parse(stream, encoding=nil)
      _parse(stream, false, encoding)
      @tree.get_document
    end

    # Parse a HTML fragment into a well-formed tree fragment
    #
    # container - name of the element we're setting the inner_html property
    #             on; defaults to 'div'
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    #
    # Returns the tree builder's fragment.
    def parse_fragment(stream, container='div', encoding=nil)
      _parse(stream, true, encoding, container)
      @tree.get_fragment
    end

    # Records a parse error as [stream position, code, data] in #errors.
    #
    # Raises ParseError (with the error code as its message) when the parser
    # is in strict mode.
    def parse_error(code = 'XXX-undefined-error', data = {})
      # XXX The idea is to make data mandatory.
      @errors.push([@tokenizer.stream.position, code, data])
      raise ParseError, code if @strict
    end

    # HTML5 specific normalizations to the token stream
    #
    # * :EmptyTag tokens become :StartTag tokens (with a parse error when the
    #   element is not in VOID_ELEMENTS).
    # * Start tag names and attribute names are lowercased; the attribute
    #   list is turned into a Hash in which the first occurrence of a
    #   duplicated attribute wins.
    # * End tag names are lowercased; attributes on an end tag are a parse
    #   error.
    #
    # The token is modified in place and returned.
    def normalize_token(token)

      if token[:type] == :EmptyTag
        # When a solidus (/) is encountered within a tag name what happens
        # depends on whether the current tag name matches that of a void
        # element. If it matches a void element atheists did the wrong
        # thing and if it doesn't it's wrong for everyone.

        unless VOID_ELEMENTS.include?(token[:name])
          parse_error("incorrectly-placed-solidus")
        end

        token[:type] = :StartTag
      end

      if token[:type] == :StartTag
        token[:name] = token[:name].downcase

        # We need to remove the duplicate attributes and convert attributes
        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

        unless token[:data].empty?
          # Walk the pairs last-to-first so that earlier occurrences
          # overwrite later ones.
          attributes = {}
          token[:data].reverse.each do |attr, value|
            attributes[attr.downcase] = value
          end
          token[:data] = attributes
        end

      elsif token[:type] == :EndTag
        parse_error("attributes-in-end-tag") unless token[:data].empty?
        token[:name] = token[:name].downcase
      end

      token
    end

    # Element name => phase to switch to in #reset_insertion_mode.
    @@new_modes = {
      'select' => :inSelect,
      'td' => :inCell,
      'th' => :inCell,
      'tr' => :inRow,
      'tbody' => :inTableBody,
      'thead' => :inTableBody,
      'tfoot' => :inTableBody,
      'caption' => :inCaption,
      'colgroup' => :inColumnGroup,
      'table' => :inTable,
      'head' => :inBody,
      'body' => :inBody,
      'frameset' => :inFrameset
    }

    # Picks the current phase from the stack of open elements, walking from
    # the most recently opened element towards the first one and stopping at
    # the first element that determines a phase.
    def reset_insertion_mode
      # The name of this method is mostly historical. (It's also used in the
      # specification.)
      last = false

      @tree.open_elements.reverse.each do |node|
        node_name = node.name

        if node == @tree.open_elements.first
          last = true
          unless ['td', 'th'].include?(node_name)
            # XXX
            # assert @inner_html
            node_name = @inner_html
          end
        end

        # XXX 'select', 'colgroup', 'head' and 'frameset' should only be seen
        # here in the inner_html case:
        # assert @inner_html

        if @@new_modes.has_key?(node_name)
          @phase = @phases[@@new_modes[node_name]]
        elsif node_name == 'html'
          @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
        elsif last
          @phase = @phases[:inBody]
        else
          next
        end

        break
      end
    end

    # Identity function; returns +string+ unchanged.
    def _(string); string; end
  end

end
@@ -0,0 +1,43 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Parser phase that hands doctypes, whitespace and <html> start tags to the
  # inBody phase without leaving this phase, and treats every other start
  # tag, end tag or character token as a parse error before switching the
  # parser back to inBody and re-processing the token there.
  class AfterAfterBodyPhase < Phase

    handle_start 'html'

    # Inserts the comment through the tree builder.
    def processComment(data)
      @tree.insert_comment(data)
    end

    # Delegates to the inBody phase. Accepts any number of arguments and
    # forwards them unchanged, so the call shape used by the caller is the
    # one the inBody phase receives.
    def processDoctype(*args)
      @parser.phases[:inBody].processDoctype(*args)
    end

    # Delegates whitespace handling to the inBody phase.
    def processSpaceCharacters(data)
      @parser.phases[:inBody].processSpaceCharacters(data)
    end

    # Delegates to the inBody phase, forwarding whatever arguments the
    # dispatcher supplies.
    def startTagHtml(*args)
      @parser.phases[:inBody].startTagHtml(*args)
    end

    # Any other start tag: parse error, then reprocess in inBody.
    def startTagOther(name, attributes)
      parse_error("unexpected-start-tag", {'name' => name})
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processStartTag(name, attributes)
    end

    # Any end tag: parse error, then reprocess in inBody.
    def endTagOther(name)
      parse_error("unexpected-end-tag", {'name' => name})
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processEndTag(name)
    end

    # Non-whitespace characters: parse error, then reprocess in inBody.
    def processCharacters(data)
      parse_error "unexpected-char-after-body"
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processCharacters(data)
    end

  end
end
@@ -0,0 +1,32 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Parser phase that delegates doctypes, whitespace and <html> start tags to
  # the inBody phase and <noframes> start tags to the inHead phase, without
  # changing the parser's current phase. Any other start tag is reported as
  # a parse error and dropped.
  class AfterAfterFramesetPhase < Phase

    handle_start 'html', 'noframes'

    # Inserts the comment through the tree builder.
    def processComment(data)
      @tree.insert_comment(data)
    end

    # Delegates to the inBody phase. Accepts any number of arguments and
    # forwards them unchanged, so the call shape used by the caller is the
    # one the inBody phase receives.
    def processDoctype(*args)
      @parser.phases[:inBody].processDoctype(*args)
    end

    # Delegates whitespace handling to the inBody phase.
    def processSpaceCharacters(data)
      @parser.phases[:inBody].processSpaceCharacters(data)
    end

    # Delegates to the inBody phase, forwarding whatever arguments the
    # dispatcher supplies.
    def startTagHtml(*args)
      @parser.phases[:inBody].startTagHtml(*args)
    end

    # Lets the inHead phase process the <noframes> start tag. The previous
    # body referenced an undefined local (`data`) and raised NameError; the
    # tag is now re-dispatched through processStartTag with the arguments
    # this handler actually receives.
    def startTagNoframes(name, attributes)
      @parser.phases[:inHead].processStartTag(name, attributes)
    end

    # Any other start tag is a parse error and is otherwise ignored.
    # NOTE(review): the error code mentions "char" although this handles a
    # start tag -- confirm against the list of error codes before changing.
    def startTagOther(name, attributes)
      parse_error("unexpected-char-after-body")
    end
  end
end
@@ -0,0 +1,46 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Parser phase in which comments are attached to the first open element,
  # </html> moves the parser to the afterAfterBody phase, and any other
  # token is a parse error that is re-processed in the inBody phase.
  class AfterBodyPhase < Phase

    handle_end 'html'

    def processComment(data)
      # This is needed because data is to be appended to the <html> element
      # here and not to whatever is currently open.
      @tree.insert_comment(data, @tree.open_elements.first)
    end

    # Characters: parse error, then reprocess in inBody.
    def processCharacters(data)
      parse_error("unexpected-char-after-body")
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processCharacters(data)
    end

    # Any start tag: parse error, then reprocess in inBody. The self_closing
    # flag is passed along so the inBody phase sees the same token that was
    # handed to this phase.
    def processStartTag(name, attributes, self_closing=false)
      parse_error("unexpected-start-tag-after-body", {"name" => name})
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processStartTag(name, attributes, self_closing)
    end

    # </html>: a parse error when parsing a fragment (inner_html is set);
    # otherwise remember the current phase in last_phase and move on to
    # afterAfterBody.
    def endTagHtml(name)
      if @parser.inner_html
        parse_error "end-html-in-innerhtml"
      else
        # XXX: This may need to be done, not sure
        # Don't set last_phase to the current phase but to the inBody phase
        # instead. No need for extra parse errors if there's something after </html>.
        # Try "<!doctype html>X</html>X" for instance.
        @parser.last_phase = @parser.phase
        @parser.phase = @parser.phases[:afterAfterBody]
      end
    end

    # Any other end tag: parse error, then reprocess in inBody.
    def endTagOther(name)
      parse_error("unexpected-end-tag-after-body", {"name" => name})
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processEndTag(name)
    end

  end
end
@@ -0,0 +1,33 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Parser phase that accepts only <noframes> start tags and the </html> end
  # tag; characters and all other tags are reported as parse errors and
  # dropped.
  class AfterFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    handle_start 'html', 'noframes'

    handle_end 'html'

    # Character data is a parse error in this phase.
    def processCharacters data
      parse_error "unexpected-char-after-frameset"
    end

    # <noframes> is processed by the inBody phase; the current phase is
    # left untouched.
    def startTagNoframes name, attributes
      in_body = @parser.phases[:inBody]
      in_body.processStartTag name, attributes
    end

    # Every other start tag is a parse error.
    def startTagOther name, attributes
      details = { "name" => name }
      parse_error "unexpected-start-tag-after-frameset", details
    end

    # </html>: remember the current phase, then move to afterAfterFrameset.
    def endTagHtml name
      @parser.last_phase = @parser.phase
      @parser.phase = @parser.phases[:afterAfterFrameset]
    end

    # Every other end tag is a parse error.
    def endTagOther name
      details = { "name" => name }
      parse_error "unexpected-end-tag-after-frameset", details
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Parser phase that waits for <body> or <frameset>. Anything that is not
  # handled explicitly causes a <body> element to be inserted (see
  # #anything_else) and the token to be re-processed by the inBody phase.
  class AfterHeadPhase < Phase

    handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
    handle_end %w( body html br ) => 'BodyHtmlBr'

    # End of input: insert the body, then let the new phase finish up.
    def process_eof
      anything_else
      @parser.phase.process_eof
    end

    # Characters: insert the body, then reprocess them in the new phase.
    def processCharacters data
      anything_else
      @parser.phase.processCharacters data
    end

    # <body>: insert the element and continue in the inBody phase.
    def startTagBody name, attributes
      @tree.insert_element name, attributes
      @parser.phase = @parser.phases[:inBody]
    end

    # <frameset>: insert the element and continue in the inFrameset phase.
    def startTagFrameset name, attributes
      @tree.insert_element name, attributes
      @parser.phase = @parser.phases[:inFrameset]
    end

    # Head-only elements seen after the head: parse error, then switch to
    # the inHead phase and reprocess the tag there.
    def startTagFromHead name, attributes
      details = { "name" => name }
      parse_error "unexpected-start-tag-out-of-my-head", details
      @parser.phase = @parser.phases[:inHead]
      @parser.phase.processStartTag name, attributes
    end

    # Any other start tag: insert the body, then reprocess the tag.
    def startTagOther name, attributes
      anything_else
      @parser.phase.processStartTag name, attributes
    end

    # </body>, </html> and </br>: insert the body, then reprocess the tag.
    def endTagBodyHtmlBr name
      anything_else
      @parser.phase.processEndTag name
    end

    # Every other end tag is a parse error.
    def endTagOther name
      details = { "name" => name }
      parse_error "unexpected-end-tag", details
    end

    # Inserts a <body> element with no attributes and switches the parser to
    # the inBody phase.
    def anything_else
      @tree.insert_element 'body', {}
      @parser.phase = @parser.phases[:inBody]
    end

  end
end