spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,36 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
+ module HTML5
5
+ module Filters
6
+ class WhitespaceFilter < Base
7
+
8
+ SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
9
+ SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m
10
+
11
+ def each
12
+ preserve = 0
13
+ __getobj__.each do |token|
14
+ case token[:type]
15
+ when :StartTag
16
+ if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
17
+ preserve += 1
18
+ end
19
+
20
+ when :EndTag
21
+ preserve -= 1 if preserve > 0
22
+
23
+ when :SpaceCharacters
24
+ token[:data] = " " if preserve == 0 && token[:data]
25
+
26
+ when :Characters
27
+ token[:data] = token[:data].sub(SPACES,' ') if preserve == 0
28
+ end
29
+
30
+ yield token
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+
@@ -0,0 +1,247 @@
1
+ require 'html5/constants'
2
+ require 'html5/tokenizer'
3
+ require 'html5/treebuilders/rexml'
4
+
5
+ Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
6
+ require 'html5/html5parser/' + File.basename(path)
7
+ end
8
+
9
+ module HTML5
10
+
11
+ # Error in parsed document
12
+ class ParseError < Exception; end
13
+ class AssertionError < Exception; end
14
+
15
+ # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
16
+ #
17
+ class HTMLParser
18
+
19
+ attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table, :secondary_phase
20
+
21
+ attr_reader :phases, :tokenizer, :tree, :errors
22
+
23
+ def self.parse(stream, options = {})
24
+ encoding = options.delete(:encoding)
25
+ new(options).parse(stream,encoding)
26
+ end
27
+
28
+ def self.parse_fragment(stream, options = {})
29
+ container = options.delete(:container) || 'div'
30
+ encoding = options.delete(:encoding)
31
+ new(options).parse_fragment(stream, container, encoding)
32
+ end
33
+
34
+ @@phases = %w( initial beforeHtml beforeHead inHead afterHead inBody inTable inCaption
35
+ inColumnGroup inTableBody inRow inCell inSelect inSelectInTable afterBody inFrameset
36
+ afterFrameset afterAfterBody afterAfterFrameset inForeignContent)
37
+
38
+ # :strict - raise an exception when a parse error is encountered
39
+ # :tree - a treebuilder class controlling the type of tree that will be
40
+ # returned. Built in treebuilders can be accessed through
41
+ # HTML5::TreeBuilders[treeType]
42
+ def initialize(options = {})
43
+ @strict = false
44
+ @errors = []
45
+
46
+ @tokenizer = HTMLTokenizer
47
+ @tree = TreeBuilders::REXML::TreeBuilder
48
+
49
+ options.each {|name, value| instance_variable_set("@#{name}", value) }
50
+ @lowercase_attr_name = nil unless instance_variable_defined?("@lowercase_attr_name")
51
+ @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")
52
+
53
+ @tree = @tree.new
54
+
55
+ @phases = @@phases.inject({}) do |phases, phase_name|
56
+ phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
57
+ phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
58
+ phases
59
+ end
60
+ end
61
+
62
+ def _parse(stream, inner_html, encoding, container = 'div')
63
+ @tree.reset
64
+ @first_start_tag = false
65
+ @errors = []
66
+
67
+ @tokenizer = @tokenizer.class unless Class === @tokenizer
68
+ @tokenizer = @tokenizer.new(stream, :encoding => encoding,
69
+ :parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)
70
+
71
+ if inner_html
72
+ case @inner_html = container.downcase
73
+ when 'title', 'textarea'
74
+ @tokenizer.content_model_flag = :RCDATA
75
+ when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
76
+ @tokenizer.content_model_flag = :CDATA
77
+ when 'plaintext'
78
+ @tokenizer.content_model_flag = :PLAINTEXT
79
+ else
80
+ # content_model_flag already is PCDATA
81
+ @tokenizer.content_model_flag = :PCDATA
82
+ end
83
+
84
+ @phase = @phases[:beforeHtml]
85
+ @phase.insert_html_element
86
+ reset_insertion_mode
87
+ else
88
+ @inner_html = false
89
+ @phase = @phases[:initial]
90
+ end
91
+
92
+ # We only seem to have InBodyPhase testcases where the following is
93
+ # relevant ... need others too
94
+ @last_phase = nil
95
+
96
+ @tokenizer.each do |token|
97
+ token = normalize_token(token)
98
+
99
+ method = 'process%s' % token[:type]
100
+
101
+ case token[:type]
102
+ when :Characters, :SpaceCharacters, :Comment
103
+ @phase.send method, token[:data]
104
+ when :StartTag
105
+ @phase.send method, token[:name], token[:data], token[:self_closing]
106
+ when :EndTag
107
+ @phase.send method, token[:name]
108
+ when :Doctype
109
+ @phase.send method, token[:name], token[:publicId],
110
+ token[:systemId], token[:correct]
111
+ else
112
+ parse_error(token[:data], token[:datavars])
113
+ end
114
+ end
115
+
116
+ # When the loop finishes it's EOF
117
+ @phase.process_eof
118
+ end
119
+
120
+ # Parse a HTML document into a well-formed tree
121
+ #
122
+ # stream - a filelike object or string containing the HTML to be parsed
123
+ #
124
+ # The optional encoding parameter must be a string that indicates
125
+ # the encoding. If specified, that encoding will be used,
126
+ # regardless of any BOM or later declaration (such as in a meta
127
+ # element)
128
+ def parse(stream, encoding=nil)
129
+ _parse(stream, false, encoding)
130
+ @tree.get_document
131
+ end
132
+
133
+ # Parse a HTML fragment into a well-formed tree fragment
134
+
135
+ # container - name of the element we're setting the inner_html property
136
+ # if set to nil, default to 'div'
137
+ #
138
+ # stream - a filelike object or string containing the HTML to be parsed
139
+ #
140
+ # The optional encoding parameter must be a string that indicates
141
+ # the encoding. If specified, that encoding will be used,
142
+ # regardless of any BOM or later declaration (such as in a meta
143
+ # element)
144
+ def parse_fragment(stream, container='div', encoding=nil)
145
+ _parse(stream, true, encoding, container)
146
+ @tree.get_fragment
147
+ end
148
+
149
+ def parse_error(code = 'XXX-undefined-error', data = {})
150
+ # XXX The idea is to make data mandatory.
151
+ @errors.push([@tokenizer.stream.position, code, data])
152
+ raise ParseError if @strict
153
+ end
154
+
155
+ # HTML5 specific normalizations to the token stream
156
+ def normalize_token(token)
157
+
158
+ if token[:type] == :EmptyTag
159
+ # When a solidus (/) is encountered within a tag name what happens
160
+ # depends on whether the current tag name matches that of a void
161
+ # element. If it matches a void element atheists did the wrong
162
+ # thing and if it doesn't it's wrong for everyone.
163
+
164
+ unless VOID_ELEMENTS.include?(token[:name])
165
+ parse_error("incorrectly-placed-solidus")
166
+ end
167
+
168
+ token[:type] = :StartTag
169
+ end
170
+
171
+ if token[:type] == :StartTag
172
+ token[:name] = token[:name].downcase
173
+
174
+ # We need to remove the duplicate attributes and convert attributes
175
+ # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
176
+
177
+ unless token[:data].empty?
178
+ data = token[:data].reverse.map {|attr, value| [attr.downcase, value] }
179
+ token[:data] = Hash[*data.flatten]
180
+ end
181
+
182
+ elsif token[:type] == :EndTag
183
+ parse_error("attributes-in-end-tag") unless token[:data].empty?
184
+ token[:name] = token[:name].downcase
185
+ end
186
+
187
+ token
188
+ end
189
+
190
+ @@new_modes = {
191
+ 'select' => :inSelect,
192
+ 'td' => :inCell,
193
+ 'th' => :inCell,
194
+ 'tr' => :inRow,
195
+ 'tbody' => :inTableBody,
196
+ 'thead' => :inTableBody,
197
+ 'tfoot' => :inTableBody,
198
+ 'caption' => :inCaption,
199
+ 'colgroup' => :inColumnGroup,
200
+ 'table' => :inTable,
201
+ 'head' => :inBody,
202
+ 'body' => :inBody,
203
+ 'frameset' => :inFrameset
204
+ }
205
+
206
+ def reset_insertion_mode
207
+ # The name of this method is mostly historical. (It's also used in the
208
+ # specification.)
209
+ last = false
210
+
211
+ @tree.open_elements.reverse.each do |node|
212
+ node_name = node.name
213
+
214
+ if node == @tree.open_elements.first
215
+ last = true
216
+ unless ['td', 'th'].include?(node_name)
217
+ # XXX
218
+ # assert @inner_html
219
+ node_name = @inner_html
220
+ end
221
+ end
222
+
223
+ # Check for conditions that should only happen in the inner_html
224
+ # case
225
+ if ['select', 'colgroup', 'head', 'frameset'].include?(node_name)
226
+ # XXX
227
+ # assert @inner_html
228
+ end
229
+
230
+ if @@new_modes.has_key?(node_name)
231
+ @phase = @phases[@@new_modes[node_name]]
232
+ elsif node_name == 'html'
233
+ @phase = @phases[@tree.head_pointer.nil?? :beforeHead : :afterHead]
234
+ elsif last
235
+ @phase = @phases[:inBody]
236
+ else
237
+ next
238
+ end
239
+
240
+ break
241
+ end
242
+ end
243
+
244
+ def _(string); string; end
245
+ end
246
+
247
+ end
@@ -0,0 +1,43 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
+ class AfterAfterBodyPhase < Phase
5
+
6
+ handle_start 'html'
7
+
8
+ def processComment(data)
9
+ @tree.insert_comment(data)
10
+ end
11
+
12
+ def processDoctype data
13
+ @parser.phases[:inBody].processDoctype(data)
14
+ end
15
+
16
+ def processSpaceCharacters data
17
+ @parser.phases[:inBody].processSpaceCharacters(data)
18
+ end
19
+
20
+ def startTagHtml data
21
+ @parser.phases[:inBody].startTagHtml(data)
22
+ end
23
+
24
+ def startTagOther name, attributes
25
+ parse_error("unexpected-start-tag", {'name' => name})
26
+ @parser.phase = @parser.phases[:inBody]
27
+ @parser.phase.processStartTag(name, attributes)
28
+ end
29
+
30
+ def endTagOther name
31
+ parse_error("unexpected-end-tag", {'name' => name})
32
+ @parser.phase = @parser.phases[:inBody]
33
+ @parser.phase.processEndTag(name)
34
+ end
35
+
36
+ def processCharacters data
37
+ parse_error "unexpected-char-after-body"
38
+ @parser.phase = @parser.phases[:inBody]
39
+ @parser.phase.processCharacters(data)
40
+ end
41
+
42
+ end
43
+ end
@@ -0,0 +1,32 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
+ class AfterAfterFramesetPhase < Phase
5
+
6
+ handle_start 'html', 'noframes'
7
+
8
+ def processComment(data)
9
+ @tree.insert_comment(data)
10
+ end
11
+
12
+ def processDoctype data
13
+ @parser.phases[:inBody].processDoctype(data)
14
+ end
15
+
16
+ def processSpaceCharacters data
17
+ @parser.phases[:inBody].processSpaceCharacters(data)
18
+ end
19
+
20
+ def startTagHtml data
21
+ @parser.phases[:inBody].startTagHtml(data)
22
+ end
23
+
24
+ def startTagNoframes name, attributes
25
+ @parser.phases[:inHead].startTagNoframes(data)
26
+ end
27
+
28
+ def startTagOther name, attributes
29
+ parse_error("unexpected-char-after-body")
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,46 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
+ class AfterBodyPhase < Phase
5
+
6
+ handle_end 'html'
7
+
8
+ def processComment(data)
9
+ # This is needed because data is to be appended to the <html> element
10
+ # here and not to whatever is currently open.
11
+ @tree.insert_comment(data, @tree.open_elements.first)
12
+ end
13
+
14
+ def processCharacters(data)
15
+ parse_error("unexpected-char-after-body")
16
+ @parser.phase = @parser.phases[:inBody]
17
+ @parser.phase.processCharacters(data)
18
+ end
19
+
20
+ def processStartTag(name, attributes, self_closing=false)
21
+ parse_error("unexpected-start-tag-after-body", {"name" => name})
22
+ @parser.phase = @parser.phases[:inBody]
23
+ @parser.phase.processStartTag(name, attributes)
24
+ end
25
+
26
+ def endTagHtml(name)
27
+ if @parser.inner_html
28
+ parse_error "end-html-in-innerhtml"
29
+ else
30
+ # XXX: This may need to be done, not sure
31
+ # Don't set last_phase to the current phase but to the inBody phase
32
+ # instead. No need for extra parse errors if there's something after </html>.
33
+ # Try "<!doctype html>X</html>X" for instance.
34
+ @parser.last_phase = @parser.phase
35
+ @parser.phase = @parser.phases[:afterAfterBody]
36
+ end
37
+ end
38
+
39
+ def endTagOther(name)
40
+ parse_error("unexpected-end-tag-after-body", {"name" => name})
41
+ @parser.phase = @parser.phases[:inBody]
42
+ @parser.phase.processEndTag(name)
43
+ end
44
+
45
+ end
46
+ end
@@ -0,0 +1,33 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
+ class AfterFramesetPhase < Phase
5
+
6
+ # http://www.whatwg.org/specs/web-apps/current-work/#after3
7
+
8
+ handle_start 'html', 'noframes'
9
+
10
+ handle_end 'html'
11
+
12
+ def processCharacters(data)
13
+ parse_error("unexpected-char-after-frameset")
14
+ end
15
+
16
+ def startTagNoframes(name, attributes)
17
+ @parser.phases[:inBody].processStartTag(name, attributes)
18
+ end
19
+
20
+ def startTagOther(name, attributes)
21
+ parse_error("unexpected-start-tag-after-frameset", {"name" => name})
22
+ end
23
+
24
+ def endTagHtml(name)
25
+ @parser.last_phase = @parser.phase
26
+ @parser.phase = @parser.phases[:afterAfterFrameset]
27
+ end
28
+
29
+ def endTagOther(name)
30
+ parse_error("unexpected-end-tag-after-frameset", {"name" => name})
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,55 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
+ class AfterHeadPhase < Phase
5
+
6
+ handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
7
+ handle_end %w( body html br ) => 'BodyHtmlBr'
8
+
9
+ def process_eof
10
+ anything_else
11
+ @parser.phase.process_eof
12
+ end
13
+
14
+ def processCharacters(data)
15
+ anything_else
16
+ @parser.phase.processCharacters(data)
17
+ end
18
+
19
+ def startTagBody(name, attributes)
20
+ @tree.insert_element(name, attributes)
21
+ @parser.phase = @parser.phases[:inBody]
22
+ end
23
+
24
+ def startTagFrameset(name, attributes)
25
+ @tree.insert_element(name, attributes)
26
+ @parser.phase = @parser.phases[:inFrameset]
27
+ end
28
+
29
+ def startTagFromHead(name, attributes)
30
+ parse_error("unexpected-start-tag-out-of-my-head", {"name" => name})
31
+ @parser.phase = @parser.phases[:inHead]
32
+ @parser.phase.processStartTag(name, attributes)
33
+ end
34
+
35
+ def startTagOther(name, attributes)
36
+ anything_else
37
+ @parser.phase.processStartTag(name, attributes)
38
+ end
39
+
40
+ def endTagBodyHtmlBr(name)
41
+ anything_else
42
+ @parser.phase.processEndTag(name)
43
+ end
44
+
45
+ def endTagOther(name)
46
+ parse_error("unexpected-end-tag", {"name" => name})
47
+ end
48
+
49
+ def anything_else
50
+ @tree.insert_element('body', {})
51
+ @parser.phase = @parser.phases[:inBody]
52
+ end
53
+
54
+ end
55
+ end