spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,96 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Tree-construction phase used while the parser is inside a table row.
#
# Cell start tags open a new cell and switch the parser to the :inCell
# phase. Tags that belong to the surrounding table structure first close
# the current row and are then handed to whichever phase is current
# afterwards. Character data, EOF and all remaining tags are delegated to
# the :inTable phase.
class InRowPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-row

  handle_start 'html', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead tr ) => 'TableOther'

  handle_end 'tr', 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th ) => 'Ignore'

  # Character data is handled exactly as in the :inTable phase.
  def processCharacters(data)
    @parser.phases[:inTable].processCharacters(data)
  end

  # Whitespace is handled exactly as in the :inTable phase.
  def processSpaceCharacters(data)
    @parser.phases[:inTable].processSpaceCharacters(data)
  end

  # <td>/<th>: pop back to the row, insert the cell, switch to :inCell and
  # push a Marker onto the list of active formatting elements.
  def startTagTableCell(name, attributes)
    clearStackToTableRowContext
    @tree.insert_element(name, attributes)
    @parser.phase = @parser.phases[:inCell]
    @tree.activeFormattingElements.push(Marker)
  end

  # Start tags of table-level elements implicitly close the current row.
  # The scope check is taken *before* endTagTr runs, because endTagTr pops
  # the row and would change the answer.
  def startTagTableOther(name, attributes)
    ignoreEndTag = ignoreEndTagTr
    endTagTr('tr')
    # XXX how are we sure it's always ignored in the inner_html case?
    @parser.phase.processStartTag(name, attributes) unless ignoreEndTag
  end

  # Any other start tag is processed by the :inTable phase.
  def startTagOther(name, attributes)
    @parser.phases[:inTable].processStartTag(name, attributes)
  end

  # </tr>: close the row and return to :inTableBody, or report a parse
  # error when no <tr> is in table scope.
  def endTagTr(name)
    if ignoreEndTagTr
      # inner_html case
      assert @parser.inner_html
      parse_error "unexpected-end-tag", {:name => name}
    else
      clearStackToTableRowContext
      @tree.open_elements.pop
      @parser.phase = @parser.phases[:inTableBody]
    end
  end

  # </table>: act as if </tr> had been seen, then reprocess the end tag in
  # the new current phase unless the implied </tr> was ignored.
  def endTagTable(name)
    ignoreEndTag = ignoreEndTagTr
    endTagTr('tr')
    # Reprocess the current tag if the tr end tag was not ignored
    # XXX how are we sure it's always ignored in the inner_html case?
    @parser.phase.processEndTag(name) unless ignoreEndTag
  end

  # </tbody>, </tfoot>, </thead>: if the named element is in table scope,
  # close the row and reprocess the end tag; otherwise report an error.
  def endTagTableRowGroup(name)
    if in_scope?(name, true)
      endTagTr('tr')
      @parser.phase.processEndTag(name)
    else
      # inner_html case
      parse_error "unexpected-end-tag", {:name => name}
    end
  end

  # End tags that make no sense inside a row are reported and dropped.
  def endTagIgnore(name)
    parse_error("unexpected-end-tag-in-table-row",
        {"name" => name})
  end

  # Any other end tag is processed by the :inTable phase.
  def endTagOther(name)
    @parser.phases[:inTable].processEndTag(name)
  end

  # EOF is handled by the :inTable phase.
  def process_eof
    @parser.phases[:inTable].process_eof
  end

  protected

  # XXX unify this with other table helper methods
  #
  # Pops open elements, reporting a parse error for each one, until the
  # current node is a <tr> or the <html> element.
  def clearStackToTableRowContext
    until %w[tr html].include?(@tree.open_elements.last.name)
      parse_error("unexpected-implied-end-tag-in-table-row", {"name" => @tree.open_elements.last.name})
      @tree.open_elements.pop
    end
  end

  # True when an end tag for <tr> must be ignored, i.e. no <tr> is in
  # table scope. The second argument is passed positionally, matching
  # every other in_scope? call in the table phases.
  def ignoreEndTagTr
    !in_scope?('tr', true)
  end

end
96
+ end
@@ -0,0 +1,90 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Tree-construction phase used while the parser is inside a <select>.
#
# Only <option>, <optgroup> and text are inserted. A nested <select> or an
# <input> implicitly closes the open <select>; every other tag is reported
# as a parse error and dropped.
class InSelectPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-select

  handle_start 'html', 'option', 'optgroup', 'select'
  handle_start 'input'
  handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

  # Text is inserted at the current node.
  def processCharacters(data)
    @tree.insertText(data)
  end

  def startTagOption(name, attributes)
    # We need to imply </option> if <option> is the current node.
    @tree.open_elements.pop if @tree.open_elements.last.name == 'option'
    @tree.insert_element(name, attributes)
  end

  # <optgroup> implies </option> and then </optgroup> when those are the
  # current node, in that order, before the new element is inserted.
  def startTagOptgroup(name, attributes)
    @tree.open_elements.pop if @tree.open_elements.last.name == 'option'
    @tree.open_elements.pop if @tree.open_elements.last.name == 'optgroup'
    @tree.insert_element(name, attributes)
  end

  # A nested <select> is an error and is treated as </select>.
  def startTagSelect(name, attributes)
    parse_error("unexpected-select-in-select")
    endTagSelect('select')
  end

  # <input> is an error here: close the <select> and reprocess the tag in
  # whichever phase is current afterwards.
  def startTagInput(name, attributes)
    @parser.parse_error("unexpected-input-in-select")
    endTagSelect("select")
    @parser.phase.processStartTag(name, attributes)
  end

  # Any other start tag is reported and ignored.
  def startTagOther(name, attributes)
    parse_error("unexpected-start-tag-in-select", {"name" => name})
  end

  # </option> only closes an <option> that is the current node.
  def endTagOption(name)
    if @tree.open_elements.last.name == 'option'
      @tree.open_elements.pop
    else
      parse_error("unexpected-end-tag-in-select", {"name" => "option"})
    end
  end

  def endTagOptgroup(name)
    # </optgroup> implicitly closes <option>
    if @tree.open_elements.last.name == 'option' && @tree.open_elements[-2].name == 'optgroup'
      @tree.open_elements.pop
    end
    # It also closes </optgroup>
    if @tree.open_elements.last.name == 'optgroup'
      @tree.open_elements.pop
    # But nothing else
    else
      parse_error("unexpected-end-tag-in-select",
        {"name" => "optgroup"})
    end
  end

  # </select>: pop everything up to and including the <select> and let the
  # parser pick the next phase. When no <select> is in table scope the tag
  # is reported with the same error code and data as the other unexpected
  # end tags in this phase, rather than as an error without any code.
  def endTagSelect(name)
    if in_scope?('select', true)
      remove_open_elements_until('select')

      @parser.reset_insertion_mode
    else
      # inner_html case
      parse_error("unexpected-end-tag-in-select", {"name" => name})
    end
  end

  # End tags of table elements are always an error. If the named element
  # is in table scope, the <select> is closed and the end tag is
  # reprocessed in the new current phase.
  def endTagTableElements(name)
    parse_error("unexpected-end-tag-in-select", {"name" => name})

    if in_scope?(name, true)
      endTagSelect('select')
      @parser.phase.processEndTag(name)
    end
  end

  # Any other end tag is reported and ignored.
  def endTagOther(name)
    parse_error("unexpected-end-tag-in-select", {"name" => name})
  end
end
90
+ end
@@ -0,0 +1,35 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Variant of the select phase used when the <select> lives inside a table.
#
# Table-structure tags get special treatment: they are reported as parse
# errors and (for end tags, only when the element is in table scope) close
# the <select> before being reprocessed. Everything else is forwarded
# unchanged to the :inSelect phase.
class InSelectInTablePhase < Phase

  handle_start %w(caption table tbody tfoot thead tr td th) => 'Table'
  handle_end %w(caption table tbody tfoot thead tr td th) => 'Table'

  # Text is handled by the :inSelect phase.
  def processCharacters(data)
    in_select = @parser.phases[:inSelect]
    in_select.processCharacters(data)
  end

  # A table-structure start tag: report it, close the <select> through the
  # regular end-tag path, then hand the tag to the now-current phase.
  def startTagTable(name, attributes)
    error_data = {:name => name}
    @parser.parse_error("unexpected-table-element-start-tag-in-select-in-table", error_data)
    endTagOther("select")
    @parser.phase.processStartTag(name, attributes)
  end

  # Every other start tag goes to the :inSelect phase.
  def startTagOther(name, attributes)
    in_select = @parser.phases[:inSelect]
    in_select.processStartTag(name, attributes)
  end

  # A table-structure end tag: always reported; only acted upon when the
  # named element is in table scope.
  def endTagTable(name)
    error_data = {:name => name}
    @parser.parse_error("unexpected-table-element-end-tag-in-select-in-table", error_data)
    return unless @tree.elementInScope(name, true)

    endTagOther("select")
    @parser.phase.processEndTag(name)
  end

  # Every other end tag goes to the :inSelect phase.
  def endTagOther(name)
    in_select = @parser.phases[:inSelect]
    in_select.processEndTag(name)
  end
end
35
+ end
@@ -0,0 +1,92 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Tree-construction phase used while the parser is inside a <tbody>,
# <thead> or <tfoot>.
#
# <tr> opens a row and switches to the :inRow phase; a bare cell implies a
# <tr> first. Tags belonging to the surrounding table close the current
# row group and are then reprocessed. Character data, EOF and all other
# tags are delegated to the :inTable phase.
class InTableBodyPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

  handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'

  handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ignore'

  # Character data is handled exactly as in the :inTable phase.
  def processCharacters(data)
    @parser.phases[:inTable].processCharacters(data)
  end

  # Whitespace is handled exactly as in the :inTable phase.
  def processSpaceCharacters(data)
    @parser.phases[:inTable].processSpaceCharacters(data)
  end

  # EOF is handled by the :inTable phase.
  def process_eof
    @parser.phases[:inTable].process_eof
  end

  # <tr>: pop back to the row group, insert the row, switch to :inRow.
  def startTagTr(name, attributes)
    clearStackToTableBodyContext
    @tree.insert_element(name, attributes)
    @parser.phase = @parser.phases[:inRow]
  end

  # <td>/<th> directly inside a row group is an error; an attribute-less
  # <tr> is implied and the cell tag is reprocessed in the new phase.
  def startTagTableCell(name, attributes)
    parse_error("unexpected-cell-in-table-body", {"name" => name})
    startTagTr('tr', {})
    @parser.phase.processStartTag(name, attributes)
  end

  # Start tags of table-level elements close the current row group and
  # are reprocessed in the new phase.
  def startTagTableOther(name, attributes)
    if row_group_in_table_scope?
      clearStackToTableBodyContext
      endTagTableRowGroup(@tree.open_elements.last.name)
      @parser.phase.processStartTag(name, attributes)
    else
      # inner_html case
      parse_error "unexpected-start-tag", {:name => name}
    end
  end

  # Any other start tag is processed by the :inTable phase.
  def startTagOther(name, attributes)
    @parser.phases[:inTable].processStartTag(name, attributes)
  end

  # </tbody>, </tfoot>, </thead>: close the row group and return to
  # :inTable if the named element is in table scope, else report an error.
  def endTagTableRowGroup(name)
    if in_scope?(name, true)
      clearStackToTableBodyContext
      @tree.open_elements.pop
      @parser.phase = @parser.phases[:inTable]
    else
      parse_error("unexpected-end-tag-in-table-body", {"name" => name})
    end
  end

  # </table>: close the current row group, then reprocess the end tag in
  # the new phase.
  def endTagTable(name)
    if row_group_in_table_scope?
      clearStackToTableBodyContext
      endTagTableRowGroup(@tree.open_elements.last.name)
      @parser.phase.processEndTag(name)
    else
      # inner_html case
      parse_error "unexpected-end-tag", {:name => name}
    end
  end

  # End tags that make no sense inside a row group are reported and dropped.
  def endTagIgnore(name)
    parse_error("unexpected-end-tag-in-table-body", {"name" => name})
  end

  # Any other end tag is processed by the :inTable phase.
  def endTagOther(name)
    @parser.phases[:inTable].processEndTag(name)
  end

  protected

  # Pops open elements, reporting a parse error for each one, until the
  # current node is a row group (<tbody>, <tfoot>, <thead>) or <html>.
  def clearStackToTableBodyContext
    until %w[tbody tfoot thead html].include?(@tree.open_elements.last.name)
      parse_error("unexpected-implied-end-tag-in-table",
        {"name" => @tree.open_elements.last.name})
      @tree.open_elements.pop
    end
  end

  # True when a <tbody>, <thead> or <tfoot> is in table scope. Shared by
  # startTagTableOther and endTagTable, which both need to close the
  # current row group before reprocessing their tag. Checked in the same
  # order as before and stops at the first hit.
  def row_group_in_table_scope?
    %w[tbody thead tfoot].any? { |tag| in_scope?(tag, true) }
  end

end
92
+ end
@@ -0,0 +1,177 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
# Tree-construction phase used while the parser is directly inside a
# <table>.
#
# Table-structure start tags open the matching element and switch to the
# corresponding phase. Content that does not belong in a table is reported
# and processed by the :inBody phase with @tree.insert_from_table set for
# the duration of that call. The first time this happens the current
# table is flagged "tainted", which changes how later whitespace, <style>,
# <script> and hidden <input> are treated.
class InTablePhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-table

  handle_start 'html', 'caption', 'colgroup', 'col', 'table'

  handle_start %w( tbody tfoot thead ) => 'RowGroup', %w( td th tr ) => 'ImplyTbody'

  handle_start %w(style script)

  handle_start 'input'

  handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

  # Whitespace is inserted as-is while the table is untainted; once the
  # table is tainted it is treated like any other character data.
  def processSpaceCharacters(data)
    if table_tainted?
      processCharacters(data)
    else
      @tree.insertText(data)
    end
  end

  # Text inside an open <style>/<script> is inserted directly. Any other
  # text is an error (reported once per table, when the table first
  # becomes tainted) and is processed by the :inBody phase.
  def processCharacters(data)
    if ["style", "script"].include?(@tree.open_elements.last.name)
      @tree.insertText(data)
    else
      unless table_tainted?
        @parser.parse_error("unexpected-char-implies-table-voodoo")
        current_table.flags << "tainted"
      end
      # Do the table magic!
      @tree.insert_from_table = true
      @parser.phases[:inBody].processCharacters(data)
      @tree.insert_from_table = false
    end
  end

  # EOF with anything but <html> as the current node is a parse error;
  # otherwise this must be the inner_html (fragment) case.
  def process_eof
    if @tree.open_elements.last.name != "html"
      @parser.parse_error("eof-in-table")
    else
      # The parser attribute is read as inner_html everywhere else in the
      # table phases; innerHTML was a misspelling of it.
      assert @parser.inner_html
    end
  end

  # <caption>: pop back to the table, push a Marker onto the active
  # formatting elements, insert the element and switch to :inCaption.
  def startTagCaption(name, attributes)
    clear_stack_to_table_context
    @tree.activeFormattingElements.push(Marker)
    @tree.insert_element(name, attributes)
    @parser.phase = @parser.phases[:inCaption]
  end

  # <colgroup>: pop back to the table, insert the element and switch to
  # :inColumnGroup.
  def startTagColgroup(name, attributes)
    clear_stack_to_table_context
    @tree.insert_element(name, attributes)
    @parser.phase = @parser.phases[:inColumnGroup]
  end

  # <col> implies an attribute-less <colgroup>, then is reprocessed.
  def startTagCol(name, attributes)
    startTagColgroup('colgroup', {})
    @parser.phase.processStartTag(name, attributes)
  end

  # <tbody>/<tfoot>/<thead>: pop back to the table, insert the element and
  # switch to :inTableBody.
  def startTagRowGroup(name, attributes)
    clear_stack_to_table_context
    @tree.insert_element(name, attributes)
    @parser.phase = @parser.phases[:inTableBody]
  end

  # <td>/<th>/<tr> imply an attribute-less <tbody>, then are reprocessed.
  def startTagImplyTbody(name, attributes)
    startTagRowGroup('tbody', {})
    @parser.phase.processStartTag(name, attributes)
  end

  # A nested <table> start tag is an error that implies </table>; the
  # start tag is then reprocessed unless parsing a fragment.
  def startTagTable(name, attributes)
    parse_error("unexpected-start-tag-implies-end-tag",
          {"startName" => "table", "endName" => "table"})
    @parser.phase.processEndTag('table')
    @parser.phase.processStartTag(name, attributes) unless @parser.inner_html
  end

  # Any other start tag is an error: taint the table and process the tag
  # in the :inBody phase with insert_from_table enabled.
  def startTagOther(name, attributes)
    @parser.parse_error("unexpected-start-tag-implies-table-voodoo", {:name => name})
    taint_current_table
    @tree.insert_from_table = true
    # Process the start tag in the "in body" mode
    @parser.phases[:inBody].processStartTag(name, attributes)
    @tree.insert_from_table = false
  end

  # <style>/<script> are processed by the :inHead phase while the table is
  # untainted, and like any other misplaced start tag afterwards.
  def startTagStyleScript(name, attributes)
    if table_tainted?
      startTagOther(name, attributes)
    else
      @parser.phases[:inHead].processStartTag(name, attributes)
    end
  end

  # <input type=hidden> (type compared case-insensitively) in an untainted
  # table is reported, inserted and immediately popped. Every other
  # <input> is treated as a misplaced start tag.
  def startTagInput(name, attributes)
    if attributes.include?("type") &&
        attributes["type"].downcase == "hidden" &&
        !table_tainted?
      # Error code spelling corrected from "unpexted-...".
      @parser.parse_error("unexpected-hidden-input-in-table")
      @tree.insert_element(name, attributes)
      # XXX associate with form
      @tree.open_elements.pop
    else
      self.startTagOther(name, attributes)
    end
  end

  # </table>: generate implied end tags, complain if the current node is
  # then not the table, pop up to and including the table and let the
  # parser choose the next phase. Without a <table> in table scope the tag
  # is reported and ignored.
  def endTagTable(name)
    if in_scope?('table', true)
      @tree.generateImpliedEndTags

      unless @tree.open_elements.last.name == 'table'
        parse_error("end-tag-too-early-named",
                  {"gotName" => "table",
                   "expectedName" => @tree.open_elements.last.name})
      end

      remove_open_elements_until('table')

      @parser.reset_insertion_mode
    else
      # inner_html case
      assert @parser.inner_html
      parse_error "unexpected-end-tag", {:name => name}
    end
  end

  # End tags that make no sense directly inside a table are reported and
  # dropped.
  def endTagIgnore(name)
    parse_error("unexpected-end-tag", {"name" => name})
  end

  # Any other end tag is an error and is processed by the :inBody phase.
  def endTagOther(name)
    parse_error("unexpected-end-tag-implies-table-voodoo", {"name" => name})
    # Make all the special element rearranging voodoo kick in
    @tree.insert_from_table = true
    # Process the end tag in the "in body" mode
    @parser.phases[:inBody].processEndTag(name)
    @tree.insert_from_table = false
  end

  # End-tag counterpart of startTagStyleScript.
  # NOTE(review): no handle_end declaration in this class maps 'style' or
  # 'script' to this method, and its name lacks the usual "endTag" prefix,
  # so it looks unreachable through tag dispatch - confirm before relying
  # on or removing it.
  def endStyleScript(name)
    if table_tainted?
      endTagOther(name)
    else
      @parser.phases[:inHead].processEndTag(name)
    end
  end

  protected

  def clear_stack_to_table_context
    # "clear the stack back to a table context"
    until %w[table html].include?(@tree.open_elements.last.name)
      parse_error("unexpected-implied-end-tag-in-table",
              {"name" => @tree.open_elements.last.name})
      @tree.open_elements.pop
    end
    # When the current node is <html> it's an inner_html case
  end

  # Returns the innermost open <table> element. When no table is open
  # (possible when parsing a fragment) the bottom-most open element is
  # returned instead of running off the end of the stack and calling
  # #name on nil.
  def current_table
    stack = @tree.open_elements
    stack.reverse.find { |element| element.name == "table" } || stack.first
  end

  # True once non-table content has been seen inside the current table.
  def table_tainted?
    current_table.flags.include?("tainted")
  end

  # Flags the current table as tainted, at most once.
  def taint_current_table
    table = current_table
    table.flags.push("tainted") unless table.flags.include?("tainted")
  end

end
177
+ end