spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,96 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Token handlers for the "in row" insertion mode.
  #
  # Character tokens, EOF and any tag that is not listed in the handler tables
  # below are delegated to the :inTable phase.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-row
  class InRowPhase < Phase

    handle_start 'html', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead tr ) => 'TableOther'

    handle_end 'tr', 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th ) => 'Ignore'

    # Character data is handled exactly as in the :inTable phase.
    def processCharacters(data)
      in_table_phase.processCharacters(data)
    end

    # Whitespace is handled exactly as in the :inTable phase.
    def processSpaceCharacters(data)
      in_table_phase.processSpaceCharacters(data)
    end

    # <td>/<th>: clear the stack back to the row, insert the cell, switch to
    # the :inCell phase and push a Marker onto the active formatting elements.
    def startTagTableCell(name, attributes)
      clearStackToTableRowContext
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[:inCell]
      @tree.activeFormattingElements.push(Marker)
    end

    # Table-structure start tags act as an implied </tr>; the start tag is then
    # reprocessed by whichever phase is current, unless that </tr> was ignored.
    def startTagTableOther(name, attributes)
      tr_ignored = ignoreEndTagTr
      endTagTr('tr')
      # XXX how are we sure it's always ignored in the inner_html case?
      return if tr_ignored
      @parser.phase.processStartTag(name, attributes)
    end

    # Every other start tag is handled by the :inTable phase.
    def startTagOther(name, attributes)
      in_table_phase.processStartTag(name, attributes)
    end

    # </tr>: close the row and go back to the :inTableBody phase. When no <tr>
    # is in table scope the tag is reported as a parse error instead.
    def endTagTr(name)
      if ignoreEndTagTr
        # inner_html case
        assert @parser.inner_html
        return parse_error("unexpected-end-tag", {:name => name})
      end

      clearStackToTableRowContext
      @tree.open_elements.pop
      @parser.phase = @parser.phases[:inTableBody]
    end

    # </table>: act as an implied </tr>, then reprocess the end tag in the
    # current phase unless the </tr> was ignored.
    def endTagTable(name)
      tr_ignored = ignoreEndTagTr
      endTagTr('tr')
      # XXX how are we sure it's always ignored in the inner_html case?
      return if tr_ignored
      @parser.phase.processEndTag(name)
    end

    # </tbody>, </tfoot>, </thead>: when the named element is in scope, close
    # the row and reprocess the end tag; otherwise report a parse error.
    def endTagTableRowGroup(name)
      unless in_scope?(name, true)
        # inner_html case
        return parse_error("unexpected-end-tag", {:name => name})
      end

      endTagTr('tr')
      @parser.phase.processEndTag(name)
    end

    # End tags listed under 'Ignore' only produce a parse error.
    def endTagIgnore(name)
      parse_error("unexpected-end-tag-in-table-row", {"name" => name})
    end

    # Every other end tag is handled by the :inTable phase.
    def endTagOther(name)
      in_table_phase.processEndTag(name)
    end

    # EOF is handled by the :inTable phase.
    def process_eof
      in_table_phase.process_eof
    end

    protected

    # The phase object most tokens are delegated to.
    def in_table_phase
      @parser.phases[:inTable]
    end

    # XXX unify this with other table helper methods
    #
    # Pops open elements, reporting a parse error for each one, until the
    # current node is a <tr> or <html> element.
    def clearStackToTableRowContext
      until %w[tr html].include?(@tree.open_elements.last.name)
        stray_name = @tree.open_elements.last.name
        parse_error("unexpected-implied-end-tag-in-table-row", {"name" => stray_name})
        @tree.open_elements.pop
      end
    end

    # True when there is no <tr> in table scope, i.e. a </tr> must be ignored.
    def ignoreEndTagTr
      !in_scope?('tr', :tableVariant => true)
    end

  end
end
@@ -0,0 +1,90 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Token handlers for the "in select" insertion mode.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-select
  class InSelectPhase < Phase

    handle_start 'html', 'option', 'optgroup', 'select'
    handle_start 'input'
    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    # Character data is inserted as text at the current position.
    def processCharacters(data)
      @tree.insertText(data)
    end

    # <option>: an <option> that is still the current node is implicitly
    # closed before the new one is inserted.
    def startTagOption(name, attributes)
      pop_current_node_if('option')
      @tree.insert_element(name, attributes)
    end

    # <optgroup>: implicitly closes an open <option> and then an open
    # <optgroup> before inserting the new element.
    def startTagOptgroup(name, attributes)
      pop_current_node_if('option')
      pop_current_node_if('optgroup')
      @tree.insert_element(name, attributes)
    end

    # A nested <select> is a parse error and is treated like </select>.
    def startTagSelect(name, attributes)
      parse_error("unexpected-select-in-select")
      endTagSelect('select')
    end

    # <input> is a parse error: the select is closed and the tag is then
    # reprocessed by whichever phase is current afterwards.
    def startTagInput(name, attributes)
      @parser.parse_error("unexpected-input-in-select")
      endTagSelect("select")
      @parser.phase.processStartTag(name, attributes)
    end

    # Any other start tag is reported as a parse error and dropped.
    def startTagOther(name, attributes)
      parse_error("unexpected-start-tag-in-select", {"name" => name})
    end

    # </option>: pops the current node if it is an <option>, otherwise
    # reports a parse error.
    def endTagOption(name)
      return @tree.open_elements.pop if current_node_named?('option')

      parse_error("unexpected-end-tag-in-select", {"name" => "option"})
    end

    # </optgroup>: closes an <option> sitting directly inside an <optgroup>,
    # then closes the <optgroup> itself; anything else is a parse error.
    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if current_node_named?('option') && @tree.open_elements[-2].name == 'optgroup'
        @tree.open_elements.pop
      end

      # It also closes </optgroup>, but nothing else
      return @tree.open_elements.pop if current_node_named?('optgroup')

      parse_error("unexpected-end-tag-in-select", {"name" => "optgroup"})
    end

    # </select>: when a <select> is in scope, pops the stack up to it and
    # resets the insertion mode; otherwise reports a parse error.
    def endTagSelect(name)
      # inner_html case
      return parse_error unless in_scope?('select', true)

      remove_open_elements_until('select')

      @parser.reset_insertion_mode
    end

    # Table-related end tags are always a parse error. If the named element is
    # in scope the select is closed and the end tag is reprocessed.
    def endTagTableElements(name)
      parse_error("unexpected-end-tag-in-select", {"name" => name})

      return unless in_scope?(name, true)

      endTagSelect('select')
      @parser.phase.processEndTag(name)
    end

    # Any other end tag is reported as a parse error and dropped.
    def endTagOther(name)
      parse_error("unexpected-end-tag-in-select", {"name" => name})
    end

    protected

    # True when the last element on the open-elements stack has the given name.
    def current_node_named?(tag_name)
      @tree.open_elements.last.name == tag_name
    end

    # Pops the last open element when it has the given name.
    def pop_current_node_if(tag_name)
      @tree.open_elements.pop if current_node_named?(tag_name)
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Token handlers for a <select> that was opened inside table content.
  #
  # Table-structure tags close the select and are reprocessed; every other
  # token is delegated to the :inSelect phase.
  class InSelectInTablePhase < Phase

    handle_start %w(caption table tbody tfoot thead tr td th) => 'Table'
    handle_end %w(caption table tbody tfoot thead tr td th) => 'Table'

    # Character data is handled by the :inSelect phase.
    def processCharacters(data)
      in_select_phase.processCharacters(data)
    end

    # A table-structure start tag is a parse error: it is treated as an
    # implied </select> and then reprocessed in the resulting phase.
    def startTagTable(name, attributes)
      @parser.parse_error("unexpected-table-element-start-tag-in-select-in-table", {:name => name})
      endTagOther("select")
      @parser.phase.processStartTag(name, attributes)
    end

    # Every other start tag is handled by the :inSelect phase.
    def startTagOther(name, attributes)
      in_select_phase.processStartTag(name, attributes)
    end

    # A table-structure end tag is always a parse error. If the named element
    # is in scope the select is closed and the end tag is reprocessed.
    def endTagTable(name)
      @parser.parse_error("unexpected-table-element-end-tag-in-select-in-table", {:name => name})
      return unless @tree.elementInScope(name, true)

      endTagOther("select")
      @parser.phase.processEndTag(name)
    end

    # Every other end tag is handled by the :inSelect phase.
    def endTagOther(name)
      in_select_phase.processEndTag(name)
    end

    protected

    # The phase object that non-table tokens are delegated to.
    def in_select_phase
      @parser.phases[:inSelect]
    end
  end
end
@@ -0,0 +1,92 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Token handlers for the "in table body" insertion mode.
  #
  # Character tokens, EOF and any tag not listed in the handler tables below
  # are delegated to the :inTable phase.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
  class InTableBodyPhase < Phase

    handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'

    handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ignore'

    # Character data is handled exactly as in the :inTable phase.
    def processCharacters(data)
      in_table_phase.processCharacters(data)
    end

    # Whitespace is handled exactly as in the :inTable phase.
    def processSpaceCharacters(data)
      in_table_phase.processSpaceCharacters(data)
    end

    # EOF is handled by the :inTable phase.
    def process_eof
      in_table_phase.process_eof
    end

    # <tr>: clear the stack back to the row group, insert the row and switch
    # to the :inRow phase.
    def startTagTr(name, attributes)
      clearStackToTableBodyContext
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[:inRow]
    end

    # <td>/<th> directly inside a row group is a parse error: an attribute-less
    # <tr> is implied and the cell tag is reprocessed in the resulting phase.
    def startTagTableCell(name, attributes)
      parse_error("unexpected-cell-in-table-body", {"name" => name})
      startTagTr('tr', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # Table-structure start tags close the current row group (when one is in
    # scope) and are then reprocessed; otherwise they are a parse error.
    def startTagTableOther(name, attributes)
      unless row_group_in_scope?
        # inner_html case
        return parse_error("unexpected-start-tag", {:name => name})
      end

      close_current_row_group
      @parser.phase.processStartTag(name, attributes)
    end

    # Every other start tag is handled by the :inTable phase.
    def startTagOther(name, attributes)
      in_table_phase.processStartTag(name, attributes)
    end

    # </tbody>, </tfoot>, </thead>: when the named element is in scope, clear
    # the stack back to it, pop it and return to the :inTable phase.
    def endTagTableRowGroup(name)
      unless in_scope?(name, true)
        return parse_error("unexpected-end-tag-in-table-body", {"name" => name})
      end

      clearStackToTableBodyContext
      @tree.open_elements.pop
      @parser.phase = @parser.phases[:inTable]
    end

    # </table>: close the current row group (when one is in scope) and
    # reprocess the end tag; otherwise report a parse error.
    def endTagTable(name)
      unless row_group_in_scope?
        # inner_html case
        return parse_error("unexpected-end-tag", {:name => name})
      end

      close_current_row_group
      @parser.phase.processEndTag(name)
    end

    # End tags listed under 'Ignore' only produce a parse error.
    def endTagIgnore(name)
      parse_error("unexpected-end-tag-in-table-body", {"name" => name})
    end

    # Every other end tag is handled by the :inTable phase.
    def endTagOther(name)
      in_table_phase.processEndTag(name)
    end

    protected

    # The phase object most tokens are delegated to.
    def in_table_phase
      @parser.phases[:inTable]
    end

    # True when a <tbody>, <thead> or <tfoot> is in scope (checked in that
    # order, stopping at the first hit).
    def row_group_in_scope?
      %w[tbody thead tfoot].any? { |group| in_scope?(group, true) }
    end

    # Clears the stack back to the row group and then processes an end tag
    # for whatever element is the current node at that point.
    def close_current_row_group
      clearStackToTableBodyContext
      endTagTableRowGroup(@tree.open_elements.last.name)
    end

    # Pops open elements, reporting a parse error for each one, until the
    # current node is a <tbody>, <tfoot>, <thead> or <html> element.
    def clearStackToTableBodyContext
      until %w[tbody tfoot thead html].include?(@tree.open_elements.last.name)
        stray_name = @tree.open_elements.last.name
        parse_error("unexpected-implied-end-tag-in-table", {"name" => stray_name})
        @tree.open_elements.pop
      end
    end

  end
end
@@ -0,0 +1,177 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Token handlers for the "in table" insertion mode.
  #
  # Content that is not allowed directly inside a table is processed by the
  # :inBody phase while @tree.insert_from_table is set, and marks the current
  # table as "tainted" via its flags.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-table
  class InTablePhase < Phase

    handle_start 'html', 'caption', 'colgroup', 'col', 'table'

    handle_start %w( tbody tfoot thead ) => 'RowGroup', %w( td th tr ) => 'ImplyTbody'

    # Uses the same "array => handler suffix" form as the mappings above so
    # that <style> and <script> are routed to startTagStyleScript.
    handle_start %w( style script ) => 'StyleScript'

    handle_start 'input'

    handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

    # Whitespace is inserted directly while the table is untainted; once the
    # table is tainted it is treated like any other character data.
    def processSpaceCharacters(data)
      if table_tainted?
        processCharacters(data)
      else
        @tree.insertText(data)
      end
    end

    # Character data inside <style>/<script> is inserted as text. Anything else
    # taints the table (reporting a parse error the first time) and is
    # processed by the :inBody phase with insert_from_table enabled.
    def processCharacters(data)
      if ["style", "script"].include?(@tree.open_elements.last.name)
        @tree.insertText(data)
      else
        unless table_tainted?
          @parser.parse_error("unexpected-char-implies-table-voodoo")
          current_table.flags << "tainted"
        end
        # Do the table magic!
        with_insert_from_table do
          @parser.phases[:inBody].processCharacters(data)
        end
      end
    end

    # EOF with anything but <html> as the current node is a parse error;
    # otherwise this must be the inner_html (fragment) case.
    def process_eof
      if @tree.open_elements.last.name != "html"
        @parser.parse_error("eof-in-table")
      else
        # The parser attribute is spelled inner_html everywhere else in the
        # phases; innerHTML here looked like a leftover from another naming
        # scheme.
        assert @parser.inner_html
      end
    end

    # <caption>: clear the stack back to the table, push a Marker onto the
    # active formatting elements, insert the element and switch to :inCaption.
    def startTagCaption(name, attributes)
      clear_stack_to_table_context
      @tree.activeFormattingElements.push(Marker)
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[:inCaption]
    end

    # <colgroup>: clear the stack back to the table, insert the element and
    # switch to :inColumnGroup.
    def startTagColgroup(name, attributes)
      clear_stack_to_table_context
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[:inColumnGroup]
    end

    # <col>: imply an attribute-less <colgroup>, then reprocess the tag.
    def startTagCol(name, attributes)
      startTagColgroup('colgroup', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # <tbody>, <tfoot>, <thead>: clear the stack back to the table, insert the
    # element and switch to :inTableBody.
    def startTagRowGroup(name, attributes)
      clear_stack_to_table_context
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[:inTableBody]
    end

    # <td>, <th>, <tr>: imply an attribute-less <tbody>, then reprocess the tag.
    def startTagImplyTbody(name, attributes)
      startTagRowGroup('tbody', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # A nested <table> is a parse error: it acts as </table> and, outside the
    # inner_html case, the start tag is then reprocessed.
    def startTagTable(name, attributes)
      parse_error("unexpected-start-tag-implies-end-tag",
                  {"startName" => "table", "endName" => "table"})
      @parser.phase.processEndTag('table')
      @parser.phase.processStartTag(name, attributes) unless @parser.inner_html
    end

    # Any other start tag is a parse error, taints the table and is processed
    # by the :inBody phase with insert_from_table enabled.
    def startTagOther(name, attributes)
      @parser.parse_error("unexpected-start-tag-implies-table-voodoo", {:name => name})
      current_table.flags.push("tainted") unless table_tainted?
      with_insert_from_table do
        # Process the start tag in the "in body" mode
        @parser.phases[:inBody].processStartTag(name, attributes)
      end
    end

    # <style>/<script>: handled by the :inHead phase while the table is
    # untainted, otherwise like any other misplaced start tag.
    def startTagStyleScript(name, attributes)
      if table_tainted?
        startTagOther(name, attributes)
      else
        @parser.phases[:inHead].processStartTag(name, attributes)
      end
    end

    # <input type=hidden> in an untainted table is inserted and immediately
    # popped (with a parse error); every other <input> is handled like any
    # other misplaced start tag.
    def startTagInput(name, attributes)
      if attributes.include?("type") &&
          attributes["type"].downcase == "hidden" &&
          !table_tainted?
        @parser.parse_error("unexpected-hidden-input-in-table")
        @tree.insert_element(name, attributes)
        # XXX associate with form
        @tree.open_elements.pop
      else
        startTagOther(name, attributes)
      end
    end

    # </table>: when a table is in scope, generate implied end tags, report a
    # parse error if the current node is not the table, pop up to the table
    # and reset the insertion mode. Otherwise report a parse error.
    def endTagTable(name)
      if in_scope?('table', true)
        @tree.generateImpliedEndTags

        unless @tree.open_elements.last.name == 'table'
          parse_error("end-tag-too-early-named",
                      {"gotName" => "table",
                       "expectedName" => @tree.open_elements.last.name})
        end

        remove_open_elements_until('table')

        @parser.reset_insertion_mode
      else
        # inner_html case
        assert @parser.inner_html
        parse_error "unexpected-end-tag", {:name => name}
      end
    end

    # End tags listed under 'Ignore' only produce a parse error.
    def endTagIgnore(name)
      parse_error("unexpected-end-tag", {"name" => name})
    end

    # Any other end tag is a parse error and is processed by the :inBody
    # phase with insert_from_table enabled.
    def endTagOther(name)
      parse_error("unexpected-end-tag-implies-table-voodoo", {"name" => name})
      # Make all the special element rearranging voodoo kick in
      with_insert_from_table do
        # Process the end tag in the "in body" mode
        @parser.phases[:inBody].processEndTag(name)
      end
    end

    # End-tag counterpart of startTagStyleScript.
    # NOTE(review): no handle_end mapping in this class names this method, so
    # it does not appear to be reachable through tag dispatch - confirm
    # whether it is meant to be wired up.
    def endStyleScript name
      if table_tainted?
        endTagOther(name)
      else
        @parser.phases[:inHead].processEndTag(name)
      end
    end

    protected

    # "clear the stack back to a table context": pops open elements, reporting
    # a parse error for each one, until the current node is <table> or <html>.
    def clear_stack_to_table_context
      until %w[table html].include?(@tree.open_elements.last.name)
        parse_error("unexpected-implied-end-tag-in-table",
                    {"name" => @tree.open_elements.last.name})
        @tree.open_elements.pop
      end
      # When the current node is <html> it's an inner_html case
    end

    # The innermost open <table> element. When no table is open at all, the
    # bottom-most open element is returned instead of walking off the start of
    # the stack and calling #name on nil.
    # NOTE(review): the fallback assumes that element also responds to #flags
    # - confirm against the tree builders.
    def current_table
      @tree.open_elements.reverse.detect { |node| node.name == "table" } ||
        @tree.open_elements.first
    end

    # True when the current table has already been marked "tainted".
    def table_tainted?
      current_table.flags.include?("tainted")
    end

    # Runs the block with @tree.insert_from_table set to true and sets it back
    # to false afterwards.
    def with_insert_from_table
      @tree.insert_from_table = true
      yield
      @tree.insert_from_table = false
    end

  end
end