spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,69 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InCaptionPhase < Phase

  # Parser phase for tokens received while a <caption> element is being built.
  # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

  handle_start 'html', %w(caption col colgroup tbody td tfoot th thead tr) => 'TableElement'

  handle_end 'caption', 'table', %w(body col colgroup html tbody td tfoot th thead tr) => 'Ignore'

  # Tells whether a </caption> end tag would currently be ignored, which is
  # the case exactly when in_scope?('caption', true) is false.
  def ignoreEndTagCaption
    !in_scope?('caption', true)
  end

  # Character data is forwarded untouched to the :inBody phase.
  def processCharacters(data)
    in_body = @parser.phases[:inBody]
    in_body.processCharacters(data)
  end

  # A table-structure start tag implicitly closes the caption and is then
  # reprocessed by whatever phase the parser ends up in.
  def startTagTableElement(name, attributes)
    parse_error("unexpected-end-tag", {"name" => name})
    # The ignore decision has to be captured first: processing the implied
    # </caption> may change what is in scope.
    caption_ignored = ignoreEndTagCaption
    @parser.phase.processEndTag('caption')
    return if caption_ignored
    @parser.phase.processStartTag(name, attributes)
  end

  # Every other start tag is handled by the :inBody phase.
  def startTagOther(name, attributes)
    in_body = @parser.phases[:inBody]
    in_body.processStartTag(name, attributes)
  end

  # Closes the caption and switches the parser to the :inTable phase.
  # When the end tag must be ignored (inner_html case) only a parse error
  # is recorded.
  def endTagCaption(name)
    if ignoreEndTagCaption
      # inner_html case
      assert @parser.inner_html
      return parse_error("unexpected-end-tag", {"name" => name})
    end

    # AT this code is quite similar to endTagTable in "InTable"
    @tree.generateImpliedEndTags

    current = @tree.open_elements.last
    if current.name != 'caption'
      parse_error("expected-one-end-tag-but-got-another",
                  {"gotName" => "caption",
                   "expectedName" => current.name})
    end

    remove_open_elements_until('caption')

    @tree.clearActiveFormattingElements
    @parser.phase = @parser.phases[:inTable]
  end

  # A </table> seen inside a caption first closes the caption, then the
  # end tag is reprocessed unless the implied </caption> was ignored.
  def endTagTable(name)
    parse_error("unexpected-end-table-in-caption")
    caption_ignored = ignoreEndTagCaption
    @parser.phase.processEndTag('caption')
    return if caption_ignored
    @parser.phase.processEndTag(name)
  end

  # End tags mapped to 'Ignore' only produce a parse error.
  def endTagIgnore(name)
    parse_error "unexpected-end-tag", {"name" => name}
  end

  # Every other end tag is handled by the :inBody phase.
  def endTagOther(name)
    in_body = @parser.phases[:inBody]
    in_body.processEndTag(name)
  end
end
69
+ end
@@ -0,0 +1,78 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InCellPhase < Phase

  # Parser phase for tokens received while a <td> or <th> cell is open.
  # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

  handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableOther'

  handle_end %w( td th ) => 'TableCell', %w( body caption col colgroup html ) => 'Ignore'

  handle_end %w( table tbody tfoot thead tr ) => 'Imply'

  # Character data is forwarded untouched to the :inBody phase.
  def processCharacters(data)
    in_body = @parser.phases[:inBody]
    in_body.processCharacters(data)
  end

  # A table-structure start tag closes the open cell and is then
  # reprocessed by the phase the parser has moved to.
  def startTagTableOther(name, attributes)
    unless in_scope?('td', true) || in_scope?('th', true)
      # inner_html case
      return parse_error
    end

    closeCell
    @parser.phase.processStartTag(name, attributes)
  end

  # Every other start tag is handled by the :inBody phase.
  def startTagOther(name, attributes)
    in_body = @parser.phases[:inBody]
    in_body.processStartTag(name, attributes)
  end

  # Closes the cell called +name+ ('td' or 'th') and moves the parser to
  # the :inRow phase. If no such cell is in scope only a parse error is
  # recorded.
  def endTagTableCell(name)
    unless in_scope?(name, true)
      return parse_error("unexpected-end-tag", {"name" => name})
    end

    @tree.generateImpliedEndTags(name)
    if @tree.open_elements.last.name == name
      @tree.open_elements.pop
    else
      # Something other than the cell is still open on top of the stack.
      parse_error("unexpected-cell-end-tag", {"name" => name})

      remove_open_elements_until(name)
    end
    @tree.clearActiveFormattingElements
    @parser.phase = @parser.phases[:inRow]
  end

  # End tags mapped to 'Ignore' only produce a parse error.
  def endTagIgnore(name)
    parse_error "unexpected-end-tag", {"name" => name}
  end

  # A table-structure end tag that is in scope closes the open cell and
  # is then reprocessed; otherwise it is reported and dropped.
  def endTagImply(name)
    unless in_scope?(name, true)
      # sometimes inner_html case
      return parse_error("unexpected-end-tag", {:name => name})
    end

    closeCell
    @parser.phase.processEndTag(name)
  end

  # Every other end tag is handled by the :inBody phase.
  def endTagOther(name)
    in_body = @parser.phases[:inBody]
    in_body.processEndTag(name)
  end

  protected

  # Ends whichever cell is in scope, checking 'td' before 'th'.
  # Does nothing when neither is in scope.
  def closeCell
    open_cell = %w( td th ).find { |tag| in_scope?(tag, true) }
    endTagTableCell(open_cell) if open_cell
  end

end
78
+ end
@@ -0,0 +1,55 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InColumnGroupPhase < Phase

  # Parser phase for tokens received while a <colgroup> element is open.
  # http://www.whatwg.org/specs/web-apps/current-work/#in-column

  handle_start 'html', 'col'

  handle_end 'colgroup', 'col'

  # Tells whether a </colgroup> would currently be ignored, which is the
  # case when the element on top of the open-element stack is 'html'.
  def ignoreEndTagColgroup
    @tree.open_elements.last.name == 'html'
  end

  # Characters implicitly end the colgroup and are then reprocessed.
  def processCharacters(data)
    close_colgroup_then { |phase| phase.processCharacters(data) }
  end

  # <col> is inserted and immediately popped again, so it never stays on
  # the open-element stack.
  def startTagCol(name, attributes)
    @tree.insert_element(name, attributes)
    @tree.open_elements.pop
  end

  # Any other start tag implicitly ends the colgroup and is reprocessed.
  def startTagOther(name, attributes)
    close_colgroup_then { |phase| phase.processStartTag(name, attributes) }
  end

  # Pops the colgroup and switches the parser to the :inTable phase, or
  # records a parse error when the end tag has to be ignored.
  def endTagColgroup(name)
    if ignoreEndTagColgroup
      # inner_html case
      assert @parser.inner_html
      return parse_error("unexpected-end-tag", {:name => name})
    end

    @tree.open_elements.pop
    @parser.phase = @parser.phases[:inTable]
  end

  # A </col> end tag is always a parse error.
  def endTagCol(name)
    parse_error "no-end-tag", {"name" => "col"}
  end

  # Any other end tag implicitly ends the colgroup and is reprocessed.
  def endTagOther(name)
    close_colgroup_then { |phase| phase.processEndTag(name) }
  end

  protected

  # Shared "act as if </colgroup> had been seen, then reprocess" step.
  # The ignore decision is taken before the implied end tag is handled;
  # the block receives the parser's phase as it is afterwards and is
  # skipped entirely when the implied end tag was ignored.
  def close_colgroup_then
    skipped = ignoreEndTagColgroup
    endTagColgroup('colgroup')
    yield @parser.phase unless skipped
  end

end
55
+ end
@@ -0,0 +1,50 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InForeignContentPhase < Phase

  # Parser phase for tokens received while an element with a namespace
  # (e.g. :math) is on the stack of open elements.

  # Open-element names that, when in the :math namespace, let start tags
  # be handled by the secondary phase.
  MATHML_TEXT_ELEMENTS = %w[mi mo mn ms mtext].freeze

  # Start tags excluded from the secondary-phase hand-off above.
  MATHML_TEXT_EXCLUSIONS = %w[mglyph malignmark].freeze

  # Start tags that force the parser out of foreign content.
  HTML_BREAKOUT_ELEMENTS = %w[b big blockquote body br center code dd div dl dt em embed font
    h1 h2 h3 h4 h5 h6 head hr i img li listing menu meta nobr ol p pre ruby s small
    span strong strike sub sup table tt u ul var].freeze

  # Character data is inserted directly into the tree.
  def processCharacters(data)
    @tree.insertText(data)
  end

  # Handles a start tag seen in foreign content.
  #
  # name         - tag name
  # attributes   - attributes of the tag
  # self_closing - when true the inserted foreign element is popped again
  #                right away. Defaults to false so the method can also be
  #                reached through the two-argument
  #                processStartTag(name, attributes) calls used everywhere
  #                else in the parser.
  def startTagOther(name, attributes, self_closing = false)
    current = @tree.open_elements.last

    if !MATHML_TEXT_EXCLUSIONS.include?(name) &&
       MATHML_TEXT_ELEMENTS.include?(current.name) &&
       current.namespace == :math

      @parser.secondary_phase.processStartTag(name, attributes)
      leave_foreign_content_if_done
    elsif HTML_BREAKOUT_ELEMENTS.include?(name)

      parse_error("html-in-foreign-content", :name => name)

      # Drop every namespaced element from the top of the stack.
      until @tree.open_elements.last.namespace.nil?
        @tree.open_elements.pop
      end
      @parser.phase = @parser.secondary_phase
      @parser.phase.processStartTag(name, attributes)
    else
      if @tree.open_elements.last.namespace == :math
        # The result must be kept: it was previously assigned to a
        # misspelled local and the adjustment was lost.
        attributes = adjust_mathml_attributes(attributes)
      end
      attributes = adjust_foreign_attributes(attributes)
      @tree.insert_foreign_element(name, attributes, @tree.open_elements.last.namespace)
      @tree.open_elements.pop if self_closing
    end
  end

  # End tags are handled by the secondary phase; afterwards the parser
  # leaves foreign content if no namespaced element remains open.
  def endTagOther(name)
    @parser.secondary_phase.processEndTag(name)
    leave_foreign_content_if_done
  end

  protected

  # Switches to the secondary phase when the parser is still in the
  # :inForeignContent phase but no open element has a namespace any more.
  def leave_foreign_content_if_done
    if @parser.phase == @parser.phases[:inForeignContent]
      if !@tree.open_elements.any? {|e| e.namespace }
        @parser.phase = @parser.secondary_phase
      end
    end
  end
end
50
+ end
@@ -0,0 +1,56 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InFramesetPhase < Phase

  # Parser phase for tokens received while a <frameset> element is open.
  # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

  handle_start 'html', 'frameset', 'frame', 'noframes'

  handle_end 'frameset', 'noframes'

  # Character data is not accepted here; only a parse error is recorded.
  def processCharacters(data)
    parse_error "unexpected-char-in-frameset"
  end

  # A nested <frameset> is inserted and stays on the open-element stack.
  def startTagFrameset(name, attributes)
    @tree.insert_element(name, attributes)
  end

  # <frame> is inserted and immediately popped again, so it never stays
  # on the open-element stack.
  def startTagFrame(name, attributes)
    @tree.insert_element(name, attributes)
    @tree.open_elements.pop
  end

  # <noframes> is handled by the :inBody phase.
  def startTagNoframes(name, attributes)
    in_body = @parser.phases[:inBody]
    in_body.processStartTag(name, attributes)
  end

  # Every other start tag is reported and dropped.
  def startTagOther(name, attributes)
    parse_error "unexpected-start-tag-in-frameset", {"name" => name}
  end

  # Pops the current frameset (or reports an error when the current node
  # is 'html') and moves on to the :afterFrameset phase when appropriate.
  def endTagFrameset(name)
    if @tree.open_elements.last.name == 'html'
      # inner_html case
      parse_error "unexpected-frameset-in-frameset-innerhtml"
    else
      @tree.open_elements.pop
    end

    # Only switch when we are not in inner_html mode and the current node
    # is not a "frameset" element (anymore).
    return if @parser.inner_html
    return if @tree.open_elements.last.name == 'frameset'
    @parser.phase = @parser.phases[:afterFrameset]
  end

  # </noframes> is handled by the :inBody phase.
  def endTagNoframes(name)
    in_body = @parser.phases[:inBody]
    in_body.processEndTag(name)
  end

  # Every other end tag is reported and dropped.
  def endTagOther(name)
    parse_error "unexpected-end-tag-in-frameset", {"name" => name}
  end
end
56
+ end
@@ -0,0 +1,143 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InHeadPhase < Phase

  # Parser phase for tokens received while the <head> element is being built.

  handle_start 'html', 'head', 'title', 'style', 'script', 'noscript'
  handle_start %w( base link meta)

  handle_end 'head'
  handle_end %w( html body br ) => 'ImplyAfterHead'
  handle_end %w( title style script noscript )

  # At end of input an unfinished title/style/script is reported and
  # popped, the head is left, and EOF is passed on to the next phase.
  def process_eof
    current_name = @tree.open_elements.last.name
    if %w[title style script].include?(current_name)
      parse_error("expected-named-closing-tag-but-got-eof", {"name" => current_name})
      @tree.open_elements.pop
    end
    anything_else
    @parser.phase.process_eof
  end

  # Text inside title/style/script/noscript is inserted as-is; any other
  # text leaves the head and is reprocessed by the next phase.
  def processCharacters(data)
    if %w[title style script noscript].include?(@tree.open_elements.last.name)
      return @tree.insertText(data)
    end

    anything_else
    @parser.phase.processCharacters(data)
  end

  # A second <head> is a parse error and is otherwise dropped.
  def startTagHead(name, attributes)
    parse_error "two-heads-are-not-better-than-one"
  end

  # Inserts <title> and puts the tokenizer's content model flag to :RCDATA.
  def startTagTitle(name, attributes)
    if appending_to_head?
      node = @tree.createElement(name, attributes)
      appendToHead(node)
      @tree.open_elements << node
    else
      @tree.insert_element(name, attributes)
    end
    @parser.tokenizer.content_model_flag = :RCDATA
  end

  # Inserts <style> and puts the tokenizer's content model flag to :CDATA.
  def startTagStyle(name, attributes)
    if appending_to_head?
      node = @tree.createElement(name, attributes)
      appendToHead(node)
      @tree.open_elements.push(node)
    else
      @tree.insert_element(name, attributes)
    end
    @parser.tokenizer.content_model_flag = :CDATA
  end

  # Inserts <noscript> and puts the tokenizer's content model flag to :CDATA.
  def startTagNoscript(name, attributes)
    # XXX Need to decide whether to implement the scripting disabled case.
    node = @tree.createElement(name, attributes)
    if appending_to_head?
      appendToHead(node)
    else
      @tree.open_elements.last.appendChild(node)
    end
    @tree.open_elements.push(node)
    @parser.tokenizer.content_model_flag = :CDATA
  end

  # Inserts <script>, flags it as "parser-inserted" and puts the
  # tokenizer's content model flag to :CDATA.
  def startTagScript(name, attributes)
    #XXX Inner HTML case may be wrong
    node = @tree.createElement(name, attributes)
    node.flags.push("parser-inserted")
    if appending_to_head?
      appendToHead(node)
    else
      @tree.open_elements.last.appendChild(node)
    end
    @tree.open_elements.push(node)
    @parser.tokenizer.content_model_flag = :CDATA
  end

  # <base>, <link> and <meta> are added without remaining on the
  # open-element stack.
  def startTagBaseLinkMeta(name, attributes)
    if appending_to_head?
      node = @tree.createElement(name, attributes)
      appendToHead(node)
    else
      @tree.insert_element(name, attributes)
      @tree.open_elements.pop
    end
  end

  # Any other start tag leaves the head and is reprocessed.
  def startTagOther(name, attributes)
    anything_else
    @parser.phase.processStartTag(name, attributes)
  end

  # Pops <head> when it is the current node (parse error otherwise) and
  # always switches to the :afterHead phase.
  def endTagHead(name)
    if @tree.open_elements.last.name != 'head'
      parse_error("unexpected-end-tag", {"name" => "head"})
    else
      @tree.open_elements.pop
    end
    @parser.phase = @parser.phases[:afterHead]
  end

  # </html>, </body> and </br> leave the head and are reprocessed.
  def endTagImplyAfterHead(name)
    anything_else
    @parser.phase.processEndTag(name)
  end

  # Pops the current node when it matches the end tag; otherwise the
  # end tag is reported as unexpected.
  def endTagTitleStyleScriptNoscript(name)
    if @tree.open_elements.last.name != name
      parse_error("unexpected-end-tag", {"name" => name})
    else
      @tree.open_elements.pop
    end
  end

  # Any other end tag just leaves the head.
  def endTagOther(name)
    anything_else
  end

  # Leaves the head: closes <head> when it is the current node, otherwise
  # only switches the parser to the :afterHead phase.
  def anything_else
    if @tree.open_elements.last.name != 'head'
      @parser.phase = @parser.phases[:afterHead]
    else
      endTagHead('head')
    end
  end

  protected

  # True when a head pointer exists and the parser's current phase is
  # :inHead, i.e. when new elements are attached through appendToHead.
  def appending_to_head?
    @tree.head_pointer != nil && @parser.phase == @parser.phases[:inHead]
  end

  # Appends +element+ to the head pointer, or to the current node when
  # there is no head pointer (expected only in the inner_html case).
  def appendToHead(element)
    target = @tree.head_pointer
    if target.nil?
      assert @parser.inner_html
      @tree.open_elements.last.appendChild(element)
    else
      target.appendChild(element)
    end
  end

end
143
+ end