spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,69 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Tree-construction phase for tokens seen while a table caption is open.
  #
  # Character data and otherwise-unhandled tags are forwarded to the :inBody
  # phase. Table-structure tags first process an end tag for "caption"
  # through the parser's current phase and are then re-dispatched.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
  class InCaptionPhase < Phase

    handle_start 'html', %w[caption col colgroup tbody td tfoot th thead tr] => 'TableElement'

    handle_end 'caption', 'table', %w[body col colgroup html tbody td tfoot th thead tr] => 'Ignore'

    # True when no "caption" element is in scope, i.e. when an end tag for
    # caption would be ignored.
    # NOTE(review): the second argument to in_scope? presumably selects the
    # table-scope variant -- confirm against Phase#in_scope?.
    def ignoreEndTagCaption
      !in_scope?('caption', true)
    end

    # Character data is handled by the :inBody phase.
    def processCharacters(data)
      in_body = @parser.phases[:inBody]
      in_body.processCharacters(data)
    end

    # Handles the start tags registered above as 'TableElement': records a
    # parse error, processes an end tag for "caption", and re-dispatches the
    # start tag unless that end tag was ignored.
    # NOTE(review): the error code says "end-tag" although this handles a
    # start tag; kept as-is, confirm whether that is intentional.
    def startTagTableElement(name, attributes)
      parse_error("unexpected-end-tag", {"name" => name})
      # XXX Have to duplicate logic here to find out if the tag is ignored
      caption_missing = ignoreEndTagCaption
      @parser.phase.processEndTag('caption')
      return if caption_missing

      @parser.phase.processStartTag(name, attributes)
    end

    # Any other start tag is handled by the :inBody phase.
    def startTagOther(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Closes the caption and switches the parser to the :inTable phase, or
    # records a parse error when no caption is in scope.
    def endTagCaption(name)
      if ignoreEndTagCaption
        # inner_html case
        assert @parser.inner_html
        return parse_error("unexpected-end-tag", {"name" => name})
      end

      # AT this code is quite similar to endTagTable in "InTable"
      @tree.generateImpliedEndTags

      current_name = @tree.open_elements.last.name
      if current_name != 'caption'
        parse_error("expected-one-end-tag-but-got-another",
                    {"gotName" => "caption", "expectedName" => current_name})
      end

      remove_open_elements_until('caption')

      @tree.clearActiveFormattingElements
      @parser.phase = @parser.phases[:inTable]
    end

    # An end tag for "table" inside a caption: records a parse error,
    # processes an end tag for "caption", then re-dispatches the table end
    # tag unless the caption end tag was ignored.
    def endTagTable(name)
      parse_error("unexpected-end-table-in-caption")
      caption_missing = ignoreEndTagCaption
      @parser.phase.processEndTag('caption')
      return if caption_missing

      @parser.phase.processEndTag(name)
    end

    # End tags registered as 'Ignore' only produce a parse error.
    def endTagIgnore(name)
      parse_error("unexpected-end-tag", {"name" => name})
    end

    # Any other end tag is handled by the :inBody phase.
    def endTagOther(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Tree-construction phase for tokens seen while a table cell (td/th) is
  # open.
  #
  # Character data and otherwise-unhandled tags are forwarded to the :inBody
  # phase; table-structure tags close the open cell and are re-dispatched
  # through the parser's current phase.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
  class InCellPhase < Phase

    handle_start 'html', %w[caption col colgroup tbody td tfoot th thead tr] => 'TableOther'

    handle_end %w[td th] => 'TableCell', %w[body caption col colgroup html] => 'Ignore'

    handle_end %w[table tbody tfoot thead tr] => 'Imply'

    # Character data is handled by the :inBody phase.
    def processCharacters(data)
      in_body = @parser.phases[:inBody]
      in_body.processCharacters(data)
    end

    # Start tags registered as 'TableOther': close the open cell and
    # re-dispatch the tag, or record a parse error when neither td nor th is
    # in scope.
    def startTagTableOther(name, attributes)
      cell_in_scope = %w[td th].any? { |cell| in_scope?(cell, true) }
      # inner_html case
      return parse_error unless cell_in_scope

      closeCell
      @parser.phase.processStartTag(name, attributes)
    end

    # Any other start tag is handled by the :inBody phase.
    def startTagOther(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # End tag for td/th: closes the cell and switches the parser to the
    # :inRow phase. Records a parse error when +name+ is not in scope, or
    # when the current node is not the cell being closed.
    def endTagTableCell(name)
      return parse_error("unexpected-end-tag", {"name" => name}) unless in_scope?(name, true)

      @tree.generateImpliedEndTags(name)
      if @tree.open_elements.last.name == name
        @tree.open_elements.pop
      else
        parse_error("unexpected-cell-end-tag", {"name" => name})
        remove_open_elements_until(name)
      end
      @tree.clearActiveFormattingElements
      @parser.phase = @parser.phases[:inRow]
    end

    # End tags registered as 'Ignore' only produce a parse error.
    def endTagIgnore(name)
      parse_error("unexpected-end-tag", {"name" => name})
    end

    # End tags registered as 'Imply': when the named element is in scope the
    # cell is closed and the end tag is re-dispatched; otherwise a parse
    # error is recorded.
    def endTagImply(name)
      # sometimes inner_html case
      return parse_error("unexpected-end-tag", {:name => name}) unless in_scope?(name, true)

      closeCell
      @parser.phase.processEndTag(name)
    end

    # Any other end tag is handled by the :inBody phase.
    def endTagOther(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

    protected

    # Closes whichever of td / th is in scope (td is checked first); does
    # nothing when neither is.
    def closeCell
      open_cell = %w[td th].find { |cell| in_scope?(cell, true) }
      endTagTableCell(open_cell) if open_cell
    end

  end
end
@@ -0,0 +1,55 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Tree-construction phase for tokens seen while a colgroup is open.
  #
  # Only col start tags are inserted here; every other token first closes
  # the colgroup and is then re-dispatched through the parser's current
  # phase, unless closing the colgroup was ignored.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-column
  class InColumnGroupPhase < Phase

    handle_start 'html', 'col'

    handle_end 'colgroup', 'col'

    # True when the current node is the html element, in which case an end
    # tag for colgroup is ignored.
    def ignoreEndTagColgroup
      @tree.open_elements.last.name == 'html'
    end

    # Closes the colgroup, then lets the parser's current phase handle the
    # characters unless the close was ignored.
    def processCharacters(data)
      close_ignored = ignoreEndTagColgroup
      endTagColgroup("colgroup")
      return if close_ignored

      @parser.phase.processCharacters(data)
    end

    # Inserts the col element and immediately pops it off the stack of open
    # elements again.
    def startTagCol(name, attributes)
      @tree.insert_element(name, attributes)
      @tree.open_elements.pop
    end

    # Closes the colgroup, then re-dispatches the start tag unless the close
    # was ignored.
    def startTagOther(name, attributes)
      close_ignored = ignoreEndTagColgroup
      endTagColgroup('colgroup')
      return if close_ignored

      @parser.phase.processStartTag(name, attributes)
    end

    # Pops the current node and switches the parser to the :inTable phase,
    # or records a parse error when the end tag has to be ignored.
    def endTagColgroup(name)
      if ignoreEndTagColgroup
        # inner_html case
        assert @parser.inner_html
        return parse_error("unexpected-end-tag", {:name => name})
      end

      @tree.open_elements.pop
      @parser.phase = @parser.phases[:inTable]
    end

    # An end tag for col only produces a parse error.
    def endTagCol(name)
      parse_error("no-end-tag", {"name" => "col"})
    end

    # Closes the colgroup, then re-dispatches the end tag unless the close
    # was ignored.
    def endTagOther(name)
      close_ignored = ignoreEndTagColgroup
      endTagColgroup('colgroup')
      return if close_ignored

      @parser.phase.processEndTag(name)
    end

  end
end
@@ -0,0 +1,50 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Tree-construction phase for tokens seen while the parser is in foreign
  # (namespaced, e.g. MathML) content.
  #
  # Tokens are either inserted as foreign elements or handed to the parser's
  # secondary phase. The parser switches back to the secondary phase once no
  # element on the stack of open elements has a namespace.
  class InForeignContentPhase < Phase

    # Start tags that are never delegated to the secondary phase from inside
    # a MathML text element.
    MATHML_GLYPH_TAGS = %w[mglyph malignmark].freeze

    # MathML elements inside which other start tags are delegated to the
    # secondary phase.
    MATHML_TEXT_TAGS = %w[mi mo mn ms mtext].freeze

    # HTML start tags that are a parse error in foreign content and cause
    # the namespaced elements to be popped.
    HTML_BREAKOUT_TAGS = %w[
      b big blockquote body br center code dd div dl dt em embed font
      h1 h2 h3 h4 h5 h6 head hr i img li listing menu meta nobr ol p pre ruby s small
      span strong strike sub sup table tt u ul var
    ].freeze

    # Character data is inserted as text at the current position.
    def processCharacters(data)
      @tree.insertText(data)
    end

    # Handles every start tag in this phase.
    #
    # name::         tag name
    # attributes::   the tag's attributes
    # self_closing:: when truthy, a foreign element inserted here is popped
    #                off the stack of open elements right away
    def startTagOther(name, attributes, self_closing)
      current = @tree.open_elements.last

      if !MATHML_GLYPH_TAGS.include?(name) && MATHML_TEXT_TAGS.include?(current.name) &&
         current.namespace == :math

        @parser.secondary_phase.processStartTag(name, attributes)
        leave_foreign_content_if_finished
      elsif HTML_BREAKOUT_TAGS.include?(name)
        parse_error("html-in-foreign-content", :name => name)

        # Pop until the current node is an element without a namespace.
        @tree.open_elements.pop until @tree.open_elements.last.namespace.nil?

        @parser.phase = @parser.secondary_phase
        @parser.phase.processStartTag(name, attributes)
      else
        parent_namespace = current.namespace
        # The result used to be assigned to a misspelled local
        # ("attribtues"), so the adjusted MathML attributes were thrown away.
        # NOTE(review): assumes adjust_mathml_attributes returns the adjusted
        # attributes like adjust_foreign_attributes does -- confirm in Phase.
        attributes = adjust_mathml_attributes(attributes) if parent_namespace == :math
        attributes = adjust_foreign_attributes(attributes)
        @tree.insert_foreign_element(name, attributes, parent_namespace)
        @tree.open_elements.pop if self_closing
      end
    end

    # Every end tag is handled by the secondary phase; afterwards foreign
    # content is left if no namespaced element remains open.
    def endTagOther(name)
      @parser.secondary_phase.processEndTag(name)
      leave_foreign_content_if_finished
    end

    protected

    # Switches the parser to its secondary phase when this phase is still
    # the current one and no open element has a namespace any more.
    def leave_foreign_content_if_finished
      return unless @parser.phase == @parser.phases[:inForeignContent]
      return if @tree.open_elements.any? { |element| element.namespace }

      @parser.phase = @parser.secondary_phase
    end
  end
end
@@ -0,0 +1,56 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Tree-construction phase for tokens seen while a frameset is open.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
  class InFramesetPhase < Phase

    handle_start 'html', 'frameset', 'frame', 'noframes'

    handle_end 'frameset', 'noframes'

    # Character data only produces a parse error in this phase.
    def processCharacters(data)
      parse_error("unexpected-char-in-frameset")
    end

    # A nested frameset is inserted as an element.
    def startTagFrameset(name, attributes)
      @tree.insert_element(name, attributes)
    end

    # A frame is inserted and immediately popped off the stack of open
    # elements again.
    def startTagFrame(name, attributes)
      @tree.insert_element(name, attributes)
      @tree.open_elements.pop
    end

    # noframes start tags are handled by the :inBody phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag only produces a parse error.
    def startTagOther(name, attributes)
      parse_error("unexpected-start-tag-in-frameset", {"name" => name})
    end

    # Pops the current frameset (or records a parse error when the current
    # node is the html element), then switches to the :afterFrameset phase
    # when appropriate.
    def endTagFrameset(name)
      if @tree.open_elements.last.name == 'html'
        # inner_html case
        parse_error("unexpected-frameset-in-frameset-innerhtml")
      else
        @tree.open_elements.pop
      end

      # Only switch when we are not in inner_html mode and the current node
      # is no longer a "frameset" element.
      return if @parser.inner_html
      return if @tree.open_elements.last.name == 'frameset'

      @parser.phase = @parser.phases[:afterFrameset]
    end

    # noframes end tags are handled by the :inBody phase.
    def endTagNoframes(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

    # Any other end tag only produces a parse error.
    def endTagOther(name)
      parse_error("unexpected-end-tag-in-frameset", {"name" => name})
    end
  end
end
@@ -0,0 +1,143 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Tree-construction phase for tokens seen while the head element is being
  # built.
  #
  # title, style, script and noscript are inserted and switch the
  # tokenizer's content model flag; base, link and meta are inserted without
  # staying open. Anything else leaves the head (see #anything_else) and is
  # re-dispatched through the parser's current phase.
  class InHeadPhase < Phase

    handle_start 'html', 'head', 'title', 'style', 'script', 'noscript'
    handle_start %w[base link meta]

    handle_end 'head'
    handle_end %w[html body br] => 'ImplyAfterHead'
    handle_end %w[title style script noscript]

    # End of input: an open title/style/script is reported and popped, then
    # the head is left and EOF is re-dispatched to the new current phase.
    def process_eof
      current_name = @tree.open_elements.last.name
      if %w[title style script].include?(current_name)
        parse_error("expected-named-closing-tag-but-got-eof", {"name" => current_name})
        @tree.open_elements.pop
      end
      anything_else
      @parser.phase.process_eof
    end

    # Text inside title/style/script/noscript is inserted; any other text
    # leaves the head and is re-dispatched.
    def processCharacters(data)
      text_container = %w[title style script noscript].include?(@tree.open_elements.last.name)
      return @tree.insertText(data) if text_container

      anything_else
      @parser.phase.processCharacters(data)
    end

    # A second head start tag only produces a parse error.
    def startTagHead(name, attributes)
      parse_error("two-heads-are-not-better-than-one")
    end

    # Inserts a title element and sets the content model flag to :RCDATA.
    def startTagTitle(name, attributes)
      insert_text_element(name, attributes, :RCDATA)
    end

    # Inserts a style element and sets the content model flag to :CDATA.
    def startTagStyle(name, attributes)
      insert_text_element(name, attributes, :CDATA)
    end

    # Inserts a noscript element, keeps it open, and sets the content model
    # flag to :CDATA.
    def startTagNoscript(name, attributes)
      # XXX Need to decide whether to implement the scripting disabled case.
      noscript = @tree.createElement(name, attributes)
      attach_element(noscript)
      @tree.open_elements.push(noscript)
      @parser.tokenizer.content_model_flag = :CDATA
    end

    # Inserts a script element flagged "parser-inserted", keeps it open, and
    # sets the content model flag to :CDATA.
    def startTagScript(name, attributes)
      # XXX Inner HTML case may be wrong
      script = @tree.createElement(name, attributes)
      script.flags.push("parser-inserted")
      attach_element(script)
      @tree.open_elements.push(script)
      @parser.tokenizer.content_model_flag = :CDATA
    end

    # base/link/meta: appended to the head when inserting into the head,
    # otherwise inserted at the current node and popped again.
    def startTagBaseLinkMeta(name, attributes)
      if inserting_into_head?
        appendToHead(@tree.createElement(name, attributes))
      else
        @tree.insert_element(name, attributes)
        @tree.open_elements.pop
      end
    end

    # Any other start tag leaves the head and is re-dispatched.
    def startTagOther(name, attributes)
      anything_else
      @parser.phase.processStartTag(name, attributes)
    end

    # Pops the head element when it is the current node (otherwise records a
    # parse error) and switches the parser to the :afterHead phase.
    def endTagHead(name)
      if @tree.open_elements.last.name != 'head'
        parse_error("unexpected-end-tag", {"name" => "head"})
      else
        @tree.open_elements.pop
      end
      @parser.phase = @parser.phases[:afterHead]
    end

    # html/body/br end tags leave the head and are re-dispatched.
    def endTagImplyAfterHead(name)
      anything_else
      @parser.phase.processEndTag(name)
    end

    # Pops the current node when it matches the end tag; otherwise records a
    # parse error.
    def endTagTitleStyleScriptNoscript(name)
      return parse_error("unexpected-end-tag", {"name" => name}) unless @tree.open_elements.last.name == name

      @tree.open_elements.pop
    end

    # Any other end tag leaves the head; the tag itself is not re-dispatched.
    def endTagOther(name)
      anything_else
    end

    # Leaves the head: closes it via #endTagHead when it is the current
    # node, otherwise just switches the parser to the :afterHead phase.
    def anything_else
      return endTagHead('head') if @tree.open_elements.last.name == 'head'

      @parser.phase = @parser.phases[:afterHead]
    end

    protected

    # Appends +element+ to the head element, or to the current node when no
    # head pointer is set (asserted to be the inner_html case).
    def appendToHead(element)
      head = @tree.head_pointer
      if head.nil?
        assert @parser.inner_html
        @tree.open_elements.last.appendChild(element)
      else
        head.appendChild(element)
      end
    end

    # True when a head pointer is set and this phase is the parser's current
    # phase.
    def inserting_into_head?
      !@tree.head_pointer.nil? && @parser.phase == @parser.phases[:inHead]
    end

    # Appends +element+ to the head when inserting into the head, otherwise
    # to the current node.
    def attach_element(element)
      if inserting_into_head?
        appendToHead(element)
      else
        @tree.open_elements.last.appendChild(element)
      end
    end

    # Shared body of the title and style handlers: inserts the element,
    # leaves it on the stack of open elements, and sets the tokenizer's
    # content model flag to +content_model+.
    def insert_text_element(name, attributes, content_model)
      if inserting_into_head?
        element = @tree.createElement(name, attributes)
        appendToHead(element)
        @tree.open_elements.push(element)
      else
        @tree.insert_element(name, attributes)
      end
      @parser.tokenizer.content_model_flag = content_model
    end

  end
end