html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,208 @@
1
+ require 'html5/treebuilders/base'
2
+ require 'rexml/document'
3
+ require 'forwardable'
4
+
5
+ module HTML5
6
+ module TreeBuilders
7
+ module REXML
8
+
9
# Base class for the REXML-backed tree builder nodes.
#
# Each node wraps a REXML object (+rxobj+) and mirrors every structural
# change into it. +childNodes+ and +parent+ are not defined here; they
# are presumably supplied by Base::Node -- verify against that class.
class Node < Base::Node
  extend Forwardable
  def_delegators :@rxobj, :name, :attributes
  attr_accessor :rxobj

  # Creates the wrapper and a backing REXML object of the class returned
  # by the subclass's +rxclass+.
  def initialize(name)
    super name
    @rxobj = self.class.rxclass.new name
  end

  # Appends +node+ as the last child. A text node that follows another
  # text node is merged into it rather than stored separately.
  def appendChild(node)
    if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
      childNodes.last.rxobj.value = childNodes.last.rxobj.to_s + node.rxobj.to_s
      childNodes.last.rxobj.raw = true
    else
      childNodes.push node
      rxobj.add node.rxobj
    end
    node.parent = self
  end

  # Detaches +node+ from this node and from the backing REXML object.
  def removeChild(node)
    childNodes.delete node
    rxobj.delete node.rxobj
    node.parent = nil
  end

  # Inserts +data+ as text, before the child +before+ when given,
  # otherwise at the end.
  def insertText(data, before=nil)
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the child +refNode+. A text node
  # landing right after another text node is merged into it.
  #
  # Raises ArgumentError when +refNode+ is not a child of this node
  # (previously this surfaced as a NoMethodError on nil).
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, "reference node is not a child of this node" if index.nil?

    if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
      childNodes[index-1].rxobj.value = childNodes[index-1].rxobj.to_s + node.rxobj.to_s
      childNodes[index-1].rxobj.raw = true
    else
      childNodes.insert index, node
      refNode.rxobj.parent.insert_before(refNode.rxobj, node.rxobj)
    end
    # Keep the parent link in step with appendChild; it was never set here.
    node.parent = self
  end

  # True when this node has at least one child.
  def hasContent
    (childNodes.length > 0)
  end
end
59
+
60
# Element node backed by a ::REXML::Element.
class Element < Node
  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::Element
  end

  def initialize(name)
    super name
  end

  # Returns a new node of the same class with the same name and a copy
  # of the attributes. Children are not copied.
  def cloneNode
    copy = self.class.new name
    attributes.each { |attr_name, attr_value| copy.attributes[attr_name] = attr_value }
    copy
  end

  # Sets every name/value pair of +value+ on the backing REXML element.
  def attributes=(value)
    value.each { |attr_name, attr_value| rxobj.attributes[attr_name] = attr_value }
  end

  # Renders this element, its attributes (except 'xmlns') and its
  # children in the indented text format used by testSerializer.
  def printTree(indent=0)
    output = "\n|#{' ' * indent}<#{name}>"
    inner = indent + 2
    attributes.each do |attr_name, attr_value|
      next if attr_name == 'xmlns'
      output += "\n|#{' ' * inner}#{attr_name}=\"#{attr_value}\""
    end
    childNodes.each { |child| output += child.printTree(inner) }
    output
  end
end
92
+
93
# Document node backed by a ::REXML::Document.
class Document < Node
  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::Document
  end

  # A document has no name, so nil is passed up.
  def initialize
    super nil
  end

  # Appends +node+; an 'html' element first gets the XHTML namespace
  # added to its backing REXML element.
  def appendChild(node)
    is_html_root = node.kind_of?(Element) && node.name == 'html'
    node.rxobj.add_namespace('http://www.w3.org/1999/xhtml') if is_html_root
    super node
  end

  # Renders "#document" followed by every child's printTree output.
  def printTree(indent=0)
    childNodes.inject("#document") do |output, child|
      output + child.printTree(indent + 2)
    end
  end
end
117
+
118
# Doctype node backed by a ::REXML::DocType.
class DocumentType < Node
  # public_id / system_id read through to the REXML doctype.
  def_delegator :@rxobj, :public, :public_id
  def_delegator :@rxobj, :system, :system_id

  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::DocType
  end

  # Builds the backing doctype. The object created by Node#initialize is
  # replaced with one that carries whichever identifiers were supplied.
  def initialize(name, public_id, system_id)
    super(name)
    doctype_source =
      if public_id
        [name, ::REXML::DocType::PUBLIC, public_id, system_id]
      elsif system_id
        [name, ::REXML::DocType::SYSTEM, nil, system_id]
      else
        name
      end
    @rxobj = ::REXML::DocType.new doctype_source
  end

  # Renders the doctype in the indented text format used by testSerializer.
  def printTree(indent=0)
    "\n|%s<!DOCTYPE %s>" % [' ' * indent, name]
  end
end
142
+
143
# Nameless Element used as the root of a parsed fragment.
class DocumentFragment < Element
  def initialize
    super nil
  end

  # Renders only the children; the fragment itself produces no line.
  def printTree(indent=0)
    childNodes.inject("") do |output, child|
      output + child.printTree(indent + 2)
    end
  end
end
156
+
157
# Text node backed by a ::REXML::Text.
class TextNode < Node
  # Replacement table for the three characters escaped before the text
  # is handed to REXML.
  ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }

  # Escapes &, < and > in +data+ and wraps the result in a REXML::Text.
  # Note: does not call super, so Node#initialize is not run.
  def initialize(data)
    escaped = data.gsub(/[&<>]/) { |char| ESCAPES[char] }
    @rxobj = ::REXML::Text.new(escaped, true, nil, true)
  end

  # Renders the text value, quoted, in the testSerializer format.
  def printTree(indent=0)
    "\n|%s\"%s\"" % [' ' * indent, rxobj.value]
  end
end
167
+
168
# Comment node backed by a ::REXML::Comment.
class CommentNode < Node
  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::Comment
  end

  # Renders the comment text in the testSerializer format.
  def printTree(indent=0)
    "\n|%s<!-- %s -->" % [' ' * indent, rxobj.string]
  end
end
177
+
178
# Tree builder that produces a REXML document.
class TreeBuilder < Base::TreeBuilder
  # Registers the REXML-backed node classes.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Creates a doctype node and appends it to the document.
  def insertDoctype(name, public_id, system_id)
    @document.appendChild(@doctypeClass.new(name, public_id, system_id))
  end

  # Text dump of +node+ via its printTree.
  def testSerializer(node)
    node.printTree
  end

  # Returns the underlying REXML object of the document.
  def get_document
    @document.rxobj
  end

  # Replaces @document with the fragment returned by the superclass and
  # returns the children of its REXML object.
  def get_fragment
    @document = super
    @document.rxobj.children
  end
end
205
+
206
+ end
207
+ end
208
+ end
@@ -0,0 +1,185 @@
1
+ require 'html5/treebuilders/base'
2
+
3
+ module HTML5
4
+ module TreeBuilders
5
+ module SimpleTree
6
+
7
# Node representing an item in the simple tree.
#
# +childNodes+ and +parent+ are not defined here; they are presumably
# supplied by Base::Node -- verify against that class.
class Node < Base::Node
  # The tag name associated with the node.
  attr_accessor :name

  # The value of the current node (applies to text nodes and comments).
  attr_accessor :value

  # A hash holding name => value pairs for the attributes of the node.
  attr_accessor :attributes

  def initialize(name)
    super
    @name = name
    @value = nil
    @attributes = {}
  end

  # Appends +node+ as the last child. A text node that follows another
  # text node is merged into it rather than stored separately.
  def appendChild(node)
    if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
      childNodes.last.value += node.value
    else
      childNodes << node
    end
    node.parent = self
  end

  # Detaches +node+ from this node.
  def removeChild(node)
    childNodes.delete node
    node.parent = nil
  end

  # Returns a new node of the same class with the same name, value and
  # a copy of the attributes. Children are not copied.
  def cloneNode
    newNode = self.class.new name
    attributes.each { |attr_name, attr_value| newNode.attributes[attr_name] = attr_value }
    newNode.value = value
    newNode
  end

  # Inserts +data+ as text, before the child +before+ when given,
  # otherwise at the end.
  def insertText(data, before=nil)
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the child +refNode+. A text node
  # landing right after another text node is merged into it.
  #
  # Raises ArgumentError when +refNode+ is not a child of this node
  # (previously this surfaced as a NoMethodError on nil).
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, "reference node is not a child of this node" if index.nil?

    if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
      childNodes[index-1].value += node.value
    else
      childNodes.insert index, node
    end
    # Keep the parent link in step with appendChild; it was never set here.
    node.parent = self
  end

  # Renders this node (via to_s) and its children in the indented text
  # format used by testSerializer.
  def printTree(indent=0)
    tree = "\n|%s%s" % [' ' * indent, self.to_s]
    childNodes.each { |child| tree += child.printTree(indent + 2) }
    tree
  end

  # True when this node has at least one child.
  def hasContent
    childNodes.length > 0
  end
end
77
+
78
# Element node of the simple tree.
class Element < Node
  def to_s
    "<#{name}>"
  end

  # Renders the element, then its attributes and children indented two
  # further columns.
  def printTree(indent=0)
    output = "\n|#{' ' * indent}#{self}"
    inner = indent + 2
    attributes.each do |attr_name, attr_value|
      output += "\n|#{' ' * inner}#{attr_name}=\"#{attr_value}\""
    end
    childNodes.each { |child| output += child.printTree(inner) }
    output
  end
end
95
+
96
# Root document node of the simple tree.
class Document < Node
  def to_s
    "#document"
  end

  # A document has no name, so nil is passed up.
  def initialize
    super nil
  end

  # Renders "#document" followed by every child's printTree output.
  def printTree(indent=0)
    childNodes.inject(to_s) do |output, child|
      output + child.printTree(indent + 2)
    end
  end
end
113
+
114
# Doctype node of the simple tree.
class DocumentType < Node
  attr_accessor :public_id, :system_id

  def to_s
    "<!DOCTYPE #{name}>"
  end

  # +public_id+ and +system_id+ are optional so that existing
  # one-argument callers keep working, while the identifiers can also be
  # supplied up front as with the REXML DocumentType constructor.
  def initialize(name, public_id=nil, system_id=nil)
    super name
    @public_id = public_id
    @system_id = system_id
  end
end
127
+
128
# Nameless Element used as the root of a parsed fragment.
class DocumentFragment < Element
  def initialize
    super nil
  end

  # Renders only the children; the fragment itself produces no line.
  def printTree(indent=0)
    childNodes.inject("") do |output, child|
      output + child.printTree(indent + 2)
    end
  end
end
141
+
142
# Text node of the simple tree; the text is held in +value+.
class TextNode < Node
  def initialize(value)
    super nil
    @value = value
  end

  # The text wrapped in double quotes.
  def to_s
    "\"#{value}\""
  end
end
152
+
153
# Comment node of the simple tree; the comment text is held in +value+.
class CommentNode < Node
  def initialize(value)
    super nil
    @value = value
  end

  # The comment text wrapped in comment delimiters.
  def to_s
    "<!-- #{value} -->"
  end
end
163
+
164
# Tree builder that produces a simple tree.
class TreeBuilder < Base::TreeBuilder
  # Registers the simple-tree node classes.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Text dump of +node+ via its printTree.
  def testSerializer(node)
    node.printTree
  end

  # Replaces @document with the fragment returned by the superclass and
  # returns its child nodes.
  def get_fragment
    fragment = super
    @document = fragment
    fragment.childNodes
  end
end
182
+
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,24 @@
1
module HTML5
  module TreeBuilders

    class << self
      # Looks up a tree builder class by name (case-insensitive; anything
      # responding to to_s is accepted). The implementing file is required
      # on demand so unused back ends are never loaded.
      #
      # Raises RuntimeError for an unrecognised name.
      def [](name)
        key = name.to_s.downcase
        if key == 'simpletree'
          require 'html5/treebuilders/simpletree'
          SimpleTree::TreeBuilder
        elsif key == 'rexml'
          require 'html5/treebuilders/rexml'
          REXML::TreeBuilder
        elsif key == 'hpricot'
          require 'html5/treebuilders/hpricot'
          Hpricot::TreeBuilder
        else
          raise "Unknown TreeBuilder #{name}"
        end
      end

      alias_method :get_tree_builder, :[]
    end
  end
end
@@ -0,0 +1,154 @@
1
+ require 'html5/constants'
2
+ module HTML5
3
+ module TreeWalkers
4
+
5
# Helpers that build the token hashes emitted by tree walkers.
module TokenConstructor
  # Token describing a serialization problem.
  def error(msg)
    {:type => "SerializeError", :data => msg}
  end

  # Converts an attribute collection to an array via to_a.
  def normalize_attrs(attrs)
    attrs.to_a
  end

  # Returns the :EmptyTag token for a void element.
  #
  # When +has_children+ is true and a block is given, the
  # "Void element has children" error token is yielded to it. Previously
  # that token was built and then discarded, so it could never be seen.
  # Callers that pass no block get the same return value as before.
  def empty_tag(name, attrs, has_children=false)
    if has_children && block_given?
      yield error(_("Void element has children"))
    end
    {:type => :EmptyTag, :name => name, :data => normalize_attrs(attrs)}
  end

  # Returns the :StartTag token for an element.
  def start_tag(name, attrs)
    {:type => :StartTag, :name => name, :data => normalize_attrs(attrs)}
  end

  # Returns the :EndTag token for an element.
  def end_tag(name)
    {:type => :EndTag, :name => name, :data => []}
  end

  # Splits +data+ into leading space, characters and trailing space, and
  # yields one token per non-empty part, in that order. Empty +data+
  # yields nothing (it used to yield an empty :Characters token).
  def text(data)
    spaces = SPACE_CHARACTERS.join('')

    if data =~ /\A([#{spaces}]+)/m
      leading = $1
      yield({:type => :SpaceCharacters, :data => leading})
      data = data[leading.length .. -1]
      return if data.empty?
    end

    # \z anchors at the true end of the string; \Z would also match just
    # before a final newline.
    if data =~ /([#{spaces}]+)\z/m
      trailing = $1
      yield({:type => :Characters, :data => data[0 ... -trailing.length]})
      yield({:type => :SpaceCharacters, :data => trailing})
    elsif !data.empty?
      yield({:type => :Characters, :data => data})
    end
  end

  # Returns the :Comment token.
  def comment(data)
    {:type => :Comment, :data => data}
  end

  # Returns the :Doctype token.
  def doctype(name, public_id, system_id, correct=nil)
    {:type => :Doctype, :name => name, :public_id => public_id, :system_id => system_id, :correct => correct}
  end

  # Error token for a node type the walker does not recognise.
  def unknown(nodeType)
    error(_("Unknown node type: ") + nodeType.to_s)
  end

  # Returns +str+ unchanged; presumably a hook for message translation.
  def _(str)
    str
  end
end
58
+
59
# Abstract tree walker: holds the tree and exposes the token helpers.
# Subclasses must implement +each+.
class Base
  include TokenConstructor

  # Stores the tree (or root node) to be walked.
  def initialize(tree)
    @tree = tree
  end

  # Must be overridden; raises NotImplementedError here.
  def each
    raise NotImplementedError
  end

  # +walk+ is an alias of this class's +each+.
  alias_method :walk, :each
end
72
+
73
# Walks a tree iteratively using four navigation hooks that subclasses
# supply: node_details, first_child, next_sibling and parent.
class NonRecursiveTreeWalker < TreeWalkers::Base
  # Must return an array whose first item is the node kind (:DOCTYPE,
  # :TEXT, :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT, nil to
  # ignore the node, or anything else for unknown), followed by the
  # details for that kind.
  def node_details(node)
    raise NotImplementedError
  end

  # Must return the first child of +node+, or nil.
  def first_child(node)
    raise NotImplementedError
  end

  # Must return the node following +node+ under the same parent, or nil.
  def next_sibling(node)
    raise NotImplementedError
  end

  # Must return the parent of +node+, or nil.
  def parent(node)
    raise NotImplementedError
  end

  # Yields the tokens for @tree in document order.
  #
  # Absence of a node is tested with nil? rather than `!= nil`. The
  # latter dispatches to the node's own ==, and REXML::Text compares by
  # string content, so an empty text node compared equal to nil and
  # silently ended the walk.
  def each
    current_node = @tree
    until current_node.nil?
      details = node_details(current_node)
      has_children = false

      case details.shift
      when :DOCTYPE
        yield doctype(*details)

      when :TEXT
        text(*details) {|token| yield token}

      when :ELEMENT
        name, attributes, has_children = details
        if VOID_ELEMENTS.include?(name)
          yield empty_tag(name, attributes.to_a, has_children)
          # Void elements are never descended into.
          has_children = false
        else
          yield start_tag(name, attributes.to_a)
        end

      when :COMMENT
        yield comment(details[0])

      when :DOCUMENT, :DOCUMENT_FRAGMENT
        has_children = true

      when nil
        # ignore (REXML::XMLDecl is an example)

      else
        yield unknown(details[0])
      end

      child = has_children ? first_child(current_node) : nil
      if !child.nil?
        current_node = child
      else
        # No children to descend into: close this node, then move to its
        # next sibling, closing ancestors on the way up until one has a
        # sibling or the root is reached.
        until current_node.nil?
          kind, name = node_details(current_node)
          if kind == :ELEMENT
            yield end_tag(name) if !VOID_ELEMENTS.include?(name)
          end

          if @tree == current_node
            current_node = nil
          else
            sibling = next_sibling(current_node)
            if !sibling.nil?
              current_node = sibling
              break
            end

            current_node = parent(current_node)
          end
        end
      end
    end
  end
end
152
+
153
+ end
154
+ end
@@ -0,0 +1,48 @@
1
+ require 'html5/treewalkers/base'
2
+ require 'rexml/document'
3
+
4
+ module HTML5
5
+ module TreeWalkers
6
+ module Hpricot
7
# Tree walker over Hpricot nodes.
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

  # Maps an Hpricot node to the [kind, *details] array expected by
  # NonRecursiveTreeWalker#each. An Elem with an empty name is reported
  # as a document fragment.
  def node_details(node)
    case node
    when ::Hpricot::Elem
      return [:DOCUMENT_FRAGMENT] if node.name.empty?

      tag = node.name
      attrs = node.attributes.map { |attr_name, attr_value| [attr_name, attr_value] }
      [:ELEMENT, tag, attrs, !node.empty?]
    when ::Hpricot::Text    then [:TEXT, node.content]
    when ::Hpricot::Comment then [:COMMENT, node.content]
    when ::Hpricot::Doc     then [:DOCUMENT]
    when ::Hpricot::DocType then [:DOCTYPE, node.target, node.public_id, node.system_id]
    when ::Hpricot::XMLDecl then [nil]
    else [:UNKNOWN, node.class.inspect]
    end
  end

  # First entry of the node's children.
  def first_child(node)
    node.children.first
  end

  # The node's next_node.
  def next_sibling(node)
    node.next_node
  end

  # The node's parent.
  def parent(node)
    node.parent
  end
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,48 @@
1
+ require 'html5/treewalkers/base'
2
+ require 'rexml/document'
3
+
4
+ module HTML5
5
+ module TreeWalkers
6
+ module REXML
7
# Tree walker over REXML nodes.
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

  # Maps a REXML node to the [kind, *details] array expected by
  # NonRecursiveTreeWalker#each. An Element without a name is reported
  # as a document fragment.
  def node_details(node)
    case node
    when ::REXML::Document
      [:DOCUMENT]
    when ::REXML::Element
      if !node.name
        [:DOCUMENT_FRAGMENT]
      else
        # Any child counts, not only elements and text. The previous
        # has_elements? || has_text? test reported "no children" for an
        # element holding only comments, so those comments were never
        # walked.
        [:ELEMENT, node.name,
          node.attributes.map {|name,value| [name,value]},
          !node.children.empty?]
      end
    when ::REXML::Text
      [:TEXT, node.value]
    when ::REXML::Comment
      [:COMMENT, node.string]
    when ::REXML::DocType
      [:DOCTYPE, node.name, node.public, node.system]
    when ::REXML::XMLDecl
      [nil]
    else
      [:UNKNOWN, node.class.inspect]
    end
  end

  # First entry of the node's children.
  def first_child(node)
    node.children.first
  end

  # The node's next_sibling.
  def next_sibling(node)
    node.next_sibling
  end

  # The node's parent.
  def parent(node)
    node.parent
  end
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,48 @@
1
+ require 'html5/treewalkers/base'
2
+
3
+ module HTML5
4
+ module TreeWalkers
5
+ module SimpleTree
6
# Recursive tree walker over simple-tree nodes.
class TreeWalker < HTML5::TreeWalkers::Base
  include HTML5::TreeBuilders::SimpleTree

  # Yields the tokens for +node+ and, for non-void elements, for its
  # descendants. Document and DocumentFragment nodes yield nothing here;
  # DocumentFragment must be tested before Element because it is a
  # subclass of it.
  def walk(node)
    case node
    when Document, DocumentFragment
      return

    when DocumentType
      yield doctype(node.name, node.public_id, node.system_id)

    when TextNode
      text(node.value) {|token| yield token}

    when Element
      if VOID_ELEMENTS.include?(node.name)
        yield empty_tag(node.name, node.attributes, node.hasContent())
      else
        yield start_tag(node.name, node.attributes)
        node.childNodes.each do |child|
          walk(child) {|token| yield token}
        end
        yield end_tag(node.name)
      end

    when CommentNode
      yield comment(node.value)

    else
      # The unknown-node error token is the only signal; the stray debug
      # `puts '?'` that also wrote to stdout has been removed.
      yield unknown(node.class)
    end
  end

  # Yields the tokens for every child of the tree root.
  def each
    @tree.childNodes.each do |child|
      walk(child) {|node| yield node}
    end
  end
end
46
+ end
47
+ end
48
+ end