html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,208 @@
1
+ require 'html5/treebuilders/base'
2
+ require 'rexml/document'
3
+ require 'forwardable'
4
+
5
+ module HTML5
6
+ module TreeBuilders
7
+ module REXML
8
+
9
# Common behaviour for every node produced by the REXML tree builder.
#
# Each node wraps a REXML object (+rxobj+) and also keeps the +childNodes+
# list inherited from Base::Node, so every structural change has to be
# applied to both.
class Node < Base::Node
  extend Forwardable
  def_delegators :@rxobj, :name, :attributes
  attr_accessor :rxobj

  # Creates the wrapped REXML object by instantiating the class returned
  # from +self.class.rxclass+ with +name+.
  def initialize name
    super name
    @rxobj = self.class.rxclass.new name
  end

  # Adds +node+ as the last child.
  #
  # When +node+ and the current last child are both TextNodes the text is
  # folded into the existing child instead of adding a second text node.
  # In either case +node.parent+ is set to this node.
  def appendChild node
    if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
      merge_text_into childNodes.last, node
    else
      childNodes.push node
      rxobj.add node.rxobj
    end
    node.parent = self
  end

  # Detaches +node+ from both the child list and the REXML tree and clears
  # its parent.
  def removeChild node
    childNodes.delete node
    rxobj.delete node.rxobj
    node.parent = nil
  end

  # Inserts +data+ as text, in front of +before+ when given, otherwise at
  # the end of the child list.
  def insertText data, before=nil
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the existing child +refNode+.
  #
  # A TextNode inserted right after another TextNode is merged into it.
  # As in appendChild, +node.parent+ is set to this node afterwards; the
  # original implementation left the parent untouched here.
  def insertBefore node, refNode
    index = childNodes.index(refNode)
    if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
      merge_text_into childNodes[index-1], node
    else
      childNodes.insert index, node
      refNode.rxobj.parent.insert_before(refNode.rxobj, node.rxobj)
    end
    node.parent = self
  end

  # True when this node has at least one child.
  def hasContent
    (childNodes.length > 0)
  end

  private

  # Appends the text of +extra+ to the REXML text object held by +target+.
  # The concatenation uses the +to_s+ form of both text objects, and the
  # raw flag is switched back on afterwards (presumably because assigning
  # +value+ resets it -- confirm against REXML::Text#value=).
  def merge_text_into target, extra
    target.rxobj.value = target.rxobj.to_s + extra.rxobj.to_s
    target.rxobj.raw = true
  end
end
59
+
60
# Element node backed by a REXML::Element.
class Element < Node
  # REXML class that Node#initialize instantiates for elements.
  def self.rxclass
    ::REXML::Element
  end

  def initialize name
    super(name)
  end

  # Returns a new node of the same class with the same name and a copy of
  # every attribute. Children are not copied.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |attr_name, attr_value| copy.attributes[attr_name] = attr_value }
    copy
  end

  # Copies every name/value pair of +value+ onto the wrapped REXML element.
  def attributes= value
    value.each { |attr_name, attr_value| rxobj.attributes[attr_name] = attr_value }
  end

  # Renders this element, its attributes (except +xmlns+) and its children
  # in the tree-dump format, each entry on its own "\n|" prefixed line.
  def printTree indent=0
    pieces = ["\n|#{' ' * indent}<#{name}>"]
    inner = indent + 2
    attributes.each do |attr_name, attr_value|
      next if attr_name == 'xmlns'
      pieces << "\n|#{' ' * inner}#{attr_name}=\"#{attr_value}\""
    end
    childNodes.each { |child| pieces << child.printTree(inner) }
    pieces.join
  end
end
92
+
93
# Document node backed by a REXML::Document.
class Document < Node
  # REXML class that Node#initialize instantiates for documents.
  def self.rxclass
    ::REXML::Document
  end

  def initialize
    super(nil)
  end

  # Appends +node+; an Element named 'html' first receives the XHTML
  # namespace on its REXML object.
  def appendChild node
    html_root = node.kind_of?(Element) && node.name == 'html'
    node.rxobj.add_namespace('http://www.w3.org/1999/xhtml') if html_root
    super(node)
  end

  # Renders "#document" followed by the dump of every child, indented two
  # columns deeper than +indent+.
  def printTree indent=0
    childNodes.inject("#document") do |tree, child|
      tree + child.printTree(indent + 2)
    end
  end
end
117
+
118
# Doctype node backed by a REXML::DocType.
class DocumentType < Node
  # public_id / system_id are read straight from the REXML doctype.
  def_delegator :@rxobj, :public, :public_id
  def_delegator :@rxobj, :system, :system_id

  # REXML class that Node#initialize instantiates for doctypes.
  def self.rxclass
    ::REXML::DocType
  end

  # Builds the REXML doctype. A public id selects the PUBLIC form, a
  # system id alone selects the SYSTEM form, and with neither only the
  # name is passed on.
  def initialize name, public_id, system_id
    super(name)
    doctype_source =
      if public_id
        [name, ::REXML::DocType::PUBLIC, public_id, system_id]
      elsif system_id
        [name, ::REXML::DocType::SYSTEM, nil, system_id]
      else
        name
      end
    @rxobj = ::REXML::DocType.new(doctype_source)
  end

  # Renders the doctype in the tree-dump format (name only).
  def printTree indent=0
    padding = ' ' * indent
    "\n|#{padding}<!DOCTYPE #{name}>"
  end
end
142
+
143
# Nameless element used as the root of a parsed fragment.
class DocumentFragment < Element
  def initialize
    super(nil)
  end

  # Renders only the children (indented two columns deeper than +indent+);
  # the fragment itself contributes no line to the dump.
  def printTree indent=0
    childNodes.inject("") do |tree, child|
      tree + child.printTree(indent+2)
    end
  end
end
156
+
157
# Text node backed by a REXML::Text.
class TextNode < Node
  # Escapes &, < and > in +data+ (in that order) and wraps the result in a
  # REXML::Text. Node#initialize is deliberately not invoked here; this
  # class defines no +rxclass+.
  def initialize data
    replacements = [['&', '&amp;'], ['<', '&lt;'], ['>', '&gt;']]
    escaped = replacements.inject(data) do |text, (char, entity)|
      text.gsub(char, entity)
    end
    @rxobj = ::REXML::Text.new(escaped, true, nil, true)
  end

  # Renders the text value, quoted, in the tree-dump format.
  def printTree indent=0
    padding = ' ' * indent
    "\n|#{padding}\"#{rxobj.value}\""
  end
end
167
+
168
# Comment node backed by a REXML::Comment.
class CommentNode < Node
  # REXML class that Node#initialize instantiates for comments.
  def self.rxclass
    ::REXML::Comment
  end

  # Renders the comment text in the tree-dump format.
  def printTree indent=0
    padding = ' ' * indent
    "\n|#{padding}<!-- #{rxobj.string} -->"
  end
end
177
+
178
# Tree builder that produces REXML-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes of this module for each kind of node.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Creates a doctype node from the three identifiers and appends it to
  # the document.
  def insertDoctype(name, public_id, system_id)
    @document.appendChild(@doctypeClass.new(name, public_id, system_id))
  end

  # Returns the tree-dump string for +node+.
  def testSerializer node
    node.printTree
  end

  # Returns the underlying REXML object of the built document.
  def get_document
    @document.rxobj
  end

  # Stores the fragment returned by the superclass as the current document
  # and returns the children of its REXML object.
  def get_fragment
    @document = super
    @document.rxobj.children
  end
end
205
+
206
+ end
207
+ end
208
+ end
@@ -0,0 +1,185 @@
1
+ require 'html5/treebuilders/base'
2
+
3
+ module HTML5
4
+ module TreeBuilders
5
+ module SimpleTree
6
+
7
# Node representing an item in the tree.
class Node < Base::Node
  # The tag name associated with the node
  attr_accessor :name

  # The value of the current node (applies to text nodes and comments)
  attr_accessor :value

  # A hash holding name, value pairs for attributes of the node
  attr_accessor :attributes

  def initialize name
    super
    @name = name
    @value = nil
    @attributes = {}
  end

  # Adds +node+ as the last child. A TextNode appended after another
  # TextNode is merged into it instead of being stored separately.
  # +node.parent+ is set to this node in both cases.
  def appendChild node
    if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
      childNodes.last.value += node.value
    else
      childNodes << node
    end
    node.parent = self
  end

  # Removes +node+ from the child list and clears its parent.
  def removeChild node
    childNodes.delete node
    node.parent = nil
  end

  # Returns a new node of the same class carrying the same name, attribute
  # pairs and value. Children are not copied.
  def cloneNode
    copy = self.class.new name
    attributes.each { |attr_name, attr_value| copy.attributes[attr_name] = attr_value }
    copy.value = value
    copy
  end

  # Inserts +data+ as text, in front of +before+ when given, otherwise at
  # the end of the child list.
  def insertText data, before=nil
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the existing child +refNode+.
  #
  # A TextNode inserted right after another TextNode is merged into it.
  # As in appendChild, +node.parent+ is set to this node afterwards; the
  # original implementation left the parent untouched here.
  def insertBefore node, refNode
    index = childNodes.index(refNode)
    if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
      childNodes[index-1].value += node.value
    else
      childNodes.insert index, node
    end
    node.parent = self
  end

  # Renders this node (via +to_s+) and all of its children in the
  # tree-dump format, children indented two columns deeper.
  def printTree indent=0
    tree = "\n|%s%s" % [' '* indent, self.to_s]
    childNodes.each { |child| tree += child.printTree(indent + 2) }
    tree
  end

  # True when this node has at least one child.
  def hasContent
    childNodes.length > 0
  end
end
77
+
78
# Element node of the simple tree.
class Element < Node
  # Tag name in angle brackets, as used in the tree dump.
  def to_s
    "<#{name}>"
  end

  # Renders the element line, one line per attribute and then every
  # child; attributes and children are indented two columns deeper.
  def printTree indent=0
    inner = indent + 2
    tree = "\n|%s%s" % [' '* indent, self.to_s]
    attributes.each do |attr_name, attr_value|
      tree += "\n|%s%s=\"%s\"" % [' ' * inner, attr_name, attr_value]
    end
    childNodes.inject(tree) { |dump, child| dump + child.printTree(inner) }
  end
end
95
+
96
# Root node of a simple tree document.
class Document < Node
  def initialize
    super(nil)
  end

  # Fixed label used for the document in the tree dump.
  def to_s
    "#document"
  end

  # Renders the document label followed by every child, indented two
  # columns deeper than +indent+.
  def printTree indent=0
    childNodes.inject(to_s) do |tree, child|
      tree + child.printTree(indent + 2)
    end
  end
end
113
+
114
# Doctype node of the simple tree.
class DocumentType < Node
  # Public and system identifiers; both start out as nil.
  attr_accessor :public_id, :system_id

  def initialize name
    super(name)
    @public_id = @system_id = nil
  end

  # Doctype declaration (name only), as used in the tree dump.
  def to_s
    "<!DOCTYPE #{name}>"
  end
end
127
+
128
# Nameless element used as the root of a parsed fragment.
class DocumentFragment < Element
  def initialize
    super(nil)
  end

  # Renders only the children (indented two columns deeper than +indent+);
  # the fragment itself contributes no line to the dump.
  def printTree indent=0
    childNodes.inject("") do |tree, child|
      tree + child.printTree(indent+2)
    end
  end
end
141
+
142
# Text node of the simple tree; the text is kept in +value+.
class TextNode < Node
  def initialize value
    super(nil)
    @value = value
  end

  # The text wrapped in double quotes, as used in the tree dump.
  def to_s
    quoted = '"%s"' % value
    quoted
  end
end
152
+
153
# Comment node of the simple tree; the comment text is kept in +value+.
class CommentNode < Node
  def initialize value
    super(nil)
    @value = value
  end

  # The comment in markup form, as used in the tree dump.
  def to_s
    markup = "<!-- %s -->" % value
    markup
  end
end
163
+
164
# Tree builder that produces simple tree nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes of this module for each kind of node.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Returns the tree-dump string for +node+.
  def testSerializer node
    node.printTree
  end

  # Stores the fragment returned by the superclass as the current document
  # and returns its child nodes.
  def get_fragment
    fragment = super
    @document = fragment
    fragment.childNodes
  end
end
182
+
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,24 @@
1
+ module HTML5
2
+ module TreeBuilders
3
+
4
class << self
  # Returns the TreeBuilder class registered under +name+.
  #
  # +name+ is converted with +to_s+ and matched case-insensitively against
  # 'simpletree', 'rexml' and 'hpricot'. The implementation file is only
  # required when its builder is requested. Any other name raises a
  # RuntimeError.
  def [](name)
    key = name.to_s.downcase
    if key == 'simpletree'
      require 'html5/treebuilders/simpletree'
      SimpleTree::TreeBuilder
    elsif key == 'rexml'
      require 'html5/treebuilders/rexml'
      REXML::TreeBuilder
    elsif key == 'hpricot'
      require 'html5/treebuilders/hpricot'
      Hpricot::TreeBuilder
    else
      raise "Unknown TreeBuilder #{name}"
    end
  end

  alias_method :get_tree_builder, :[]
end
23
+ end
24
+ end
@@ -0,0 +1,154 @@
1
+ require 'html5/constants'
2
+ module HTML5
3
+ module TreeWalkers
4
+
5
# Factory methods for the token hashes emitted by the tree walkers.
# Every token is a Hash with a :type key plus type-specific keys.
module TokenConstructor
  # Error token carrying +msg+. Note its :type is the String
  # "SerializeError", not a Symbol.
  def error(msg)
    {
      :type => "SerializeError",
      :data => msg
    }
  end

  # Converts an attribute collection to an array via +to_a+.
  def normalize_attrs(attrs)
    attrs.to_a
  end

  # Token for a void element. When +has_children+ is true an error token
  # is built, but it is neither returned nor yielded; only the :EmptyTag
  # token is returned.
  def empty_tag(name, attrs, has_children=false)
    if has_children
      error(_("Void element has children"))
    end
    {
      :type => :EmptyTag,
      :name => name,
      :data => normalize_attrs(attrs)
    }
  end

  # Token for an opening tag with its attributes.
  def start_tag(name, attrs)
    {
      :type => :StartTag,
      :name => name,
      :data => normalize_attrs(attrs)
    }
  end

  # Token for a closing tag; :data is always an empty array.
  def end_tag(name)
    {
      :type => :EndTag,
      :name => name,
      :data => []
    }
  end

  # Yields the tokens for a run of text: leading whitespace (if any) as
  # :SpaceCharacters, the remainder as :Characters, and trailing
  # whitespace (if any) as a second :SpaceCharacters token. Text that is
  # whitespace only yields a single :SpaceCharacters token.
  def text(data)
    if data =~ /\A([#{SPACE_CHARACTERS.join('')}]+)/m
      leading = Regexp.last_match(1)
      yield({:type => :SpaceCharacters, :data => leading})
      data = data[leading.length .. -1]
      return if data.empty?
    end

    unless data =~ /([#{SPACE_CHARACTERS.join('')}]+)\Z/m
      return yield({:type => :Characters, :data => data})
    end

    trailing = Regexp.last_match(1)
    yield({:type => :Characters, :data => data[0 ... -trailing.length]})
    yield({:type => :SpaceCharacters, :data => trailing})
  end

  # Token for a comment.
  def comment(data)
    {
      :type => :Comment,
      :data => data
    }
  end

  # Token for a doctype declaration.
  def doctype(name, public_id, system_id, correct=nil)
    {
      :type      => :Doctype,
      :name      => name,
      :public_id => public_id,
      :system_id => system_id,
      :correct   => correct
    }
  end

  # Error token reporting a node type the walker does not know.
  def unknown(nodeType)
    error(_("Unknown node type: ") + nodeType.to_s)
  end

  # Message hook; currently returns +str+ unchanged.
  def _(str)
    str
  end
end
58
+
59
# Abstract tree walker: holds the tree to walk and mixes in the token
# factory methods. Subclasses provide +each+.
class Base
  include TokenConstructor

  # +tree+ is stored as the root the walker operates on.
  def initialize(tree)
    @tree = tree
  end

  # Must be overridden; this implementation always raises
  # NotImplementedError.
  def each
    raise NotImplementedError
  end

  # +walk+ is bound to the +each+ defined above.
  alias_method :walk, :each
end
72
+
73
# Tree walker that traverses the tree iteratively, in document order,
# using only four navigation primitives supplied by subclasses.
class NonRecursiveTreeWalker < TreeWalkers::Base
  # Must return an Array describing +node+. Its first element selects the
  # handling in +each+: :DOCTYPE, :TEXT, :ELEMENT, :COMMENT, :DOCUMENT,
  # :DOCUMENT_FRAGMENT, nil (node is ignored) or anything else (reported
  # as unknown). The remaining elements are the type-specific details;
  # for :ELEMENT they are name, attributes and a has-children flag.
  def node_details(node)
    raise NotImplementedError
  end

  # Must return the first child of +node+, or nil.
  def first_child(node)
    raise NotImplementedError
  end

  # Must return the node following +node+ under the same parent, or nil.
  def next_sibling(node)
    raise NotImplementedError
  end

  # Must return the parent of +node+.
  def parent(node)
    raise NotImplementedError
  end

  # Yields one token per event while walking the tree from @tree.
  #
  # Presence of a node is tested with +nil?+ rather than +!= nil+. The
  # latter dispatches to the node's own +==+, and node classes that
  # compare by content (REXML::Text mixes in Comparable and compares via
  # +to_s+) make an empty text node equal to nil, which ended the walk or
  # skipped siblings.
  def each
    current_node = @tree
    until current_node.nil?
      details = node_details(current_node)
      has_children = false

      case details.shift
      when :DOCTYPE
        yield doctype(*details)

      when :TEXT
        text(*details) {|token| yield token}

      when :ELEMENT
        name, attributes, has_children = details
        if VOID_ELEMENTS.include?(name)
          yield empty_tag(name, attributes.to_a, has_children)
          # Children of a void element are never visited.
          has_children = false
        else
          yield start_tag(name, attributes.to_a)
        end

      when :COMMENT
        yield comment(details[0])

      when :DOCUMENT, :DOCUMENT_FRAGMENT
        has_children = true

      when nil
        # ignore (REXML::XMLDecl is an example)

      else
        yield unknown(details[0])
      end

      # Descend when possible.
      child = has_children ? first_child(current_node) : nil
      unless child.nil?
        current_node = child
        next
      end

      # Otherwise close the current node and move to its next sibling,
      # climbing (and closing ancestors) until one is found or the root
      # of the walk is reached.
      until current_node.nil?
        details = node_details(current_node)
        if details.shift == :ELEMENT
          name = details.first
          yield end_tag(name) unless VOID_ELEMENTS.include?(name)
        end

        if @tree == current_node
          current_node = nil
        else
          sibling = next_sibling(current_node)
          unless sibling.nil?
            current_node = sibling
            break
          end

          current_node = parent(current_node)
        end
      end
    end
  end
end
152
+
153
+ end
154
+ end
@@ -0,0 +1,48 @@
1
+ require 'html5/treewalkers/base'
2
+ require 'rexml/document'
3
+
4
+ module HTML5
5
+ module TreeWalkers
6
+ module Hpricot
7
# Tree walker for Hpricot trees.
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

  # Describes +node+ as an Array whose first element is the node kind.
  # An Hpricot element with an empty name is reported as a document
  # fragment; classes not listed here are reported as :UNKNOWN together
  # with their class name.
  def node_details(node)
    if ::Hpricot::Elem === node
      return [:DOCUMENT_FRAGMENT] if node.name.empty?

      attribute_pairs = node.attributes.map {|attr_name, attr_value| [attr_name, attr_value]}
      [:ELEMENT, node.name, attribute_pairs, !node.empty?]
    elsif ::Hpricot::Text === node
      [:TEXT, node.content]
    elsif ::Hpricot::Comment === node
      [:COMMENT, node.content]
    elsif ::Hpricot::Doc === node
      [:DOCUMENT]
    elsif ::Hpricot::DocType === node
      [:DOCTYPE, node.target, node.public_id, node.system_id]
    elsif ::Hpricot::XMLDecl === node
      [nil]
    else
      [:UNKNOWN, node.class.inspect]
    end
  end

  # First entry of the node's children.
  def first_child(node)
    kids = node.children
    kids.first
  end

  # Node following +node+, as reported by Hpricot's +next_node+.
  def next_sibling(node)
    node.next_node
  end

  # Parent of +node+.
  def parent(node)
    node.parent
  end
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,48 @@
1
+ require 'html5/treewalkers/base'
2
+ require 'rexml/document'
3
+
4
+ module HTML5
5
+ module TreeWalkers
6
+ module REXML
7
# Tree walker for REXML trees.
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

  # Describes +node+ as an Array whose first element is the node kind.
  # The document check comes before the element check, and an element
  # without a name is reported as a document fragment. Classes not listed
  # here are reported as :UNKNOWN together with their class name.
  def node_details(node)
    if ::REXML::Document === node
      [:DOCUMENT]
    elsif ::REXML::Element === node
      return [:DOCUMENT_FRAGMENT] if !node.name

      attribute_pairs = node.attributes.map {|attr_name, attr_value| [attr_name, attr_value]}
      [:ELEMENT, node.name, attribute_pairs, node.has_elements? || node.has_text?]
    elsif ::REXML::Text === node
      [:TEXT, node.value]
    elsif ::REXML::Comment === node
      [:COMMENT, node.string]
    elsif ::REXML::DocType === node
      [:DOCTYPE, node.name, node.public, node.system]
    elsif ::REXML::XMLDecl === node
      [nil]
    else
      [:UNKNOWN, node.class.inspect]
    end
  end

  # First entry of the node's children.
  def first_child(node)
    kids = node.children
    kids.first
  end

  # Node following +node+, as reported by REXML's +next_sibling+.
  def next_sibling(node)
    node.next_sibling
  end

  # Parent of +node+.
  def parent(node)
    node.parent
  end
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,48 @@
1
+ require 'html5/treewalkers/base'
2
+
3
+ module HTML5
4
+ module TreeWalkers
5
+ module SimpleTree
6
# Recursive tree walker for simple tree nodes.
class TreeWalker < HTML5::TreeWalkers::Base
  include HTML5::TreeBuilders::SimpleTree

  # Yields the tokens for +node+ and, for non-void elements, for all of
  # its descendants.
  #
  # Document and DocumentFragment nodes produce nothing here (+each+
  # iterates over the root's children instead); DocumentFragment is
  # matched before Element because it is an Element subclass. Any other
  # unrecognised node yields an "unknown node type" error token. The
  # original also printed '?' to stdout in that case; that debug output
  # has been removed.
  def walk(node)
    case node
    when Document, DocumentFragment
      return

    when DocumentType
      yield doctype(node.name, node.public_id, node.system_id)

    when TextNode
      text(node.value) {|token| yield token}

    when Element
      if VOID_ELEMENTS.include?(node.name)
        # Children of a void element are not walked.
        yield empty_tag(node.name, node.attributes, node.hasContent())
      else
        yield start_tag(node.name, node.attributes)
        node.childNodes.each do |child|
          walk(child) {|token| yield token}
        end
        yield end_tag(node.name)
      end

    when CommentNode
      yield comment(node.value)

    else
      yield unknown(node.class)
    end
  end

  # Yields every token of the tree by walking each child of the root.
  def each
    @tree.childNodes.each do |child|
      walk(child) {|token| yield token}
    end
  end
end
46
+ end
47
+ end
48
+ end