spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,215 @@
1
+ require 'html5/treebuilders/base'
2
+ require 'rexml/document'
3
+ require 'forwardable'
4
+
5
+ module HTML5
6
+ module TreeBuilders
7
+ module REXML
8
+
9
# Common behaviour for tree-builder nodes backed by a REXML object.
#
# Every node keeps two views in step: the builder-level +childNodes+ list
# (presumably provided by Base::Node -- confirm there) and the wrapped REXML
# object exposed as +rxobj+. +name+ and +attributes+ are forwarded to +rxobj+.
class Node < Base::Node
  extend Forwardable
  def_delegators :@rxobj, :name, :attributes
  attr_accessor :rxobj

  # Builds the wrapped REXML object from the subclass-supplied +rxclass+.
  def initialize name
    super name
    @rxobj = self.class.rxclass.new name
  end

  # Appends +node+ as the last child. A text node appended directly after
  # another text node is merged into it instead of being added separately.
  # In both cases +node.parent+ is set to this node.
  def appendChild node
    if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
      last_text = childNodes.last.rxobj
      last_text.value = last_text.to_s + node.rxobj.to_s
      # The concatenated value is already escaped, so mark it raw.
      last_text.raw = true
    else
      childNodes.push node
      rxobj.add node.rxobj
    end
    node.parent = self
  end

  # Detaches +node+ from both the child list and the REXML tree.
  def removeChild node
    childNodes.delete node
    rxobj.delete node.rxobj
    node.parent = nil
  end

  # Inserts +data+ as text, before the child +before+ when given,
  # otherwise at the end.
  def insertText data, before=nil
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the existing child +refNode+.
  # A text node inserted right after another text node is merged into it.
  #
  # Fix: the inserted node's parent is now recorded, matching appendChild;
  # previously nodes inserted this way kept a stale (or nil) parent.
  def insertBefore node, refNode
    index = childNodes.index(refNode)
    if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
      previous_text = childNodes[index-1].rxobj
      previous_text.value = previous_text.to_s + node.rxobj.to_s
      previous_text.raw = true
    else
      childNodes.insert index, node
      refNode.rxobj.parent.insert_before(refNode.rxobj, node.rxobj)
    end
    node.parent = self
  end

  # True when the node has at least one child.
  def hasContent
    (childNodes.length > 0)
  end
end
59
+
60
# Element node wrapping a ::REXML::Element, with an optional namespace tag.
class Element < Node
  attr_reader :namespace

  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::Element
  end

  def initialize name, namespace=nil
    super name
    @namespace = namespace
  end

  # Returns a childless copy carrying the same name, attributes and
  # namespace.
  #
  # Fix: the namespace is now passed to the copy; it was previously dropped,
  # so clones of namespaced elements silently lost their namespace.
  def cloneNode
    copy = self.class.new(name, namespace)
    attributes.each { |attr_name, attr_value| copy.attributes[attr_name] = attr_value }
    copy
  end

  # Copies every name/value pair from +value+ onto the wrapped element.
  def attributes= value
    value.each { |attr_name, attr_value| rxobj.attributes[attr_name] = attr_value }
  end

  # Renders this element, its attributes (except 'xmlns') and its subtree
  # in the indented test-serialisation format.
  def printTree indent=0
    tree = "\n|#{' ' * indent}<#{namespace ? namespace.to_s + ' ' : ''}#{name}>"
    indent += 2
    attributes.each do |attr_name, attr_value|
      next if attr_name == 'xmlns'
      tree += "\n|#{' ' * indent}#{attr_name}=\"#{attr_value}\""
    end
    childNodes.each do |child|
      tree += child.printTree(indent)
    end
    tree
  end
end
94
+
95
# Root node of a parsed tree; wraps a ::REXML::Document.
class Document < Node
  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::Document
  end

  # A document has no name, so nil is handed to Node#initialize.
  def initialize
    super(nil)
  end

  # ryansking: not sure why this was here. removing it doesn't cause any tests to fail
  # def appendChild node
  #   if node.kind_of? Element and node.name == 'html'
  #     node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
  #   end
  #   super node
  # end

  # Renders "#document" followed by every child's printTree output, with
  # the children indented two columns deeper than +indent+.
  def printTree indent=0
    childNodes.inject("#document") do |rendered, child|
      rendered + child.printTree(indent + 2)
    end
  end
end
120
+
121
# Doctype node wrapping a ::REXML::DocType. +public_id+ and +system_id+
# are read back from the wrapped object via its +public+ / +system+ methods.
class DocumentType < Node
  def_delegator :@rxobj, :public, :public_id

  def_delegator :@rxobj, :system, :system_id

  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::DocType
  end

  # Builds the doctype from +name+ and the optional identifiers.
  #
  # REXML::DocType's array form is [name, external_id, long_name, uri].
  # For PUBLIC doctypes the public id lives in long_name and the system id
  # in uri. For SYSTEM doctypes REXML keeps the system id in long_name
  # (that is the slot DocType#system reads), so it must be passed third.
  #
  # Fix: the SYSTEM-only case used to pass [name, SYSTEM, nil, system_id],
  # which made +system_id+ read back as nil and printTree omit it.
  def initialize name, public_id, system_id
    super(name)
    spec =
      if public_id
        [name, ::REXML::DocType::PUBLIC, public_id, system_id]
      elsif system_id
        [name, ::REXML::DocType::SYSTEM, system_id]
      else
        name
      end
    @rxobj = ::REXML::DocType.new spec
  end

  # Renders the doctype line; both identifiers are printed (quoted) as soon
  # as either of them is present.
  def printTree indent=0
    ids = [public_id, system_id].any? ? " \"#{public_id}\" \"#{system_id}\"" : ""
    "\n|#{' ' * indent}<!DOCTYPE #{name}" + ids + ">"
  end
end
145
+
146
# Nameless container element used when building a fragment.
class DocumentFragment < Element
  # Fragments carry no tag name, so nil is passed up the chain.
  def initialize
    super(nil)
  end

  # Concatenates the printTree output of every child (indented two columns
  # deeper than +indent+); the fragment itself contributes no line.
  def printTree indent=0
    childNodes.inject("") do |rendered, child|
      rendered + child.printTree(indent+2)
    end
  end
end
159
+
160
# Text node wrapping a ::REXML::Text.
#
# Note: this initializer deliberately does not call super; it builds the
# REXML text object itself from the escaped data.
class TextNode < Node
  # Escapes '&', '<' and '>' in +data+ and stores it as raw REXML text.
  # On Rubies where String responds to #encode the escaped copy is tagged
  # as UTF-8 first.
  def initialize data
    escaped = data.gsub('&', '&amp;')
    escaped = escaped.gsub('<', '&lt;')
    escaped = escaped.gsub('>', '&gt;')
    escaped = escaped.force_encoding('UTF-8') if String.method_defined?(:encode)
    @rxobj = ::REXML::Text.new(escaped, true, nil, true)
  end

  # Renders the (unescaped) text value, quoted, on its own indented line.
  def printTree indent=0
    padding = ' ' * indent
    "\n|#{padding}\"#{rxobj.value}\""
  end
end
174
+
175
# Comment node wrapping a ::REXML::Comment.
class CommentNode < Node
  # REXML class instantiated by Node#initialize for this node type.
  def self.rxclass
    ::REXML::Comment
  end

  # Renders the comment text between "<!-- " and " -->" on an indented line.
  def printTree indent=0
    padding = ' ' * indent
    "\n|#{padding}<!-- #{rxobj.string} -->"
  end
end
184
+
185
# Tree builder that produces REXML-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes this builder instantiates.
  def initialize
    @documentClass = Document
    @fragmentClass = DocumentFragment
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
  end

  # Creates a doctype node and appends it to the current document.
  def insertDoctype(name, public_id, system_id)
    @document.appendChild(@doctypeClass.new(name, public_id, system_id))
  end

  # Serialises +node+ in the indented format produced by printTree.
  def testSerializer node
    node.printTree
  end

  # Returns the underlying REXML object of the document node.
  def get_document
    @document.rxobj
  end

  # Replaces @document with the fragment returned by the parent class and
  # returns the REXML children of that fragment.
  def get_fragment
    (@document = super).rxobj.children
  end
end
212
+
213
+ end
214
+ end
215
+ end
@@ -0,0 +1,191 @@
1
+ require 'html5/treebuilders/base'
2
+
3
+ module HTML5
4
+ module TreeBuilders
5
+ module SimpleTree
6
+
7
# Node representing an item in the simple in-memory tree.
class Node < Base::Node
  # The tag name associated with the node.
  attr_accessor :name

  # The value of the current node (applies to text nodes and comments).
  attr_accessor :value

  # A hash holding name/value pairs for the attributes of the node.
  attr_accessor :attributes

  def initialize name
    super
    @name = name
    @value = nil
    @attributes = {}
  end

  # Appends +node+ as the last child. A text node appended directly after
  # another text node is merged into it. +node.parent+ is set either way.
  def appendChild node
    if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
      childNodes.last.value += node.value
    else
      childNodes << node
    end
    node.parent = self
  end

  # Removes +node+ from the child list and clears its parent.
  def removeChild node
    childNodes.delete node
    node.parent = nil
  end

  # Returns a childless copy with the same name, attributes and value.
  def cloneNode
    copy = self.class.new name
    attributes.each { |attr_name, attr_value| copy.attributes[attr_name] = attr_value }
    copy.value = value
    copy
  end

  # Inserts +data+ as text, before the child +before+ when given,
  # otherwise at the end.
  def insertText data, before=nil
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the existing child +refNode+.
  # A text node inserted right after another text node is merged into it.
  #
  # Fix: the inserted node's parent is now recorded, matching appendChild;
  # previously nodes inserted this way kept a stale (or nil) parent.
  def insertBefore node, refNode
    index = childNodes.index(refNode)
    if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
      childNodes[index-1].value += node.value
    else
      childNodes.insert index, node
    end
    node.parent = self
  end

  # Renders this node (via to_s) and its subtree, two columns of extra
  # indentation per level.
  def printTree indent=0
    tree = "\n|%s%s" % [' ' * indent, self.to_s]
    childNodes.each do |child|
      tree += child.printTree(indent + 2)
    end
    tree
  end

  # True when the node has at least one child.
  def hasContent
    childNodes.length > 0
  end
end
77
+
78
# Element node with an optional namespace tag.
class Element < Node
  attr_accessor :namespace

  def initialize(name, namespace=nil)
    super(name)
    @namespace = namespace
  end

  # Returns a childless copy of this element.
  #
  # Fix: Node#cloneNode rebuilds the copy from the name alone, so the
  # namespace was lost on every clone; it is carried over here.
  def cloneNode
    copy = super
    copy.namespace = namespace
    copy
  end

  # "<name>", prefixed with the namespace and a space when one is set.
  def to_s
    "<#{namespace ? namespace.to_s + ' ' : ''}#{name}>"
  end

  # Renders the element, then its attributes, then its children, with
  # attributes and children indented two columns deeper.
  def printTree indent=0
    tree = "\n|%s%s" % [' ' * indent, self.to_s]
    indent += 2
    attributes.each do |attr_name, attr_value|
      tree += "\n|%s%s=\"%s\"" % [' ' * indent, attr_name, attr_value]
    end
    childNodes.each do |child|
      tree += child.printTree(indent)
    end
    tree
  end
end
101
+
102
# Root node of a simple tree.
class Document < Node
  # A document has no name, so nil is handed to Node#initialize.
  def initialize
    super(nil)
  end

  # Fixed label used as the first line of the serialised tree.
  def to_s
    "#document"
  end

  # Renders the label followed by every child's printTree output, with the
  # children indented two columns deeper than +indent+.
  def printTree indent=0
    childNodes.inject(to_s) do |rendered, child|
      rendered + child.printTree(indent + 2)
    end
  end
end
119
+
120
# Doctype node; the public and system identifiers start out nil and are
# assigned through their accessors.
class DocumentType < Node
  attr_accessor :public_id, :system_id

  def initialize name
    super(name)
    @system_id = nil
    @public_id = nil
  end

  # Renders the doctype declaration; both identifiers are printed (quoted)
  # as soon as either of them is set.
  def to_s
    ids = (@public_id || @system_id) ? " \"#{@public_id}\" \"#{@system_id}\"" : ''
    "<!DOCTYPE #{name}#{ids}>"
  end
end
133
+
134
# Nameless container element used when building a fragment.
class DocumentFragment < Element
  # Fragments carry no tag name, so nil is passed up the chain.
  def initialize
    super(nil)
  end

  # Concatenates the printTree output of every child (indented two columns
  # deeper than +indent+); the fragment itself contributes no line.
  def printTree indent=0
    childNodes.inject("") do |rendered, child|
      rendered + child.printTree(indent+2)
    end
  end
end
147
+
148
# Text node; the character data is held in +value+.
class TextNode < Node
  # The text is wrapped in double quotes for the serialised tree.
  def to_s
    '"%s"' % value
  end

  # Text nodes have no name, so nil goes to Node#initialize and the
  # supplied text is stored afterwards.
  def initialize value
    super(nil)
    @value = value
  end
end
158
+
159
# Comment node; the comment text is held in +value+.
class CommentNode < Node
  # The comment text rendered between "<!-- " and " -->".
  def to_s
    "<!-- %s -->" % value
  end

  # Comments have no name, so nil goes to Node#initialize and the supplied
  # text is stored afterwards.
  def initialize value
    super(nil)
    @value = value
  end
end
169
+
170
# Tree builder that produces SimpleTree nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes this builder instantiates.
  def initialize
    @documentClass = Document
    @fragmentClass = DocumentFragment
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
  end

  # Serialises +node+ in the indented format produced by printTree.
  def testSerializer node
    node.printTree
  end

  # Replaces @document with the fragment returned by the parent class and
  # returns that fragment.
  def get_fragment
    @document = super
  end
end
188
+
189
+ end
190
+ end
191
+ end
@@ -0,0 +1,26 @@
1
+ require 'html5/treewalkers/base'
2
+
3
+ module HTML5
4
+ module TreeWalkers
5
+
6
class << self
  # Looks up a tree-walker class by (case-insensitive) name, requiring its
  # implementation file on first use. Accepts anything that responds to
  # to_s; raises a RuntimeError for names it does not recognise.
  def [](name)
    key = name.to_s.downcase
    if key == 'simpletree'
      require 'html5/treewalkers/simpletree'
      SimpleTree::TreeWalker
    elsif key == 'rexml'
      require 'html5/treewalkers/rexml'
      REXML::TreeWalker
    elsif key == 'hpricot'
      require 'html5/treewalkers/hpricot'
      Hpricot::TreeWalker
    else
      raise "Unknown TreeWalker #{name}"
    end
  end

  # Named alternative to the [] lookup.
  alias_method :get_tree_walker, :[]
end
25
+ end
26
+ end
@@ -0,0 +1,162 @@
1
+ require 'html5/constants'
2
+ module HTML5
3
+ module TreeWalkers
4
+
5
# Helpers that build the token hashes emitted by tree walkers.
module TokenConstructor
  # Pass-through wrapper applied to message literals; it returns its
  # argument unchanged (presumably a hook for localisation -- confirm).
  def _(str)
    str
  end

  # Token describing a serialisation problem.
  def error(msg)
    { :type => "SerializeError", :data => msg }
  end

  # Converts an attribute collection into an array via to_a.
  def normalize_attrs(attrs)
    attrs.to_a
  end

  # Token for the opening tag of a non-void element.
  def start_tag(name, attrs)
    { :type => :StartTag, :name => name, :data => normalize_attrs(attrs) }
  end

  # Token for a void element.
  # NOTE(review): when +has_children+ is set an error token is built but
  # its value is discarded -- only the EmptyTag token is returned.
  def empty_tag(name, attrs, has_children=false)
    if has_children
      error(_("Void element has children"))
    end
    { :type => :EmptyTag, :name => name, :data => normalize_attrs(attrs) }
  end

  # Token for a closing tag; its data is always an empty array.
  def end_tag(name)
    { :type => :EndTag, :name => name, :data => [] }
  end

  # Splits +data+ into up to three tokens and yields them in order:
  # leading whitespace (SpaceCharacters), the remaining text (Characters),
  # and trailing whitespace (SpaceCharacters). All-whitespace input yields
  # a single SpaceCharacters token.
  def text(data)
    whitespace = SPACE_CHARACTERS.join('')

    if data =~ /\A([#{whitespace}]+)/m
      leading = Regexp.last_match(1)
      yield({ :type => :SpaceCharacters, :data => leading })
      data = data[leading.length .. -1]
      return if data.empty?
    end

    if data =~ /([#{whitespace}]+)\Z/m
      trailing = Regexp.last_match(1)
      yield({ :type => :Characters, :data => data[0 ... -trailing.length] })
      yield({ :type => :SpaceCharacters, :data => trailing })
    else
      yield({ :type => :Characters, :data => data })
    end
  end

  # Token for a comment.
  def comment(data)
    { :type => :Comment, :data => data }
  end

  # Token for a doctype declaration.
  def doctype(name, public_id, system_id, correct=nil)
    {
      :type      => :Doctype,
      :name      => name,
      :public_id => public_id,
      :system_id => system_id,
      :correct   => correct
    }
  end

  # Error token for a node type the walker does not recognise.
  def unknown(nodeType)
    error(_("Unknown node type: ") + nodeType.to_s)
  end
end
58
+
59
# Abstract tree walker: holds the tree to walk and defines the iteration
# interface that concrete walkers implement by overriding #each.
class Base
  include TokenConstructor

  # tree:: the root object to be walked; stored as @tree.
  def initialize(tree)
    @tree = tree
  end

  # Yields the tokens of the tree one by one. Must be overridden.
  def each
    raise NotImplementedError
  end

  # Synonym for #each.
  #
  # Fix: this used to be `alias walk each`. An alias is bound to the method
  # body that exists when the alias is made, so +walk+ kept pointing at the
  # NotImplementedError stub above even in subclasses that override #each.
  # Dispatching dynamically makes +walk+ follow the subclass implementation.
  def walk(&block)
    each(&block)
  end

  # Collects every token produced by #each into an array.
  def to_ary
    tokens = []
    each { |token| tokens << token }
    tokens
  end
end
80
+
81
# Walker that traverses the tree iteratively (no recursion), using four
# navigation primitives that concrete subclasses must supply.
class NonRecursiveTreeWalker < TreeWalkers::Base
  # Returns an array whose first item is the node kind (:DOCTYPE, :TEXT,
  # :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT or nil) followed by
  # the kind-specific details consumed in #each. Must be overridden.
  def node_details(node)
    raise NotImplementedError
  end

  # Returns the first child of +node+, or nil. Must be overridden.
  def first_child(node)
    raise NotImplementedError
  end

  # Returns the sibling following +node+, or nil. Must be overridden.
  def next_sibling(node)
    raise NotImplementedError
  end

  # Returns the parent of +node+. Must be overridden.
  def parent(node)
    raise NotImplementedError
  end

  # Yields the token stream for @tree in document order: an opening token
  # for each node on the way down, and an end tag for each non-void
  # element on the way back up.
  def each
    cursor = @tree
    until cursor.nil?
      details = node_details(cursor)
      kind = details.shift
      descend = false

      case kind
      when :DOCTYPE
        yield doctype(*details)

      when :TEXT
        text(*details) { |token| yield token }

      when :ELEMENT
        tag, attrs, descend = details
        if VOID_ELEMENTS.include?(tag)
          yield empty_tag(tag, attrs.to_a, descend)
          # Void elements are never descended into.
          descend = false
        else
          yield start_tag(tag, attrs.to_a)
        end

      when :COMMENT
        yield comment(details[0])

      when :DOCUMENT, :DOCUMENT_FRAGMENT
        descend = true

      when nil
        # ignore (REXML::XMLDecl is an example)

      else
        yield unknown(details[0])
      end

      child =
        if descend
          first_child(cursor)
        else
          nil
        end

      unless child.nil?
        cursor = child
        next
      end

      # No child to visit: close elements while climbing until a following
      # sibling is found or the root has been closed.
      until cursor.nil?
        closing = node_details(cursor)
        if closing.shift == :ELEMENT
          tag = closing[0]
          yield end_tag(tag) unless VOID_ELEMENTS.include?(tag)
        end

        if @tree == cursor
          cursor = nil
          break
        end

        sibling = next_sibling(cursor)
        unless sibling.nil?
          cursor = sibling
          break
        end

        cursor = parent(cursor)
      end
    end
  end
end
160
+
161
+ end
162
+ end