spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,24 @@
1
module HTML5
  # Entry point for selecting one of the tree builder back-ends by name.
  module TreeBuilders
    class << self
      # Returns the TreeBuilder class registered under +name+.
      #
      # The name is converted with to_s and lower-cased before matching, so
      # symbols and mixed-case strings are accepted. The file implementing
      # the back-end is only required once that back-end is asked for.
      #
      # Raises a RuntimeError when the name matches no known back-end.
      def [](name)
        backend = name.to_s.downcase

        if backend == 'simpletree'
          require 'html5/treebuilders/simpletree'
          return SimpleTree::TreeBuilder
        end

        if backend == 'rexml'
          require 'html5/treebuilders/rexml'
          return REXML::TreeBuilder
        end

        if backend == 'hpricot'
          require 'html5/treebuilders/hpricot'
          return Hpricot::TreeBuilder
        end

        raise "Unknown TreeBuilder #{name}"
      end

      # Long-hand name for the [] lookup.
      alias_method :get_tree_builder, :[]
    end
  end
end
@@ -0,0 +1,339 @@
1
+ require 'html5/constants'
2
+
3
+ #XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
4
+
5
+ module HTML5
6
+
7
+ # The scope markers are inserted when entering buttons, object elements,
8
+ # marquees, table cells, and table captions, and are used to prevent formatting
9
+ # from "leaking" into tables, buttons, object elements, and marquees. The marker is nil, which is also what reading an Array outside its bounds returns, so such a read compares equal to Marker.
10
+ Marker = nil
11
+
12
+ module TreeBuilders
13
+ module Base
14
+
15
# Abstract tree node. It keeps the parent/children/flags bookkeeping and
# declares the mutation interface; every structural operation except
# reparentChildren raises NotImplementedError and must be supplied by a
# subclass.
class Node
  # The parent of the current node (or nil for the document node)
  attr_accessor :parent

  # A list of child nodes of the current node. This must
  # include all elements but not necessarily other node types
  attr_accessor :childNodes

  # A list of miscellaneous flags that can be set on the node
  attr_accessor :flags

  # +name+ is accepted so subclasses share one constructor signature;
  # this base class does not store it.
  def initialize(name)
    @parent = nil
    @childNodes = []
    @flags = []
  end

  # Insert node as a child of the current node
  def appendChild(node)
    raise NotImplementedError
  end

  # Insert data as text in the current node, positioned before the
  # start of node insertBefore or to the end of the node's text.
  def insertText(data, insertBefore=nil)
    raise NotImplementedError
  end

  # Insert node as a child of the current node, before refNode in the
  # list of child nodes. refNode is expected to be a child of the current
  # node; how a violation is reported is up to the implementing subclass.
  def insertBefore(node, refNode)
    raise NotImplementedError
  end

  # Remove node from the children of the current node
  def removeChild(node)
    raise NotImplementedError
  end

  # Move all the children of the current node to newParent.
  # This is needed so that trees that don't store text as nodes move the
  # text in the correct way
  def reparentChildren(newParent)
    #XXX - should this method be made more general?
    # Walk a snapshot: an appendChild implementation that detaches the
    # child from its previous parent would otherwise shrink the list
    # being iterated and cause every other child to be skipped.
    @childNodes.dup.each { |child| newParent.appendChild(child) }
    @childNodes = []
  end

  # Return a shallow copy of the current node i.e. a node with the same
  # name and attributes but with no parent or child nodes
  def cloneNode
    raise NotImplementedError
  end

  # Return true if the node has children or text, false otherwise
  def hasContent
    raise NotImplementedError
  end
end
75
+
76
# Base treebuilder implementation.
#
# Holds the state the parser manipulates (stack of open elements, list of
# active formatting elements, head/form pointers, the document) together
# with the element/text insertion helpers.
#
# The node classes to instantiate are read from the instance variables
# @documentClass, @elementClass, @commentClass, @doctypeClass and
# @fragmentClass. reset instantiates @documentClass, so a subclass has to
# assign it before reset (and therefore this class's initialize) runs.
class TreeBuilder

  # Stack of currently open elements; the last entry is the current node.
  attr_accessor :open_elements

  # List of active formatting elements; Marker entries delimit scopes.
  attr_accessor :activeFormattingElements

  # The document node created by reset.
  attr_accessor :document

  attr_accessor :head_pointer

  attr_accessor :formPointer

  # True while elements and text are inserted with the misnested-table
  # rules (see insert_from_table=).
  attr_reader :insert_from_table

  def initialize
    reset
  end

  # Drop all parser state and start over with a new, empty document.
  def reset
    @open_elements = []
    @activeFormattingElements = []

    #XXX - rename these to headElement, formElement
    @head_pointer = nil
    @formPointer = nil

    self.insert_from_table = false

    @document = @documentClass.new
  end

  # Return true if an element named target is in scope on the stack of
  # open elements. A 'table' or 'html' element always ends the search;
  # unless tableVariant is true, so does any element listed in
  # SCOPING_ELEMENTS.
  #
  # Raises RuntimeError if the whole stack is walked without meeting the
  # target or a boundary (the original code called an undefined `assert`
  # here, which would have surfaced as a NoMethodError).
  def elementInScope(target, tableVariant=false)
    # Exit early when possible.
    return true if @open_elements[-1] && @open_elements[-1].name == target
    return false if @open_elements.length == 0

    @open_elements.reverse_each do |element|
      if element.name == target
        return true
      elsif element.name == 'table'
        return false
      elsif !tableVariant && SCOPING_ELEMENTS.include?(element.name)
        return false
      elsif element.name == 'html'
        return false
      end
    end

    # We should never reach this point
    raise "elementInScope(#{target.inspect}) walked the whole stack of open elements without reaching a scope boundary"
  end

  # Re-open (as fresh clones) the formatting elements that are listed as
  # active after the last Marker but are no longer on the stack of open
  # elements.
  def reconstructActiveFormattingElements
    # Within this algorithm the order of steps described in the
    # specification is not quite the same as the order of steps in the
    # code. It should still do the same though.

    # Step 1: stop the algorithm when there's nothing to do.
    return if @activeFormattingElements.empty?

    # Step 2 and step 3: we start with the last element. So i is -1.
    i = -1
    entry = @activeFormattingElements[i]
    return if entry == Marker || @open_elements.include?(entry)

    # Step 4 to 6: walk backwards until a Marker or an entry that is still
    # open is found. Reading before the start of the Array yields nil,
    # which equals Marker, so running off the front ends the loop as well;
    # the i += 1 below then lands on the first entry.
    until entry == Marker || @open_elements.include?(entry)
      # Step 5: let entry be one earlier in the list.
      i -= 1
      entry = @activeFormattingElements[i]
    end

    loop do
      # Step 7
      i += 1

      # Step 8
      copy = @activeFormattingElements[i].cloneNode

      # Step 9
      element = insert_element(copy.name, copy.attributes)

      # Step 10
      @activeFormattingElements[i] = element

      # Step 11
      break if element == @activeFormattingElements[-1]
    end
  end

  # Pop active formatting elements up to and including the last Marker
  # (or until the list is empty).
  def clearActiveFormattingElements
    until @activeFormattingElements.empty?
      break if @activeFormattingElements.pop == Marker
    end
  end

  # Check if an element exists between the end of the active
  # formatting elements and the last marker. If it does, return it, else
  # return false
  def elementInActiveFormattingElements(name)
    @activeFormattingElements.reverse_each do |element|
      # Check for Marker first because if it's a Marker it doesn't have a
      # name attribute.
      break if element == Marker
      return element if element.name == name
    end
    false
  end

  # Create a doctype node and append it to the document.
  def insertDoctype(name, public_id, system_id)
    doctype = @doctypeClass.new(name)
    doctype.public_id = public_id
    doctype.system_id = system_id
    @document.appendChild(doctype)
  end

  # Append a comment node to parent, or to the current node when no
  # parent is given.
  def insert_comment(data, parent=nil)
    parent = @open_elements[-1] if parent.nil?
    parent.appendChild(@commentClass.new(data))
  end

  # Create an element but don't insert it anywhere
  def createElement(name, attributes)
    element = @elementClass.new(name)
    element.attributes = attributes
    element
  end

  # Switch the function used to insert an element from the
  # normal one to the misnested table one and back again
  def insert_from_table=(value)
    @insert_from_table = value
    @insert_element = value ? :insert_elementTable : :insert_elementNormal
  end

  # Insert an element using whichever strategy insert_from_table= selected.
  def insert_element(name, attributes, namespace = nil)
    send(@insert_element, name, attributes, namespace)
  end

  def insert_foreign_element(name, attributes, namespace)
    insert_element(name, attributes, namespace)
  end

  # Create an element, append it to the current node and push it onto the
  # stack of open elements. Returns the new element.
  def insert_elementNormal(name, attributes, namespace=nil)
    element = @elementClass.new(name, namespace)
    element.attributes = attributes
    @open_elements.last.appendChild(element)
    @open_elements.push(element)
    element
  end

  # Create an element and insert it into the tree, relocating it next to
  # the enclosing table when the current node is a table-structure
  # element. Returns the new element.
  def insert_elementTable(name, attributes, namespace=nil)
    unless TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements.last.name)
      # Not directly inside table structure: plain insertion. The
      # namespace has to be forwarded or foreign elements lose it.
      return insert_elementNormal(name, attributes, namespace)
    end

    element = @elementClass.new(name, namespace)
    element.attributes = attributes

    #We should be in the InTable mode. This means we want to do
    #special magic element rearranging
    parent, insertBefore = getTableMisnestedNodePosition
    if insertBefore.nil?
      parent.appendChild(element)
    else
      parent.insertBefore(element, insertBefore)
    end
    @open_elements.push(element)
    element
  end

  # Insert text into parent (default: the current node). While inserting
  # from a table and the current node is a table-structure element, the
  # text goes to the misnested-node position instead and the parent
  # argument is not used.
  def insertText(data, parent=nil)
    parent = @open_elements[-1] if parent.nil?

    if @insert_from_table && TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements[-1].name)
      #We should be in the InTable mode. This means we want to do
      #special magic element rearranging
      parent, insertBefore = getTableMisnestedNodePosition
      parent.insertText(data, insertBefore)
    else
      parent.insertText(data)
    end
  end

  # Get the foster parent element, and sibling to insert before
  # (or nil) when inserting a misnested table node
  def getTableMisnestedNodePosition
    #The foster parent element is the one which comes before the most
    #recently opened table element
    lastTable = @open_elements.reverse.find { |element| element.name == "table" }

    # No table on the stack: fall back to the bottom-most open element.
    return [@open_elements[0], nil] unless lastTable

    #XXX - we should really check that this parent is actually a
    #node here
    if lastTable.parent
      [lastTable.parent, lastTable]
    else
      [@open_elements[@open_elements.index(lastTable) - 1], nil]
    end
  end

  # Pop elements whose end tag may be omitted off the stack of open
  # elements, stopping at the first element that is not one of them or
  # whose name equals exclude. Stops quietly if the stack becomes empty.
  def generateImpliedEndTags(exclude=nil)
    # XXX td, th and tr are not actually needed
    implied = %w[dd dt li p td th tr]

    # XXX This is not entirely what the specification says. We should
    # investigate it more closely.
    until @open_elements.empty?
      name = @open_elements[-1].name
      break unless implied.include?(name) && name != exclude
      @open_elements.pop
    end
  end

  def get_document
    @document
  end

  # Build a fragment node and move the children of the bottom-most open
  # element into it.
  def get_fragment
    #assert @inner_html
    fragment = @fragmentClass.new
    @open_elements[0].reparentChildren(fragment)
    fragment
  end

  # Serialize the subtree of node in the format required by unit tests
  # node - the node from which to start serializing
  def testSerializer(node)
    raise NotImplementedError
  end

end
337
+ end
338
+ end
339
+ end
@@ -0,0 +1,231 @@
1
+ require 'html5/treebuilders/base'
2
+ require 'rubygems'
3
+ require 'hpricot'
4
+ require 'forwardable'
5
+
6
+ module HTML5
7
+ module TreeBuilders
8
+ module Hpricot
9
+
10
# Tree node that wraps an Hpricot object. The wrapper keeps its own
# childNodes/parent bookkeeping (from Base::Node) and mirrors every
# structural change onto the wrapped object held in @hpricot.
#
# Subclasses that rely on this initializer must define a class method
# hpricot_class returning the Hpricot class to instantiate.
class Node < Base::Node
  extend Forwardable

  def_delegators :@hpricot, :name

  # The wrapped Hpricot object.
  attr_accessor :hpricot

  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new name
  end

  # Append node as the last child. A TextNode appended directly after
  # another TextNode is merged into it instead of being added. The
  # wrapped object is also removed from the children of its previous
  # Hpricot parent, if it had one.
  def appendChild(node)
    if node.kind_of?(TextNode) && childNodes.last.kind_of?(TextNode)
      last_text = childNodes.last.hpricot
      last_text.content = last_text.content + node.hpricot.content
    else
      childNodes << node
      hpricot.children << node.hpricot
    end

    old_parent = node.hpricot.parent
    if old_parent
      # The object may be missing from the old parent's list (e.g. a text
      # node that was merged rather than added); delete_at(nil) would raise.
      position = old_parent.children.index(node.hpricot)
      old_parent.children.delete_at(position) if position
    end

    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Detach node from this node, in both the wrapper and the Hpricot tree.
  def removeChild(node)
    childNodes.delete(node)
    position = hpricot.children.index(node.hpricot)
    hpricot.children.delete_at(position) if position
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Insert data as a text node, before the child +before+ when given,
  # otherwise at the end.
  def insertText(data, before=nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Insert node before refNode. A TextNode inserted directly after another
  # TextNode is merged into that one instead.
  #
  # Raises ArgumentError when refNode is not a child of this node; the
  # check happens before anything is modified.
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, 'refNode is not a child of this node' if index.nil?

    if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
      # Merge using #content on both sides, exactly as appendChild does.
      previous = childNodes[index-1].hpricot
      previous.content = previous.content + node.hpricot.content
    else
      refNode.hpricot.parent.insert_before(node.hpricot, refNode.hpricot)
      # Keep the wrapper tree consistent with appendChild: the inserted
      # node must know its parent.
      node.parent = self
      childNodes.insert(index, node)
    end
  end

  # True when this node has at least one child node.
  def hasContent
    childNodes.any?
  end
end
65
+
66
# Element node backed by an Hpricot::Elem.
class Element < Node
  # Namespace given at construction time (nil when none was given).
  attr_reader :namespace

  def self.hpricot_class
    ::Hpricot::Elem
  end

  def initialize(name, namespace=nil)
    super(name)

    # Node#initialize has already assigned @hpricot; it is replaced here
    # with an element built around a proper start tag.
    @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
    @namespace = namespace
  end

  # The tag name, read from the wrapped element's start tag.
  def name
    @hpricot.stag.name
  end

  # Return a copy with the same name, namespace and attributes but without
  # parent or children.
  def cloneNode
    copy = self.class.new(name, namespace)
    attributes.each { |attr_name, value| copy.hpricot[attr_name] = value }
    copy
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  # Every other message is forwarded to the element's attributes.
  #
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      @hpricot.stag.send(stag_attributes_method)[k] = v
    end

    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
    end

    def method_missing(*a, &b)
      @hpricot.attributes.send(*a, &b)
    end

    # Keep respond_to? truthful about the messages method_missing forwards.
    def respond_to_missing?(method_name, include_private = false)
      @hpricot.attributes.respond_to?(method_name, include_private) || super
    end
  end

  # A fresh AttributeProxy for the wrapped element.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Copy each name/value pair of attrs onto the wrapped element.
  def attributes=(attrs)
    attrs.each { |attr_name, value| @hpricot[attr_name] = value }
  end

  # Serialize this element, its attributes (except 'xmlns') and its
  # children, one "\n|"-prefixed line per item, indented by +indent+.
  def printTree(indent=0)
    prefix = @namespace.nil? ? '' : "#{@namespace} "
    tree = "\n|#{' ' * indent}<#{prefix}#{name}>"
    indent += 2
    attributes.each do |attr_name, value|
      next if attr_name == 'xmlns'
      tree += "\n|#{' ' * indent}#{attr_name}=\"#{value}\""
    end
    childNodes.inject(tree) { |output, child| output + child.printTree(indent) }
  end
end
134
+
135
# Document root node, wrapping an Hpricot::Doc.
class Document < Node
  class << self
    # Hpricot class instantiated by Node#initialize for this node type.
    def hpricot_class
      ::Hpricot::Doc
    end
  end

  # The document has no name, so nil is handed to Node#initialize.
  def initialize
    super(nil)
  end

  # Serialize the document: the '#document' header followed by each
  # child's own serialization, indented two columns deeper.
  def printTree(indent=0)
    output = '#document'
    childNodes.each do |child|
      output = output + child.printTree(indent + 2)
    end
    output
  end
end
148
+
149
# Doctype node, wrapping an Hpricot::DocType.
class DocumentType < Node
  def_delegators :@hpricot, :public_id, :system_id

  def self.hpricot_class
    ::Hpricot::DocType
  end

  def initialize(name, public_id, system_id)
    # Node#initialize is not used because it instantiates hpricot_class
    # with a single argument; the base-class state is set up here instead
    # so that childNodes and flags are arrays rather than nil.
    @parent = nil
    @childNodes = []
    @flags = []
    @hpricot = self.class.hpricot_class.new(name, public_id, system_id)
  end

  # Serialize the doctype on one "\n|"-prefixed line. The public and
  # system identifiers are only printed when at least one of them is set.
  def printTree(indent=0)
    target = hpricot.target
    # String#any? does not exist on Ruby 1.9 and later; test emptiness
    # directly.
    if target && !target.empty?
      ids = [hpricot.public_id, hpricot.system_id]
      "\n|#{' ' * indent}<!DOCTYPE #{target}" +
        (ids.any? ? " \"#{ids[0]}\" \"#{ids[1]}\"" : '') +
        '>'
    else
      "\n|#{' ' * indent}<!DOCTYPE >"
    end
  end
end
170
+
171
# Fragment container: an Element with an empty name whose serialization
# consists of its children only.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Concatenate the serializations of all children, each indented two
  # columns deeper than +indent+.
  def printTree(indent=0)
    output = ''
    childNodes.each do |child|
      output = output + child.printTree(indent + 2)
    end
    output
  end
end
180
+
181
# Text node, wrapping an Hpricot::Text.
class TextNode < Node
  def initialize(data)
    # Node#initialize is not used because it relies on a hpricot_class
    # class method, which this class does not define; the base-class
    # state is set up here instead so that childNodes and flags are
    # arrays rather than nil (hasContent would otherwise raise).
    @parent = nil
    @childNodes = []
    @flags = []
    @hpricot = ::Hpricot::Text.new(data)
  end

  # Serialize the text, quoted, on one "\n|"-prefixed line.
  def printTree(indent=0)
    "\n|#{' ' * indent}\"#{hpricot.content}\""
  end
end
190
+
191
# Comment node, wrapping an Hpricot::Comment.
class CommentNode < Node
  class << self
    # Hpricot class instantiated by Node#initialize for this node type.
    def hpricot_class
      ::Hpricot::Comment
    end
  end

  # Serialize the comment on one "\n|"-prefixed line.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
200
+
201
# Tree builder producing an Hpricot document.
class TreeBuilder < Base::TreeBuilder
  def initialize
    @documentClass = Document
    @doctypeClass = DocumentType
    @elementClass = Element
    @commentClass = CommentNode
    @fragmentClass = DocumentFragment
    # Run the base initializer (which calls reset) now that the node
    # classes are known, so a new builder starts with an empty stack of
    # open elements and a document instead of nil state.
    super()
  end

  # Create the doctype with all three values passed to the constructor
  # and append it to the document.
  def insertDoctype(name, public_id, system_id)
    doctype = @doctypeClass.new(name, public_id, system_id)
    @document.appendChild(doctype)
  end

  def testSerializer(node)
    node.printTree
  end

  # The underlying Hpricot object of the document, not the wrapper node.
  def get_document
    @document.hpricot
  end

  # The Hpricot children of the fragment built by the base class.
  # Note that @document is replaced by that fragment node.
  def get_fragment
    @document = super
    return @document.hpricot.children
  end
end
228
+
229
+ end
230
+ end
231
+ end