spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,24 @@
1
# Top-level namespace of the HTML5 parsing library.
module HTML5
  # Registry of the available tree builder back ends.  Each back end is
  # required lazily, so only the one actually requested has to be loadable.
  module TreeBuilders
    # Look up a tree builder class by name.
    #
    # name - anything responding to to_s; compared case-insensitively
    #        against 'simpletree', 'rexml' and 'hpricot'.
    #
    # Returns the TreeBuilder class of the selected back end.
    # Raises RuntimeError when the name matches none of the back ends.
    def self.[](name)
      backend = name.to_s.downcase

      if backend == 'simpletree'
        require 'html5/treebuilders/simpletree'
        SimpleTree::TreeBuilder
      elsif backend == 'rexml'
        require 'html5/treebuilders/rexml'
        REXML::TreeBuilder
      elsif backend == 'hpricot'
        require 'html5/treebuilders/hpricot'
        Hpricot::TreeBuilder
      else
        raise "Unknown TreeBuilder #{name}"
      end
    end

    class << self
      # Long-hand spelling of TreeBuilders[name].
      alias :get_tree_builder :[]
    end
  end
end
@@ -0,0 +1,339 @@
1
+ require 'html5/constants'
2
+
3
+ #XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
4
+
5
+ module HTML5
6
+
7
+ # The scope markers are inserted when entering buttons, object elements,
8
+ # marquees, table cells, and table captions, and are used to prevent formatting
9
+ # from "leaking" into tables, buttons, object elements, and marquees.
10
+ Marker = nil
11
+
12
+ module TreeBuilders
13
+ module Base
14
+
15
# Abstract node of a tree built by a TreeBuilder.  Back ends subclass this
# and supply the operations that raise NotImplementedError here.
class Node
  # parent     - the parent of the current node (nil for the document node)
  # childNodes - list of child nodes; must include all elements but not
  #              necessarily other node types
  # flags      - list of miscellaneous flags that can be set on the node
  attr_accessor :parent, :childNodes, :flags

  # name is accepted for the benefit of subclasses; this base class does
  # not store it.
  def initialize(name)
    @flags = []
    @childNodes = []
    @parent = nil
  end

  # Insert node as a child of the current node.
  def appendChild(node)
    raise NotImplementedError
  end

  # Insert data as text in the current node, positioned before the start
  # of node insertBefore, or at the end of the node's text when it is nil.
  def insertText(data, insertBefore=nil)
    raise NotImplementedError
  end

  # Insert node as a child of the current node, before refNode in the list
  # of child nodes.  Implementations are expected to raise when refNode is
  # not a child of the current node.
  def insertBefore(node, refNode)
    raise NotImplementedError
  end

  # Remove node from the children of the current node.
  def removeChild(node)
    raise NotImplementedError
  end

  # Hand every child of the current node over to newParent (through its
  # appendChild), then forget them here.  Needed so that trees which do not
  # store text as nodes move the text in the correct way.
  #
  # Returns the new, empty child list.
  def reparentChildren(newParent)
    #XXX - should this method be made more general?
    @childNodes.each do |child|
      newParent.appendChild(child)
    end
    @childNodes = []
  end

  # Return a shallow copy of the current node, i.e. a node with the same
  # name and attributes but with no parent or child nodes.
  def cloneNode
    raise NotImplementedError
  end

  # Return true if the node has children or text, false otherwise.
  def hasContent
    raise NotImplementedError
  end
end
75
+
76
# Base treebuilder implementation.
#
# Holds the state the parser manipulates while building a tree (the stack of
# open elements, the list of active formatting elements, the head and form
# pointers) together with the insertion helpers shared by every back end.
#
# Concrete builders must assign the node classes used by this class to the
# instance variables @documentClass (document root), @elementClass (HTML
# elements), @commentClass (comments), @doctypeClass (doctypes) and
# @fragmentClass (fragments) before #reset runs.
class TreeBuilder

  attr_accessor :open_elements

  attr_accessor :activeFormattingElements

  attr_accessor :document

  attr_accessor :head_pointer

  attr_accessor :formPointer

  attr_reader :insert_from_table

  def initialize
    reset
  end

  # Drop all parser state and start a new, empty document.
  def reset
    @open_elements = []
    @activeFormattingElements = []

    #XXX - rename these to headElement, formElement
    @head_pointer = nil
    @formPointer = nil

    self.insert_from_table = false

    @document = @documentClass.new
  end

  # Tell whether an element named target is in scope on the stack of open
  # elements.
  #
  # target       - element name to look for.
  # tableVariant - when true, only 'table' and 'html' end the search; when
  #                false, every name in SCOPING_ELEMENTS ends it as well.
  #
  # Returns true or false.  A stack that holds neither the target nor any
  # terminating element yields false.
  def elementInScope(target, tableVariant=false)
    return false if @open_elements.empty?

    # Exit early when possible.
    return true if @open_elements[-1].name == target

    @open_elements.reverse_each do |element|
      name = element.name
      return true if name == target
      return false if name == 'table' || name == 'html'
      return false if !tableVariant && SCOPING_ELEMENTS.include?(name)
    end

    # Nothing on the stack matched: the target is not in scope.
    false
  end

  # Re-open every active formatting element that follows the last Marker
  # (or the last entry still on the stack of open elements), replacing each
  # list entry with the freshly inserted element.
  def reconstructActiveFormattingElements
    # Within this algorithm the order of steps described in the
    # specification is not quite the same as the order of steps in the
    # code. It should still do the same though.

    # Step 1: stop the algorithm when there's nothing to do.
    return if @activeFormattingElements.empty?

    # Step 2 and step 3: we start with the last element. So i is -1.
    i = -1
    entry = @activeFormattingElements[i]
    return if entry == Marker || @open_elements.include?(entry)

    # Step 6
    until entry == Marker || @open_elements.include?(entry)
      # Step 5: let entry be one earlier in the list.
      i -= 1

      # Step 4: we ran off the front of the list, so jump to step 8. The
      # i += 1 of step 7 below then lands on the first entry. Array#[]
      # returns nil rather than raising, hence the explicit bounds check.
      break if i < -@activeFormattingElements.length

      entry = @activeFormattingElements[i]
    end

    while true
      # Step 7
      i += 1

      # Step 8
      copy = @activeFormattingElements[i].cloneNode

      # Step 9
      element = insert_element(copy.name, copy.attributes)

      # Step 10
      @activeFormattingElements[i] = element

      # Step 11
      break if element == @activeFormattingElements[-1]
    end
  end

  # Pop entries off the list of active formatting elements up to and
  # including the last Marker (or until the list is empty).
  def clearActiveFormattingElements
    until @activeFormattingElements.empty?
      break if @activeFormattingElements.pop == Marker
    end
  end

  # Check if an element exists between the end of the active
  # formatting elements and the last marker. If it does, return it, else
  # return false
  def elementInActiveFormattingElements(name)
    @activeFormattingElements.reverse_each do |element|
      # Check for Marker first because if it's a Marker it doesn't have a
      # name attribute.
      break if element == Marker
      return element if element.name == name
    end
    false
  end

  # Create a doctype node and append it to the document.
  def insertDoctype(name, public_id, system_id)
    doctype = @doctypeClass.new(name)
    doctype.public_id = public_id
    doctype.system_id = system_id
    @document.appendChild(doctype)
  end

  # Append a comment node to parent, or to the current node when parent is
  # nil.
  def insert_comment(data, parent=nil)
    parent = @open_elements[-1] if parent.nil?
    parent.appendChild(@commentClass.new(data))
  end

  # Create an element but don't insert it anywhere
  def createElement(name, attributes)
    element = @elementClass.new(name)
    element.attributes = attributes
    element
  end

  # Switch the function used to insert an element from the
  # normal one to the misnested table one and back again
  def insert_from_table=(value)
    @insert_from_table = value
    @insert_element = value ? :insert_elementTable : :insert_elementNormal
  end

  # Insert an element using whichever strategy insert_from_table= selected.
  def insert_element(name, attributes, namespace = nil)
    send(@insert_element, name, attributes, namespace)
  end

  def insert_foreign_element(name, attributes, namespace)
    insert_element(name, attributes, namespace)
  end

  # Create an element, append it to the current node and push it onto the
  # stack of open elements.  Returns the new element.
  def insert_elementNormal(name, attributes, namespace=nil)
    element = @elementClass.new(name, namespace)
    element.attributes = attributes
    @open_elements.last.appendChild(element)
    @open_elements.push(element)
    element
  end

  # Create an element and insert it into the tree, foster-parenting it when
  # the current node is one of TABLE_INSERT_MODE_ELEMENTS.  Returns the new
  # element.
  def insert_elementTable(name, attributes, namespace=nil)
    unless TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements.last.name)
      # Not directly inside table structure: behave like a normal insert,
      # keeping the namespace the caller asked for.
      return insert_elementNormal(name, attributes, namespace)
    end

    element = @elementClass.new(name, namespace)
    element.attributes = attributes

    #We should be in the InTable mode. This means we want to do
    #special magic element rearranging
    parent, insertBefore = getTableMisnestedNodePosition
    if insertBefore.nil?
      parent.appendChild(element)
    else
      parent.insertBefore(element, insertBefore)
    end
    @open_elements.push(element)
    element
  end

  # Insert text into parent (the current node when nil), foster-parenting
  # it when inserting from a table and the current node is one of
  # TABLE_INSERT_MODE_ELEMENTS.
  def insertText(data, parent=nil)
    parent = @open_elements[-1] if parent.nil?

    if !@insert_from_table || !TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements[-1].name)
      parent.insertText(data)
    else
      #We should be in the InTable mode. This means we want to do
      #special magic element rearranging
      parent, insertBefore = getTableMisnestedNodePosition
      parent.insertText(data, insertBefore)
    end
  end

  # Get the foster parent element, and sibling to insert before
  # (or nil) when inserting a misnested table node
  def getTableMisnestedNodePosition
    #The foster parent element is the one which comes before the most
    #recently opened table element
    #XXX - this is really inelegant
    lastTable = @open_elements.reverse.find { |element| element.name == "table" }

    # No table on the stack: fall back to the bottom-most open element.
    return [@open_elements[0], nil] unless lastTable

    #XXX - we should really check that this parent is actually a
    #node here
    if lastTable.parent
      [lastTable.parent, lastTable]
    else
      [@open_elements[@open_elements.index(lastTable) - 1], nil]
    end
  end

  # Pop elements whose end tag may be implied (dd, dt, li, p, td, th, tr)
  # off the stack of open elements, stopping at the first element that is
  # not one of them, at an element named exclude, or when the stack is
  # empty.
  def generateImpliedEndTags(exclude=nil)
    # XXX td, th and tr are not actually needed
    # XXX This is not entirely what the specification says. We should
    # investigate it more closely.
    until @open_elements.empty?
      name = @open_elements[-1].name
      break unless %w[dd dt li p td th tr].include?(name) && name != exclude
      @open_elements.pop
    end
  end

  def get_document
    @document
  end

  # Move the children of the bottom-most open element into a new fragment
  # node and return that fragment.
  def get_fragment
    #assert @inner_html
    fragment = @fragmentClass.new
    @open_elements[0].reparentChildren(fragment)
    fragment
  end

  # Serialize the subtree of node in the format required by unit tests
  # node - the node from which to start serializing
  def testSerializer(node)
    raise NotImplementedError
  end

end
337
+ end
338
+ end
339
+ end
@@ -0,0 +1,231 @@
1
+ require 'html5/treebuilders/base'
2
+ require 'rubygems'
3
+ require 'hpricot'
4
+ require 'forwardable'
5
+
6
+ module HTML5
7
+ module TreeBuilders
8
+ module Hpricot
9
+
10
# Tree node backed by an Hpricot object.  The wrapper keeps its own
# childNodes/parent bookkeeping (inherited from Base::Node) in step with
# the children/parent of the wrapped Hpricot node.
class Node < Base::Node
  extend Forwardable

  def_delegators :@hpricot, :name

  # The wrapped Hpricot node.
  attr_accessor :hpricot

  # Builds the wrapped node by passing name to the subclass's
  # hpricot_class constructor.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new name
  end

  # Append node as the last child.  A text node appended right after
  # another text node is merged into it instead of being added.
  def appendChild(node)
    if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
      childNodes.last.hpricot.content = childNodes.last.hpricot.content + node.hpricot.content
    else
      childNodes << node
      hpricot.children << node.hpricot
    end
    # Detach the Hpricot node from its previous Hpricot parent, if any.
    if (oldparent = node.hpricot.parent) != nil
      oldparent.children.delete_at(oldparent.children.index(node.hpricot))
    end
    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Remove node from this node's children and clear its parent links.
  def removeChild(node)
    childNodes.delete(node)
    hpricot.children.delete_at(hpricot.children.index(node.hpricot))
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Insert data as a text node, before the child `before` when given,
  # otherwise at the end.
  def insertText(data, before=nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Insert node immediately before refNode.  A text node inserted right
  # after another text node is merged into that preceding text node.
  #
  # Raises ArgumentError if refNode is not a child of this node.
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, 'refNode is not a child of this node' if index.nil?

    if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
      # NOTE(review): appendChild merges via #content while this merges via
      # #to_s; confirm the two agree for Hpricot::Text.
      childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
    else
      refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
      # Keep the wrapper's parent link in step, as appendChild does.
      node.parent = self
      childNodes.insert(index, node)
    end
  end

  # True when this node has at least one child.
  def hasContent
    childNodes.any?
  end
end
65
+
66
# Element node wrapping an Hpricot::Elem, optionally tagged with a
# namespace.
class Element < Node
  attr_reader :namespace

  def self.hpricot_class
    ::Hpricot::Elem
  end

  def initialize(name, namespace=nil)
    super(name)

    # Replace the node built by Node#initialize with one constructed from
    # an explicit start tag.
    @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
    @namespace = namespace
  end

  # Element name, read from the start tag of the wrapped node.
  def name
    @hpricot.stag.name
  end

  # Return a copy with the same name, namespace and attributes but with no
  # parent or children.
  def cloneNode
    attributes.inject(self.class.new(name, namespace)) do |copy, (attr_name, value)|
      copy.hpricot[attr_name] = value
      copy
    end
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  #
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      @hpricot.stag.send(stag_attributes_method)[k] = v
    end

    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
    end

    # Everything except []= is answered by the element's attributes object.
    def method_missing(*a, &b)
      @hpricot.attributes.send(*a, &b)
    end

    # Keep respond_to? truthful about the methods method_missing forwards.
    def respond_to_missing?(name, include_private = false)
      @hpricot.attributes.respond_to?(name, include_private) || super
    end
  end

  # Returns a fresh AttributeProxy for the wrapped element.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Copy every name/value pair of attrs onto the wrapped element.
  def attributes=(attrs)
    attrs.each { |attr_name, value| @hpricot[attr_name] = value }
  end

  # Render this element, its attributes (except 'xmlns') and its children
  # in the line-per-node format used by the test suite.
  def printTree(indent=0)
    prefix = @namespace.nil? ? '' : @namespace.to_s + ' '
    tree = "\n|#{' ' * indent}<#{prefix}#{name}>"
    indent += 2
    attributes.each do |attr_name, value|
      next if attr_name == 'xmlns'
      tree += "\n|#{' ' * indent}#{attr_name}=\"#{value}\""
    end
    childNodes.inject(tree) { |rendered, child| rendered + child.printTree(indent) }
  end
end
134
+
135
# Document root node, wrapping an Hpricot::Doc.
class Document < Node
  class << self
    # Hpricot class instantiated by Node#initialize for this node type.
    def hpricot_class
      ::Hpricot::Doc
    end
  end

  # The document node is created with a nil name.
  def initialize
    super(nil)
  end

  # Render the document in the test-suite format: the literal '#document'
  # followed by every child rendered two columns deeper than indent.
  def printTree(indent=0)
    rendered = '#document'
    childNodes.each do |child|
      rendered += child.printTree(indent + 2)
    end
    rendered
  end
end
148
+
149
# Doctype node wrapping an Hpricot::DocType.
class DocumentType < Node
  def_delegators :@hpricot, :public_id, :system_id

  def self.hpricot_class
    ::Hpricot::DocType
  end

  def initialize(name, public_id, system_id)
    # Node#initialize is not called here: it would construct the Hpricot
    # node from the name alone, whereas the doctype is built from three
    # arguments.  Initialise the inherited bookkeeping by hand so that
    # childNodes/flags are never nil.
    @parent = nil
    @childNodes = []
    @flags = []
    @hpricot = self.class.hpricot_class.new(name, public_id, system_id)
  end

  # Render the doctype in the test-suite format.  The public and system
  # identifiers are printed only when at least one of them is set; a
  # missing or empty target prints as "<!DOCTYPE >".
  def printTree(indent=0)
    # String#any? does not exist on Ruby 1.9+, so test for emptiness
    # explicitly (to_s also covers a nil target).
    return "\n|#{' ' * indent}<!DOCTYPE >" if hpricot.target.to_s.empty?

    "\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}" +
      ([hpricot.public_id, hpricot.system_id].any? ? " \"#{hpricot.public_id}\" \"#{hpricot.system_id}\"" : '') +
      '>'
  end
end
170
+
171
# Fragment container: an Element with an empty name whose only purpose is
# to hold child nodes.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Render only the children, each two columns deeper than indent; the
  # fragment itself contributes no line.
  def printTree(indent=0)
    rendered = ''
    childNodes.each do |child|
      rendered += child.printTree(indent + 2)
    end
    rendered
  end
end
180
+
181
# Text node wrapping an Hpricot::Text.
class TextNode < Node
  def initialize(data)
    # Node#initialize is not called here (TextNode defines no
    # hpricot_class for it to use), so initialise the inherited
    # bookkeeping by hand; otherwise childNodes/flags would be nil.
    @parent = nil
    @childNodes = []
    @flags = []
    @hpricot = ::Hpricot::Text.new(data)
  end

  # Render the text, double-quoted, in the test-suite format.
  def printTree(indent=0)
    "\n|#{' ' * indent}\"#{hpricot.content}\""
  end
end
190
+
191
# Comment node wrapping an Hpricot::Comment.
class CommentNode < Node
  class << self
    # Hpricot class instantiated by Node#initialize for this node type.
    def hpricot_class
      ::Hpricot::Comment
    end
  end

  # Render the comment in the test-suite format.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
200
+
201
# Tree builder producing Hpricot-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes of this back end.  Note that this does not
  # call super, so no parser state is set up here.
  def initialize
    @fragmentClass = DocumentFragment
    @commentClass = CommentNode
    @elementClass = Element
    @doctypeClass = DocumentType
    @documentClass = Document
  end

  # Build a doctype node from all three values at construction time and
  # append it to the document.
  def insertDoctype(name, public_id, system_id)
    @document.appendChild(@doctypeClass.new(name, public_id, system_id))
  end

  # Serialize the subtree of node in the format required by unit tests.
  def testSerializer(node)
    node.printTree
  end

  # Returns the Hpricot object wrapped by the document node.
  def get_document
    @document.hpricot
  end

  # Builds the fragment via the base implementation, stores it as the
  # current document, and returns the children of its Hpricot node.
  def get_fragment
    @document = super
    @document.hpricot.children
  end
end
228
+
229
+ end
230
+ end
231
+ end