html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,334 @@
1
+ require 'html5/constants'
2
+
3
+ #XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
4
+
5
+ module HTML5
6
+
7
+ # The scope markers are inserted when entering buttons, object elements,
8
+ # marquees, table cells, and table captions, and are used to prevent formatting
9
+ # from "leaking" into tables, buttons, object elements, and marquees.
10
+ Marker = nil
11
+
12
+ module TreeBuilders
13
+ module Base
14
+
15
class Node
  # The parent of the current node (or nil for the document node)
  attr_accessor :parent

  # A list of child nodes of the current node. This must include all
  # elements but not necessarily other node types
  attr_accessor :childNodes

  # A list of miscellaneous flags that can be set on the node
  attr_accessor :_flags

  # name is accepted so every node class shares one constructor signature;
  # this base implementation does not store it.
  def initialize(name)
    @parent = nil
    @childNodes = []
    @_flags = []
  end

  # Insert node as a child of the current node.
  # Abstract: concrete node classes must override this.
  def appendChild(node)
    raise NotImplementedError
  end

  # Insert data as text in the current node, positioned before the
  # start of node insertBefore or to the end of the node's text.
  # Abstract: concrete node classes must override this.
  def insertText(data, insertBefore=nil)
    raise NotImplementedError
  end

  # Insert node as a child of the current node, before refNode in the
  # list of child nodes. Implementations are expected to raise an error
  # if refNode is not a child of the current node.
  # Abstract: concrete node classes must override this.
  def insertBefore(node, refNode)
    raise NotImplementedError
  end

  # Remove node from the children of the current node.
  # Abstract: concrete node classes must override this.
  def removeChild(node)
    raise NotImplementedError
  end

  # Move all the children of the current node to newParent.
  # This is needed so that trees that don't store text as nodes move the
  # text in the correct way.
  #
  # Leaves this node with an empty child list.
  def reparentChildren(newParent)
    #XXX - should this method be made more general?
    # Iterate over a snapshot: an appendChild implementation is free to
    # detach the child from its old parent, and mutating @childNodes while
    # it is being iterated would silently skip children.
    @childNodes.dup.each { |child| newParent.appendChild(child) }
    @childNodes = []
  end

  # Return a shallow copy of the current node i.e. a node with the same
  # name and attributes but with no parent or child nodes.
  # Abstract: concrete node classes must override this.
  def cloneNode
    raise NotImplementedError
  end

  # Return true if the node has children or text, false otherwise.
  # Abstract: concrete node classes must override this.
  def hasContent
    raise NotImplementedError
  end
end
75
+
76
+ # Base treebuilder implementation
77
# Base treebuilder implementation.
#
# Subclasses must assign the node classes used to build the tree to the
# following instance variables before #reset runs (for example at the top
# of their own #initialize):
#
#   @documentClass - class to use for the document root
#   @elementClass  - class to use for HTML elements
#   @commentClass  - class to use for comments
#   @doctypeClass  - class to use for doctypes
#   @fragmentClass - class to use for document fragments
class TreeBuilder

  # Names of elements whose end tags may be implied.
  # XXX td, th and tr are not actually needed
  IMPLIED_END_TAG_ELEMENTS = %w[dd dt li p td th tr].freeze

  # Stack of currently open elements; the last entry is the current node
  attr_accessor :open_elements

  # List of active formatting elements, with Marker entries as scope
  # boundaries
  attr_accessor :activeFormattingElements

  # The document node created by #reset
  attr_accessor :document

  attr_accessor :head_pointer

  attr_accessor :formPointer

  def initialize
    reset
  end

  # Discard all state and start a new, empty document.
  def reset
    @open_elements = []
    @activeFormattingElements = []

    #XXX - rename these to headElement, formElement
    @head_pointer = nil
    @formPointer = nil

    self.insert_from_table = false

    @document = @documentClass.new
  end

  # Return true if an element named target is in scope, false otherwise.
  #
  # The stack of open elements is searched from the current node down.
  # The search stops (returning false) at a 'table' or 'html' element and,
  # unless tableVariant is set, at any element in SCOPING_ELEMENTS.
  #
  # Also returns false when the stack is exhausted without hitting one of
  # those boundaries (including when it is empty).
  def elementInScope(target, tableVariant=false)
    @open_elements.reverse_each do |element|
      name = element.name
      return true if name == target
      return false if name == 'table' || name == 'html'
      return false if !tableVariant && SCOPING_ELEMENTS.include?(name)
    end
    false
  end

  # Re-open every active formatting element that follows the last Marker
  # (or the last entry that is still open): each one is cloned, the clone
  # is inserted as a new element, and the list entry is replaced by the
  # inserted element.
  def reconstructActiveFormattingElements
    # Within this algorithm the order of steps described in the
    # specification is not quite the same as the order of steps in the
    # code. It should still do the same though.
    entries = @activeFormattingElements

    # Step 1: stop the algorithm when there's nothing to do.
    return if entries.empty?

    # Step 2 and step 3: nothing to do either when the last entry is a
    # marker or is still open.
    return if reconstruction_boundary?(entries.last)

    # Steps 4 to 6: walk backwards until the entry before us is a
    # boundary or we are at the start of the list. The bound is explicit
    # because Array#[] returns nil for an out-of-range index instead of
    # raising.
    first = entries.length - 1
    first -= 1 while first > 0 && !reconstruction_boundary?(entries[first - 1])

    # Steps 7 to 11: recreate every entry from there to the end.
    first.upto(entries.length - 1) do |i|
      clone = entries[i].cloneNode
      entries[i] = insert_element(clone.name, clone.attributes)
    end
  end

  # Pop entries off the list of active formatting elements up to and
  # including the last Marker (or until the list is empty).
  def clearActiveFormattingElements
    until @activeFormattingElements.empty?
      break if @activeFormattingElements.pop == Marker
    end
  end

  # Check if an element exists between the end of the active
  # formatting elements and the last marker. If it does, return it, else
  # return false
  def elementInActiveFormattingElements(name)
    @activeFormattingElements.reverse_each do |element|
      # Check for Marker first because if it's a Marker it doesn't have a
      # name attribute.
      break if element == Marker
      return element if element.name == name
    end
    false
  end

  # Create a doctype node and append it to the document.
  def insertDoctype(name, public_id, system_id)
    doctype = @doctypeClass.new(name)
    doctype.public_id = public_id
    doctype.system_id = system_id
    @document.appendChild(doctype)
  end

  # Append a comment node to parent, or to the current node when no
  # parent is given.
  def insert_comment(data, parent=nil)
    parent = @open_elements.last if parent.nil?
    parent.appendChild(@commentClass.new(data))
  end

  # Create an element but don't insert it anywhere
  def createElement(name, attributes)
    element = @elementClass.new(name)
    element.attributes = attributes
    element
  end

  # Switch the function used to insert an element from the
  # normal one to the misnested table one and back again
  def insert_from_table=(value)
    @insert_from_table = value
    @insert_element = value ? :insert_elementTable : :insert_elementNormal
  end

  # Create an element and insert it using whichever strategy
  # insert_from_table= last selected. Returns the new element.
  def insert_element(name, attributes)
    send(@insert_element, name, attributes)
  end

  # Create an element, append it to the current node and push it onto the
  # stack of open elements. Returns the new element.
  def insert_elementNormal(name, attributes)
    element = createElement(name, attributes)
    @open_elements.last.appendChild(element)
    @open_elements.push(element)
    element
  end

  # Create an element and insert it into the tree. When the current node
  # is one of TABLE_INSERT_MODE_ELEMENTS the element is placed at the
  # position given by getTableMisnestedNodePosition instead of under the
  # current node. Returns the new element.
  def insert_elementTable(name, attributes)
    unless TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements.last.name)
      return insert_elementNormal(name, attributes)
    end

    #We should be in the InTable mode. This means we want to do
    #special magic element rearranging
    element = createElement(name, attributes)
    parent, insertBefore = getTableMisnestedNodePosition
    if insertBefore.nil?
      parent.appendChild(element)
    else
      parent.insertBefore(element, insertBefore)
    end
    @open_elements.push(element)
    element
  end

  # Insert text into parent (the current node by default). When table
  # insertion is active and the current node is one of
  # TABLE_INSERT_MODE_ELEMENTS, the text goes to the position given by
  # getTableMisnestedNodePosition instead.
  def insertText(data, parent=nil)
    parent = @open_elements.last if parent.nil?

    if @insert_from_table && TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements.last.name)
      #We should be in the InTable mode. This means we want to do
      #special magic element rearranging
      parent, insertBefore = getTableMisnestedNodePosition
      parent.insertText(data, insertBefore)
    else
      parent.insertText(data)
    end
  end

  # Get the foster parent element, and sibling to insert before
  # (or nil) when inserting a misnested table node
  def getTableMisnestedNodePosition
    #The foster parent element is the one which comes before the most
    #recently opened table element
    lastTable = @open_elements.reverse.find { |element| element.name == "table" }

    # No table is open: fall back to the first open element.
    return [@open_elements.first, nil] unless lastTable

    #XXX - we should really check that this parent is actually a
    #node here
    return [lastTable.parent, lastTable] if lastTable.parent

    # The table is not attached to anything: use the element just below
    # it on the stack of open elements.
    [@open_elements[@open_elements.index(lastTable) - 1], nil]
  end

  # Pop elements off the stack of open elements for as long as the
  # current node is one of IMPLIED_END_TAG_ELEMENTS and is not named
  # exclude. Stops quietly if the stack becomes empty.
  def generateImpliedEndTags(exclude=nil)
    # XXX This is not entirely what the specification says. We should
    # investigate it more closely.
    until @open_elements.empty?
      name = @open_elements.last.name
      break if name == exclude || !IMPLIED_END_TAG_ELEMENTS.include?(name)
      @open_elements.pop
    end
  end

  # Return the document node.
  def get_document
    @document
  end

  # Return a new fragment holding the children of the first open element.
  def get_fragment
    #assert @inner_html
    fragment = @fragmentClass.new
    @open_elements[0].reparentChildren(fragment)
    fragment
  end

  # Serialize the subtree of node in the format required by unit tests
  # node - the node from which to start serializing
  # Abstract: concrete tree builders must override this.
  def testSerializer(node)
    raise NotImplementedError
  end

  private

  # True when entry ends the backwards walk of the reconstruction
  # algorithm: it is a Marker or is still on the stack of open elements.
  def reconstruction_boundary?(entry)
    entry == Marker || @open_elements.include?(entry)
  end

end
332
+ end
333
+ end
334
+ end
@@ -0,0 +1,231 @@
1
+ require 'html5/treebuilders/base'
2
+ require 'rubygems'
3
+ require 'hpricot'
4
+ require 'forwardable'
5
+
6
+ module HTML5
7
+ module TreeBuilders
8
+ module Hpricot
9
+
10
# Tree node that wraps an Hpricot object. The wrapper keeps its own
# childNodes/parent links and mirrors every change onto the wrapped
# object held in #hpricot.
class Node < Base::Node
  extend Forwardable

  def_delegators :@hpricot, :name

  # The wrapped Hpricot object
  attr_accessor :hpricot

  # Builds the wrapped object from the class returned by the subclass's
  # hpricot_class method.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new name
  end

  # Append node to this node's children. A text node appended directly
  # after another text node is merged into it instead of being added as
  # a separate child. In both cases node is detached from the children of
  # its previous Hpricot parent.
  def appendChild(node)
    last_child = childNodes.last
    if node.kind_of?(TextNode) and last_child.kind_of?(TextNode)
      last_child.hpricot.content = last_child.hpricot.content + node.hpricot.content
    else
      childNodes << node
      hpricot.children << node.hpricot
    end
    if (oldparent = node.hpricot.parent) != nil
      # Guard the lookup: delete_at(nil) raises a TypeError when the old
      # parent no longer lists the node.
      position = oldparent.children.index(node.hpricot)
      oldparent.children.delete_at(position) unless position.nil?
    end
    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Remove node from this node's children and clear its parent links.
  # A node that is not present in the wrapped object's children is
  # tolerated.
  def removeChild(node)
    childNodes.delete(node)
    position = hpricot.children.index(node.hpricot)
    hpricot.children.delete_at(position) unless position.nil?
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Insert data as a text node, before the child node `before` when one
  # is given, otherwise at the end.
  def insertText(data, before=nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Insert node before refNode in this node's children. A text node
  # inserted directly after another text node is merged into it instead.
  #
  # Raises ArgumentError if refNode is not a child of this node.
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, "refNode is not a child of this node" if index.nil?

    previous = index > 0 ? childNodes[index - 1] : nil
    if node.kind_of?(TextNode) and previous.kind_of?(TextNode)
      # Merge through #content, exactly as appendChild does, so both merge
      # paths treat the text the same way.
      previous.hpricot.content = previous.hpricot.content + node.hpricot.content
    else
      refNode.hpricot.parent.insert_before(node.hpricot, refNode.hpricot)
      childNodes.insert(index, node)
      # Keep the wrapper's parent link in step with the tree.
      node.parent = self
    end
  end

  # Return true if the node has any child nodes, false otherwise.
  def hasContent
    childNodes.any?
  end
end
65
+
66
# Element node wrapping an Hpricot::Elem.
class Element < Node
  def self.hpricot_class
    ::Hpricot::Elem
  end

  # Replaces the object built by Node#initialize with an Elem constructed
  # from a start tag carrying the element name.
  def initialize(name)
    super(name)

    @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
  end

  # The element name, read from the wrapped start tag.
  def name
    @hpricot.stag.name
  end

  # Return a new element of the same class with the same name and
  # attributes, but no parent or children.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |key, value| copy.hpricot[key] = value }
    copy
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  #
  # Every other message is forwarded to the element's attributes.
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      @hpricot.stag.send(stag_attributes_method)[k] = v
    end

    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
    end

    def method_missing(*a, &b)
      @hpricot.attributes.send(*a, &b)
    end

    # Keep respond_to? truthful about the messages method_missing
    # forwards.
    def respond_to_missing?(name, include_private = false)
      @hpricot.attributes.respond_to?(name, include_private) || super
    end
  end

  # A proxy over the wrapped element's attributes (see AttributeProxy).
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Copy every name/value pair of attrs onto the wrapped element.
  def attributes=(attrs)
    attrs.each { |name, value| @hpricot[name] = value }
  end

  # Render this element, its attributes (except xmlns) and its children,
  # one "\n|"-prefixed line each, with nested content indented two
  # further spaces.
  def printTree(indent=0)
    tree = "\n|#{' ' * indent}<#{name}>"
    indent += 2
    attributes.each do |attr_name, value|
      next if attr_name == 'xmlns'
      tree += "\n|#{' ' * indent}#{attr_name}=\"#{value}\""
    end
    childNodes.inject(tree) { |output, child| output + child.printTree(indent) }
  end
end
131
+
132
# Document root node wrapping an Hpricot::Doc.
class Document < Node
  def self.hpricot_class
    ::Hpricot::Doc
  end

  # The document has no name, so nil is handed to Node#initialize.
  def initialize
    super(nil)
  end

  # Render the document: a '#document' header followed by every child
  # rendered two spaces deeper than indent.
  def printTree(indent=0)
    child_indent = indent + 2
    rendered_children = childNodes.map { |child| child.printTree(child_indent) }
    '#document' + rendered_children.join
  end
end
145
+
146
# Doctype node wrapping an Hpricot::DocType.
class DocumentType < Node
  def_delegators :@hpricot, :public_id, :system_id

  def self.hpricot_class
    ::Hpricot::DocType
  end

  def initialize(name, public_id, system_id)
    # Node#initialize builds the wrapped object with a single argument.
    # The ArgumentError this can raise is ignored on purpose (per the
    # original note, the constructor needs 3 arguments); the wrapped
    # object is built properly just below.
    begin
      super(name)
    rescue ArgumentError # needs 3...
    end

    @hpricot = ::Hpricot::DocType.new(name, public_id, system_id)
  end

  # Render the doctype as a single "\n|"-prefixed line. A nil or empty
  # target is rendered as "<!DOCTYPE >".
  def printTree(indent=0)
    # String#any? does not exist on Ruby 1.9 and later (String stopped
    # being Enumerable), so test for emptiness directly.
    target = hpricot.target.to_s
    if target.empty?
      "\n|#{' ' * indent}<!DOCTYPE >"
    else
      "\n|#{' ' * indent}<!DOCTYPE #{target}>"
    end
  end
end
170
+
171
# Fragment node: an Element created with an empty name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Render only the children, each two spaces deeper than indent; the
  # fragment itself contributes no line.
  def printTree(indent=0)
    child_indent = indent + 2
    rendered_children = childNodes.map { |child| child.printTree(child_indent) }
    '' + rendered_children.join
  end
end
180
+
181
# Text node wrapping an Hpricot::Text.
class TextNode < Node
  def self.hpricot_class
    ::Hpricot::Text
  end

  # Go through Node#initialize so that parent, childNodes and _flags are
  # initialised like on every other node; it builds the wrapped
  # Hpricot::Text from data via hpricot_class.
  def initialize(data)
    super(data)
  end

  # Render the text, double-quoted, as a single "\n|"-prefixed line.
  def printTree(indent=0)
    "\n|#{' ' * indent}\"#{hpricot.content}\""
  end
end
190
+
191
# Comment node wrapping an Hpricot::Comment.
class CommentNode < Node
  def self.hpricot_class
    ::Hpricot::Comment
  end

  # Render the comment as a single "\n|"-prefixed line.
  def printTree(indent=0)
    padding = ' ' * indent
    comment_text = hpricot.content
    "\n|" + padding + "<!-- #{comment_text} -->"
  end
end
200
+
201
# Tree builder that produces Hpricot trees.
class TreeBuilder < Base::TreeBuilder
  def initialize
    @documentClass = Document
    @doctypeClass = DocumentType
    @elementClass = Element
    @commentClass = CommentNode
    @fragmentClass = DocumentFragment

    # Run the base initializer, which calls reset, now that the node
    # classes are in place; without it the builder has no document and
    # nil stacks until someone calls reset by hand.
    super
  end

  # Create a doctype node and append it to the document. DocumentType
  # takes the public and system identifiers as constructor arguments.
  def insertDoctype(name, public_id, system_id)
    doctype = @doctypeClass.new(name, public_id, system_id)
    @document.appendChild(doctype)
  end

  # Serialize the subtree of node in the format required by unit tests.
  def testSerializer(node)
    node.printTree
  end

  # Return the Hpricot object wrapped by the document node.
  def get_document
    @document.hpricot
  end

  # Build the fragment via the base implementation and return the
  # children of its wrapped Hpricot object. Note that this replaces
  # @document with the fragment.
  def get_fragment
    @document = super
    return @document.hpricot.children
  end
end
228
+
229
+ end
230
+ end
231
+ end