html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,334 @@
1
+ require 'html5/constants'
2
+
3
+ #XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
4
+
5
+ module HTML5
6
+
7
+ # The scope markers are inserted when entering buttons, object elements,
8
+ # marquees, table cells, and table captions, and are used to prevent formatting
9
+ # from "leaking" into tables, buttons, object elements, and marquees.
10
+ Marker = nil
11
+
12
+ module TreeBuilders
13
+ module Base
14
+
15
# Abstract node interface shared by every tree builder. This class only
# stores the bookkeeping attributes; every tree-mutating operation except
# reparentChildren raises NotImplementedError and is meant to be supplied
# by a concrete subclass.
class Node
  # Parent of this node, or nil (always nil for the document node)
  attr_accessor :parent

  # List of this node's children. It must hold every element child; other
  # node types are not required to appear in it
  attr_accessor :childNodes

  # List of miscellaneous flags that may be set on the node
  attr_accessor :_flags

  # name is accepted so subclasses share one constructor signature; this
  # base class does not store it.
  def initialize(name)
    @parent, @childNodes, @_flags = nil, [], []
  end

  # Add node as a child of this node.
  def appendChild(node)
    raise NotImplementedError
  end

  # Add data as text inside this node, placed just before the start of the
  # node insertBefore, or at the end of this node's text when it is nil.
  def insertText(data, insertBefore=nil)
    raise NotImplementedError
  end

  # Add node as a child of this node, immediately before refNode in the
  # child list. Implementations are meant to raise when refNode is not a
  # child of this node (the historical wording names ValueError, which is
  # not a Ruby exception class).
  def insertBefore(node, refNode)
    raise NotImplementedError
  end

  # Detach node from this node's children.
  def removeChild(node)
    raise NotImplementedError
  end

  # Hand every child of this node over to newParent and leave this node
  # with an empty child list. Exists so that trees which do not store text
  # as nodes can override it and move the text correctly.
  def reparentChildren(newParent)
    #XXX - should this method be made more general?
    @childNodes.each do |orphan|
      newParent.appendChild(orphan)
    end
    @childNodes = []
  end

  # Return a shallow copy of this node: same name and attributes, but no
  # parent and no children.
  def cloneNode
    raise NotImplementedError
  end

  # Return true if the node has children or text, false otherwise.
  def hasContent
    raise NotImplementedError
  end
end
75
+
76
# Base treebuilder implementation
#
# Keeps the state the parser manipulates while building a tree: the stack
# of open elements, the list of active formatting elements, the document,
# and the head/form pointers.
#
# Concrete builders must assign the following instance variables before
# reset is called (reset instantiates @documentClass):
#   @documentClass - class to use for the document root
#   @elementClass  - class to use for HTML elements
#   @commentClass  - class to use for comments
#   @doctypeClass  - class to use for doctypes
#   @fragmentClass - class to use for fragments
class TreeBuilder

  # Element names that generateImpliedEndTags closes implicitly.
  # XXX td, th and tr are not actually needed
  IMPLIED_END_TAG_NAMES = %w[dd dt li p td th tr].freeze

  attr_accessor :open_elements

  attr_accessor :activeFormattingElements

  attr_accessor :document

  attr_accessor :head_pointer

  attr_accessor :formPointer

  def initialize
    reset
  end

  # Discard all state and start a fresh document.
  def reset
    @open_elements = []
    @activeFormattingElements = []

    #XXX - rename these to headElement, formElement
    @head_pointer = nil
    @formPointer = nil

    self.insert_from_table = false

    @document = @documentClass.new
  end

  # Return true if an element named target is in scope on the stack of
  # open elements. The search walks from the most recently opened element
  # downwards and gives up at a 'table' or 'html' element, and -- unless
  # tableVariant is set -- at any element listed in SCOPING_ELEMENTS.
  #
  # Returns false when the stack is exhausted (including an empty stack).
  # The previous code called an undefined `assert` there and raised
  # NoMethodError.
  def elementInScope(target, tableVariant=false)
    @open_elements.reverse_each do |element|
      name = element.name
      return true if name == target
      return false if name == 'table' || name == 'html'
      return false if !tableVariant && SCOPING_ELEMENTS.include?(name)
    end
    false
  end

  # Re-open every formatting element that sits after the last marker /
  # last still-open entry in the list of active formatting elements.
  # Each such entry is cloned, inserted as a new open element, and the
  # list entry is replaced by the newly inserted element.
  #
  # The order of steps differs from the specification's wording, but the
  # outcome is intended to be the same.
  def reconstructActiveFormattingElements
    entries = @activeFormattingElements

    # Step 1: stop the algorithm when there's nothing to do.
    return if entries.empty?

    # Steps 2-6: scan backwards for the last entry that is a marker or is
    # already on the stack of open elements. Everything after it has to
    # be recreated. When that entry is the last one, nothing is recreated;
    # when no such entry exists (Step 4), everything is.
    boundary = entries.rindex do |entry|
      entry == Marker || @open_elements.include?(entry)
    end
    first = boundary ? boundary + 1 : 0

    # Steps 7-11: walk forward to the end of the list.
    first.upto(entries.length - 1) do |i|
      # Step 8
      clone = entries[i].cloneNode

      # Step 9
      element = insert_element(clone.name, clone.attributes)

      # Step 10
      entries[i] = element
    end
    nil
  end

  # Pop entries off the list of active formatting elements up to and
  # including the most recent marker (or until the list is empty).
  def clearActiveFormattingElements
    until @activeFormattingElements.empty?
      break if @activeFormattingElements.pop == Marker
    end
  end

  # Check if an element exists between the end of the active
  # formatting elements and the last marker. If it does, return it, else
  # return false
  def elementInActiveFormattingElements(name)
    @activeFormattingElements.reverse_each do |element|
      # Check for Marker first because if it's a Marker it doesn't have a
      # name attribute.
      break if element == Marker
      return element if element.name == name
    end
    false
  end

  # Create a doctype node and append it to the document.
  def insertDoctype(name, public_id, system_id)
    doctype = @doctypeClass.new(name)
    doctype.public_id = public_id
    doctype.system_id = system_id
    @document.appendChild(doctype)
  end

  # Append a comment to parent, or to the current open element when no
  # parent is given.
  def insert_comment(data, parent=nil)
    parent = @open_elements[-1] if parent.nil?
    parent.appendChild(@commentClass.new(data))
  end

  # Create an element but don't insert it anywhere
  def createElement(name, attributes)
    element = @elementClass.new(name)
    element.attributes = attributes
    element
  end

  # Switch the function used to insert an element from the
  # normal one to the misnested table one and back again
  def insert_from_table=(value)
    @insert_from_table = value
    @insert_element = value ? :insert_elementTable : :insert_elementNormal
  end

  # Insert an element using whichever strategy insert_from_table= chose.
  # Falls back to the normal strategy when no strategy has been selected
  # yet (a subclass initialize that skips reset leaves @insert_element
  # unset, which previously made this call send(nil, ...)).
  def insert_element(name, attributes)
    send(@insert_element || :insert_elementNormal, name, attributes)
  end

  # Create an element, append it to the current open element and push it
  # on the stack of open elements. Returns the new element.
  def insert_elementNormal(name, attributes)
    element = createElement(name, attributes)
    @open_elements.last.appendChild(element)
    @open_elements.push(element)
    element
  end

  # Create an element and insert it into the tree, foster-parenting it
  # when the current open element is one of TABLE_INSERT_MODE_ELEMENTS.
  # Returns the new element.
  def insert_elementTable(name, attributes)
    # Outside the table-ish elements this is an ordinary insertion; decide
    # that first so no throw-away element is built.
    unless TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements.last.name)
      return insert_elementNormal(name, attributes)
    end

    #We should be in the InTable mode. This means we want to do
    #special magic element rearranging
    element = createElement(name, attributes)
    parent, insertBefore = getTableMisnestedNodePosition
    if insertBefore.nil?
      parent.appendChild(element)
    else
      parent.insertBefore(element, insertBefore)
    end
    @open_elements.push(element)
    element
  end

  # Insert text into parent (default: the current open element). While
  # table insertion is active and the current open element is one of
  # TABLE_INSERT_MODE_ELEMENTS, the text is foster-parented instead and
  # the parent argument is ignored.
  def insertText(data, parent=nil)
    parent = @open_elements[-1] if parent.nil?

    if @insert_from_table && TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements[-1].name)
      #We should be in the InTable mode. This means we want to do
      #special magic element rearranging
      parent, insertBefore = getTableMisnestedNodePosition
      parent.insertText(data, insertBefore)
    else
      parent.insertText(data)
    end
  end

  # Get the foster parent element, and sibling to insert before
  # (or nil) when inserting a misnested table node
  #
  # Returns a two-element array [fosterParent, insertBefore].
  def getTableMisnestedNodePosition
    #The foster parent element is the one which comes before the most
    #recently opened table element
    lastTable = @open_elements.reverse.find { |element| element.name == "table" }

    # No table open: fall back to the bottom of the stack.
    return [@open_elements[0], nil] unless lastTable

    #XXX - we should really check that this parent is actually a
    #node here
    return [lastTable.parent, lastTable] if lastTable.parent

    # Parentless table: use the element opened just before it.
    [@open_elements[@open_elements.index(lastTable) - 1], nil]
  end

  # Pop open elements for as long as the current one is in
  # IMPLIED_END_TAG_NAMES and is not named exclude. Stops quietly when
  # the stack runs empty (the recursive original raised on nil there).
  #
  # XXX This is not entirely what the specification says. We should
  # investigate it more closely.
  def generateImpliedEndTags(exclude=nil)
    until @open_elements.empty?
      name = @open_elements[-1].name
      break if name == exclude || !IMPLIED_END_TAG_NAMES.include?(name)
      @open_elements.pop
    end
  end

  # Return the document built so far.
  def get_document
    @document
  end

  # Move the children of the bottom-most open element into a new fragment
  # and return that fragment.
  def get_fragment
    #assert @inner_html
    fragment = @fragmentClass.new
    @open_elements[0].reparentChildren(fragment)
    fragment
  end

  # Serialize the subtree of node in the format required by unit tests
  # node - the node from which to start serializing
  def testSerializer(node)
    raise NotImplementedError
  end

end
332
+ end
333
+ end
334
+ end
@@ -0,0 +1,231 @@
1
+ require 'html5/treebuilders/base'
2
+ require 'rubygems'
3
+ require 'hpricot'
4
+ require 'forwardable'
5
+
6
+ module HTML5
7
+ module TreeBuilders
8
+ module Hpricot
9
+
10
# Common behaviour for tree nodes backed by an Hpricot object. Each node
# keeps two views in step: the childNodes list inherited from Base::Node
# and the children of the wrapped Hpricot object.
class Node < Base::Node
  extend Forwardable

  def_delegators :@hpricot, :name

  # The wrapped Hpricot object
  attr_accessor :hpricot

  # Builds the wrapped object by passing name to the class returned by
  # the concrete subclass's hpricot_class method.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new name
  end

  # Append node as the last child. A text node appended right after
  # another text node is merged into it instead of being stored
  # separately.
  def appendChild(node)
    last_child = childNodes.last
    if node.kind_of?(TextNode) && last_child.kind_of?(TextNode)
      last_child.hpricot.content = last_child.hpricot.content + node.hpricot.content
    else
      childNodes << node
      hpricot.children << node.hpricot
    end

    # Detach the Hpricot object from its previous Hpricot parent, if any.
    # The lookup can miss; skip the removal then rather than calling
    # delete_at(nil), which raises TypeError.
    oldparent = node.hpricot.parent
    if oldparent
      position = oldparent.children.index(node.hpricot)
      oldparent.children.delete_at(position) if position
    end

    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Remove node from both child lists and clear its parent links. A node
  # whose Hpricot object is not among the Hpricot children is tolerated.
  def removeChild(node)
    childNodes.delete(node)
    position = hpricot.children.index(node.hpricot)
    hpricot.children.delete_at(position) if position
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Insert data as a text node, before the child `before` when given,
  # otherwise at the end.
  def insertText(data, before=nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Insert node immediately before the child refNode. A text node that
  # would land right after another text node is merged into it.
  #
  # Raises ArgumentError if refNode is not a child of this node.
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, "refNode is not a child of this node" if index.nil?

    previous = index > 0 ? childNodes[index-1] : nil
    if node.kind_of?(TextNode) && previous.kind_of?(TextNode)
      # Merge on #content, exactly as appendChild does (this branch used
      # #to_s before, which need not round-trip the stored content).
      previous.hpricot.content = previous.hpricot.content + node.hpricot.content
    else
      refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
      childNodes.insert(index, node)
      # Keep the wrapper-level parent link in step with appendChild.
      node.parent = self
    end
  end

  # True when this node has at least one child node.
  def hasContent
    childNodes.any?
  end
end
65
+
66
# Element node backed by an Hpricot::Elem.
class Element < Node
  def self.hpricot_class
    ::Hpricot::Elem
  end

  # The generic Node constructor runs first; the wrapped object is then
  # replaced by an Elem built from a proper start tag.
  def initialize(name)
    super(name)

    @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
  end

  # Tag name, read from the wrapped element's start tag.
  def name
    @hpricot.stag.name
  end

  # Return a new node of the same class with the same name and a copy of
  # every attribute, but no parent and no children.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |attr_name, value| copy.hpricot[attr_name] = value }
    copy
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  #
  # Every other message is forwarded to the element's attributes.
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      @hpricot.stag.send(stag_attributes_method)[k] = v
    end

    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
    end

    def method_missing(*a, &b)
      @hpricot.attributes.send(*a, &b)
    end

    # Keep respond_to? truthful about what method_missing forwards.
    def respond_to_missing?(name, include_private = false)
      @hpricot.attributes.respond_to?(name, include_private) || super
    end
  end

  # Attribute view of the wrapped element; see AttributeProxy.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Copy each name/value pair of attrs onto the wrapped element.
  def attributes=(attrs)
    attrs.each { |attr_name, value| @hpricot[attr_name] = value }
  end

  # Render this element, its attributes (except xmlns) and its children
  # in the "\n|<indent><name>" layout, children indented two further
  # columns.
  def printTree(indent=0)
    tree = "\n|#{' ' * indent}<#{name}>"
    indent += 2
    attributes.each do |attr_name, value|
      next if attr_name == 'xmlns'
      tree += "\n|#{' ' * indent}#{attr_name}=\"#{value}\""
    end
    childNodes.inject(tree) { |output, child| output + child.printTree(indent) }
  end
end
131
+
132
# Document root node, backed by an Hpricot::Doc.
class Document < Node
  def self.hpricot_class
    ::Hpricot::Doc
  end

  # A document has no name, so nil is handed to the Node constructor.
  def initialize
    super(nil)
  end

  # Render the document: the '#document' header followed by every child
  # rendered two columns deeper than indent.
  def printTree(indent=0)
    child_indent = indent + 2
    rendered = '#document'
    childNodes.each do |child|
      rendered = rendered + child.printTree(child_indent)
    end
    rendered
  end
end
145
+
146
# Doctype node, backed by an Hpricot::DocType.
class DocumentType < Node
  def_delegators :@hpricot, :public_id, :system_id

  def self.hpricot_class
    ::Hpricot::DocType
  end

  def initialize(name, public_id, system_id)
    # The Node constructor first runs the base initializer and then tries
    # hpricot_class.new(name). For DocType that one-argument call is
    # expected to raise ArgumentError ("needs 3"), which is deliberately
    # ignored: the wrapped object is built with all three values below.
    begin
      super(name)
    rescue ArgumentError # needs 3...
    end

    @hpricot = ::Hpricot::DocType.new(name, public_id, system_id)
  end

  # Render the doctype line; the name part is left blank when the wrapped
  # doctype has no target or an empty one.
  def printTree(indent=0)
    target = hpricot.target
    # String#any? only existed while String was Enumerable (Ruby 1.8);
    # an emptiness check gives the same answer and works everywhere.
    if target && !target.to_s.empty?
      "\n|#{' ' * indent}<!DOCTYPE #{target}>"
    else
      "\n|#{' ' * indent}<!DOCTYPE >"
    end
  end
end
170
+
171
# Fragment container: an Element with an empty tag name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Render only the children (each two columns deeper than indent); the
  # fragment itself contributes no line.
  def printTree(indent=0)
    child_indent = indent + 2
    rendered = ''
    childNodes.each do |child|
      rendered = rendered + child.printTree(child_indent)
    end
    rendered
  end
end
180
+
181
# Text node, backed by an Hpricot::Text.
class TextNode < Node
  def self.hpricot_class
    ::Hpricot::Text
  end

  # Goes through the Node constructor so that parent, childNodes and
  # _flags are initialised as well. The wrapped Hpricot::Text is created
  # there from data via hpricot_class. Previously only @hpricot was set,
  # leaving childNodes nil, so the inherited hasContent raised.
  def initialize(data)
    super(data)
  end

  # Render the text content, quoted, on its own line.
  def printTree(indent=0)
    "\n|#{' ' * indent}\"#{hpricot.content}\""
  end
end
190
+
191
# Comment node, backed by an Hpricot::Comment. Construction is inherited
# from Node, which passes its argument to hpricot_class.new.
class CommentNode < Node
  def self.hpricot_class
    ::Hpricot::Comment
  end

  # Render the comment on its own line in "<!-- ... -->" form.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
200
+
201
# Tree builder that produces Hpricot-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Only selects the node classes. It does not call super, so none of the
  # state set up by Base::TreeBuilder#reset exists until reset is called.
  def initialize
    @documentClass, @doctypeClass, @elementClass = Document, DocumentType, Element
    @commentClass, @fragmentClass = CommentNode, DocumentFragment
  end

  # Create a doctype from all three values at once and append it to the
  # document.
  def insertDoctype(name, public_id, system_id)
    @document.appendChild(@doctypeClass.new(name, public_id, system_id))
  end

  # Serialize node (and its subtree) using its printTree method.
  def testSerializer(node)
    node.printTree
  end

  # Return the Hpricot object wrapped by the document node.
  def get_document
    @document.hpricot
  end

  # Build the fragment via the base class, remember it as the current
  # document, and return the children of its wrapped Hpricot object.
  def get_fragment
    fragment = super
    @document = fragment
    fragment.hpricot.children
  end
end
228
+
229
+ end
230
+ end
231
+ end