rubyjedi-oga 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +362 -0
  4. data/README.md +317 -0
  5. data/doc/css/common.css +77 -0
  6. data/doc/css_selectors.md +935 -0
  7. data/doc/manually_creating_documents.md +67 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/doc/xml_namespaces.md +63 -0
  10. data/ext/c/extconf.rb +11 -0
  11. data/ext/c/lexer.c +2595 -0
  12. data/ext/c/lexer.h +16 -0
  13. data/ext/c/lexer.rl +198 -0
  14. data/ext/c/liboga.c +6 -0
  15. data/ext/c/liboga.h +11 -0
  16. data/ext/java/Liboga.java +14 -0
  17. data/ext/java/org/liboga/xml/Lexer.java +1363 -0
  18. data/ext/java/org/liboga/xml/Lexer.rl +223 -0
  19. data/ext/ragel/base_lexer.rl +633 -0
  20. data/lib/oga.rb +57 -0
  21. data/lib/oga/blacklist.rb +40 -0
  22. data/lib/oga/css/lexer.rb +743 -0
  23. data/lib/oga/css/parser.rb +976 -0
  24. data/lib/oga/entity_decoder.rb +21 -0
  25. data/lib/oga/html/entities.rb +2150 -0
  26. data/lib/oga/html/parser.rb +25 -0
  27. data/lib/oga/html/sax_parser.rb +18 -0
  28. data/lib/oga/lru.rb +160 -0
  29. data/lib/oga/oga.rb +57 -0
  30. data/lib/oga/version.rb +3 -0
  31. data/lib/oga/whitelist.rb +20 -0
  32. data/lib/oga/xml/attribute.rb +136 -0
  33. data/lib/oga/xml/cdata.rb +17 -0
  34. data/lib/oga/xml/character_node.rb +37 -0
  35. data/lib/oga/xml/comment.rb +17 -0
  36. data/lib/oga/xml/default_namespace.rb +13 -0
  37. data/lib/oga/xml/doctype.rb +82 -0
  38. data/lib/oga/xml/document.rb +108 -0
  39. data/lib/oga/xml/element.rb +428 -0
  40. data/lib/oga/xml/entities.rb +122 -0
  41. data/lib/oga/xml/html_void_elements.rb +15 -0
  42. data/lib/oga/xml/lexer.rb +550 -0
  43. data/lib/oga/xml/namespace.rb +48 -0
  44. data/lib/oga/xml/node.rb +219 -0
  45. data/lib/oga/xml/node_set.rb +333 -0
  46. data/lib/oga/xml/parser.rb +631 -0
  47. data/lib/oga/xml/processing_instruction.rb +37 -0
  48. data/lib/oga/xml/pull_parser.rb +175 -0
  49. data/lib/oga/xml/querying.rb +56 -0
  50. data/lib/oga/xml/sax_parser.rb +192 -0
  51. data/lib/oga/xml/text.rb +66 -0
  52. data/lib/oga/xml/traversal.rb +50 -0
  53. data/lib/oga/xml/xml_declaration.rb +65 -0
  54. data/lib/oga/xpath/evaluator.rb +1798 -0
  55. data/lib/oga/xpath/lexer.rb +1958 -0
  56. data/lib/oga/xpath/parser.rb +622 -0
  57. data/oga.gemspec +45 -0
  58. metadata +227 -0
@@ -0,0 +1,17 @@
1
module Oga
  module XML
    ##
    # Node class representing an XML comment (`<!-- ... -->`).
    #
    # All state lives in the CharacterNode superclass; this class only adds
    # serialisation.
    #
    class Comment < CharacterNode
      ##
      # Serialises the comment by wrapping the node's text in the XML comment
      # delimiters.
      #
      # `text` is not defined in this class; presumably it is inherited from
      # CharacterNode (verify there).
      #
      # @return [String]
      #
      def to_xml
        body = text

        "<!--#{body}-->"
      end
    end # Comment
  end # XML
end # Oga
@@ -0,0 +1,13 @@
1
module Oga
  module XML
    ##
    # The default XML namespace, registered under the name "xmlns" and frozen
    # so the shared instance cannot be modified.
    #
    # NOTE(review): the URI below is the one the "Namespaces in XML"
    # recommendation binds to the `xml` prefix; the `xmlns` prefix is bound to
    # "http://www.w3.org/2000/xmlns/". Confirm this value is intentional
    # before relying on it for URI comparisons.
    #
    # @return [Oga::XML::Namespace]
    #
    DEFAULT_NAMESPACE = Namespace.new(
      name: 'xmlns',
      uri:  'http://www.w3.org/XML/1998/namespace'
    ).freeze
  end # XML
end # Oga
@@ -0,0 +1,82 @@
1
module Oga
  module XML
    ##
    # Class used for storing information about Doctypes.
    #
    class Doctype
      # The name of the doctype (e.g. "HTML").
      # @return [String]
      attr_accessor :name

      # The type of the doctype (e.g. "PUBLIC").
      # @return [String]
      attr_accessor :type

      # The public ID of the doctype.
      # @return [String]
      attr_accessor :public_id

      # The system ID of the doctype.
      # @return [String]
      attr_accessor :system_id

      # The inline doctype rules.
      # @return [String]
      attr_accessor :inline_rules

      ##
      # @example
      #  dtd = Doctype.new(:name => 'html', :type => 'PUBLIC')
      #
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :type
      # @option options [String] :public_id
      # @option options [String] :system_id
      # @option options [String] :inline_rules
      #
      def initialize(options = {})
        @name         = options[:name]
        @type         = options[:type]
        @public_id    = options[:public_id]
        @system_id    = options[:system_id]
        @inline_rules = options[:inline_rules]
      end

      ##
      # Converts the doctype back to XML.
      #
      # Only the segments that are set are emitted, in the order: type,
      # public ID, system ID, inline rules.
      #
      # @return [String]
      #
      def to_xml
        segments = "<!DOCTYPE #{name}"

        segments << " #{type}" if type
        segments << %Q{ "#{public_id}"} if public_id
        segments << " #{quote_system_id(system_id)}" if system_id
        segments << " [#{inline_rules}]" if inline_rules

        segments + '>'
      end

      ##
      # Inspects the doctype. Attributes that are nil or empty are left out.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :type, :public_id, :system_id, :inline_rules].each do |attr|
          value = send(attr)

          if value && !value.empty?
            segments << "#{attr}: #{value.inspect}"
          end
        end

        "Doctype(#{segments.join(' ')})"
      end

      private

      ##
      # Wraps a system ID in quotes. XML allows a system literal to be
      # delimited by either quote character, so single quotes are used when
      # the ID itself contains a double quote (and no single quote); otherwise
      # double quotes are used, as before.
      #
      # @param [#to_s] id
      # @return [String]
      #
      def quote_system_id(id)
        text  = id.to_s
        quote = text.include?('"') && !text.include?("'") ? "'" : '"'

        "#{quote}#{text}#{quote}"
      end
    end # Doctype
  end # XML
end # Oga
@@ -0,0 +1,108 @@
1
module Oga
  module XML
    ##
    # Represents an entire XML document: the optional doctype, the optional
    # XML declaration, the document type and the top-level child nodes.
    #
    class Document
      include Querying
      include Traversal

      # @return [Oga::XML::Doctype]
      attr_accessor :doctype

      # @return [Oga::XML::XmlDeclaration]
      attr_accessor :xml_declaration

      # The document type, either `:xml` or `:html`.
      # @return [Symbol]
      attr_reader :type

      ##
      # Builds a document from an options Hash. The type falls back to `:xml`
      # when none is given; children are only assigned when supplied.
      #
      # @param [Hash] options
      #
      # @option options [Oga::XML::NodeSet] :children
      # @option options [Oga::XML::Doctype] :doctype
      # @option options [Oga::XML::XmlDeclaration] :xml_declaration
      # @option options [Symbol] :type
      #
      def initialize(options = {})
        @doctype         = options[:doctype]
        @xml_declaration = options[:xml_declaration]
        @type            = options[:type] || :xml

        nodes = options[:children]

        self.children = nodes if nodes
      end

      ##
      # Returns the child nodes, lazily creating an empty NodeSet owned by this
      # document on first access.
      #
      # @return [Oga::XML::NodeSet]
      #
      def children
        @children ||= NodeSet.new([], self)
      end

      ##
      # Sets the child nodes of the document. A NodeSet is stored as-is; any
      # other value is wrapped in a new NodeSet owned by this document.
      #
      # @param [Oga::XML::NodeSet|Array] nodes
      #
      def children=(nodes)
        @children = nodes.is_a?(NodeSet) ? nodes : NodeSet.new(nodes, self)
      end

      ##
      # Converts the document and its child nodes to XML. When present, the
      # doctype is placed before the serialised children and the XML
      # declaration before that, each followed by a newline.
      #
      # @return [String]
      #
      def to_xml
        output = children.map(&:to_xml).join('')

        # Applied in this order so the XML declaration ends up first.
        [doctype, xml_declaration].each do |header|
          output = header.to_xml + "\n" + output.strip if header
        end

        output
      end

      ##
      # @return [TrueClass|FalseClass]
      #
      def html?
        type.equal?(:html)
      end

      ##
      # Inspects the document and its child nodes. Child nodes are indented for
      # each nesting level.
      #
      # @return [String]
      #
      def inspect
        lines = [:doctype, :xml_declaration, :children].map do |field|
          value = send(field)

          "#{field}: #{value.inspect}" if value
        end.compact

        # NOTE(review): the whitespace inside this heredoc and in the join
        # separator is reproduced exactly as it appears in the extracted
        # source, where indentation was collapsed. Confirm against the released
        # gem before relying on the exact layout of the output.
        <<-EOF.strip
Document(
#{lines.join("\n ")}
)
        EOF
      end
    end # Document
  end # XML
end # Oga
@@ -0,0 +1,428 @@
1
module Oga
  module XML
    ##
    # Class that contains information about an XML element such as the name,
    # attributes and child nodes.
    #
    class Element < Node
      include Querying

      # @return [String]
      attr_reader :namespace_name

      # @return [String]
      attr_accessor :name

      # @return [Array<Oga::XML::Attribute>]
      attr_accessor :attributes

      # @return [Hash]
      attr_writer :namespaces

      ##
      # The attribute prefix/namespace used for registering element namespaces.
      #
      # @return [String]
      #
      XMLNS_PREFIX = 'xmlns'.freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name The name of the element.
      #
      # @option options [String] :namespace_name The name of the namespace.
      #
      # @option options [Array<Oga::XML::Attribute>] :attributes The attributes
      #  of the element as an Array.
      #
      # @option options [Hash] :namespaces Namespaces to register on the
      #  element up front, keyed by name. Defaults to an empty Hash.
      #
      def initialize(options = {})
        super

        @name           = options[:name]
        @namespace_name = options[:namespace_name]
        @attributes     = options[:attributes] || []
        @namespaces     = options[:namespaces] || {}

        link_attributes
        register_namespaces_from_attributes
      end

      ##
      # Changes the namespace name and drops the cached namespace object so it
      # is resolved again on the next call to {#namespace}.
      #
      # @param [String] name
      #
      def namespace_name=(name)
        @namespace_name = name
        @namespace      = nil
      end

      ##
      # Returns an attribute matching the given name (with or without the
      # namespace).
      #
      # @example
      #  # find an attribute that only has the name "foo"
      #  attribute('foo')
      #
      #  # find an attribute with namespace "foo" and name "bar"
      #  attribute('foo:bar')
      #
      # @param [String|Symbol] name The name (with or without the namespace)
      #  of the attribute.
      #
      # @return [Oga::XML::Attribute] The first match, or nil when there is
      #  none.
      #
      def attribute(name)
        name, ns = split_name(name)

        attributes.find { |attr| attribute_matches?(attr, ns, name) }
      end

      alias_method :attr, :attribute

      ##
      # Returns the value of the given attribute, or nil if the attribute does
      # not exist.
      #
      # @example
      #  element.get('class') # => "container"
      #
      # @see [#attribute]
      #
      def get(name)
        found = attribute(name)

        found ? found.value : nil
      end

      ##
      # Adds a new attribute to the element and links it back to this element.
      #
      # @param [Oga::XML::Attribute] attribute
      #
      def add_attribute(attribute)
        attribute.element = self

        attributes << attribute
      end

      ##
      # Sets the value of an attribute to the given value. If the attribute does
      # not exist it is created automatically.
      #
      # The name is split with the same helper used by {#attribute}, so lookup
      # and creation always agree on what the namespace and local name are, and
      # Symbol names are accepted here just as they are for lookups.
      #
      # @param [String|Symbol] name The name of the attribute, optionally
      #  including the namespace.
      #
      # @param [String] value The new value of the attribute.
      #
      def set(name, value)
        found = attribute(name)

        if found
          found.value = value
        else
          local_name, ns = split_name(name)

          attr = Attribute.new(
            :name           => local_name,
            :namespace_name => ns,
            :value          => value
          )

          add_attribute(attr)
        end
      end

      ##
      # Removes an attribute from the element.
      #
      # @param [String] name The name (optionally including namespace prefix)
      #  of the attribute to remove.
      #
      # @return [Oga::XML::Attribute] The removed attribute, or nil when no
      #  attribute matched.
      #
      def unset(name)
        found = attribute(name)

        return attributes.delete(found) if found
      end

      ##
      # Returns the namespace of the element. The result is cached; when no
      # namespace is available under `namespace_name` the one registered as
      # "xmlns" is used instead.
      #
      # @return [Oga::XML::Namespace]
      #
      def namespace
        unless @namespace
          available  = available_namespaces
          @namespace = available[namespace_name] || available[XMLNS_PREFIX]
        end

        @namespace
      end

      ##
      # Returns the namespaces registered on this element, or an empty Hash in
      # case of an HTML element.
      #
      # @return [Hash]
      #
      def namespaces
        html? ? {} : @namespaces
      end

      ##
      # Returns true if the current element resides in the default XML
      # namespace (or has no namespace at all).
      #
      # @return [TrueClass|FalseClass]
      #
      def default_namespace?
        namespace == DEFAULT_NAMESPACE || namespace.nil?
      end

      ##
      # Returns the text of all child nodes joined together.
      #
      # @return [String]
      #
      def text
        children.text
      end

      ##
      # Returns the text of the current element only.
      #
      # @return [String]
      #
      def inner_text
        text = ''

        text_nodes.each do |node|
          text << node.text
        end

        text
      end

      ##
      # Returns any {Oga::XML::Text} nodes that are a direct child of this
      # element.
      #
      # @return [Oga::XML::NodeSet]
      #
      def text_nodes
        nodes = NodeSet.new

        children.each do |child|
          nodes << child if child.is_a?(Text)
        end

        nodes
      end

      ##
      # Sets the inner text of the current element to the given String. All
      # existing child nodes are replaced by a single text node.
      #
      # @param [String] text
      #
      def inner_text=(text)
        text_node = XML::Text.new(:text => text)
        @children = NodeSet.new([text_node], self)
      end

      ##
      # Converts the element and its child elements to XML.
      #
      # @return [String]
      #
      def to_xml
        full_name = namespace_name ? "#{namespace_name}:#{name}" : name

        body  = children.map(&:to_xml).join('')
        attrs = ''

        attributes.each do |attr|
          attrs << " #{attr.to_xml}"
        end

        if self_closing?
          "<#{full_name}#{attrs} />"
        else
          "<#{full_name}#{attrs}>#{body}</#{full_name}>"
        end
      end

      ##
      # Inspects the element. Values that are nil/false or empty are left out.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :attributes, :children].each do |attr|
          value = send(attr)

          if !value || (value.respond_to?(:empty?) && value.empty?)
            next
          end

          segments << "#{attr}: #{value.inspect}"
        end

        "Element(#{segments.join(' ')})"
      end

      ##
      # Registers a new namespace for the current element and its child
      # elements.
      #
      # @param [String] name
      # @param [String] uri
      # @param [TrueClass|FalseClass] flush When true the namespace caches of
      #  this element and its child elements are flushed afterwards.
      # @raise [ArgumentError] When a namespace with the same name is already
      #  registered on this element.
      # @see [Oga::XML::Namespace#initialize]
      #
      def register_namespace(name, uri, flush = true)
        if namespaces[name]
          raise ArgumentError, "The namespace #{name.inspect} already exists"
        end

        # For HTML elements `namespaces` returns a new, throwaway Hash, so the
        # registration below is not retained in that case.
        namespaces[name] = Namespace.new(:name => name, :uri => uri)

        flush_namespaces_cache if flush
      end

      ##
      # Returns a Hash containing all the namespaces available to the current
      # element. Namespaces registered on the element itself take precedence
      # over those of its ancestors, and closer ancestors take precedence over
      # more distant ones. The result is cached until the cache is flushed.
      #
      # @return [Hash]
      #
      def available_namespaces
        # HTML(5) completely ignores namespaces
        unless @available_namespaces
          if html?
            @available_namespaces = {}
          else
            merged = namespaces.dup
            node   = parent

            # Walk up the tree for as long as the ancestors expose namespaces.
            while node && node.respond_to?(:namespaces)
              node.namespaces.each do |prefix, ns|
                merged[prefix] = ns unless merged[prefix]
              end

              node = node.parent
            end

            @available_namespaces = merged
          end
        end

        @available_namespaces
      end

      ##
      # Returns `true` if the element is a self-closing element. An element
      # without children is self-closing, except in an HTML document where this
      # is restricted to the names allowed by HTML_VOID_ELEMENTS.
      #
      # @return [TrueClass|FalseClass]
      #
      def self_closing?
        self_closing = children.empty?
        root         = root_node

        if root.is_a?(Document) && root.html? &&
           !HTML_VOID_ELEMENTS.allow?(name)
          self_closing = false
        end

        self_closing
      end

      ##
      # Flushes the namespaces cache of the current element and all its child
      # elements.
      #
      def flush_namespaces_cache
        @available_namespaces = nil
        @namespace            = nil

        children.each do |child|
          child.flush_namespaces_cache if child.is_a?(Element)
        end
      end

      private

      ##
      # Registers namespaces based on any "xmlns" attributes.
      #
      def register_namespaces_from_attributes
        flush = false

        attributes.each do |attr|
          # We're using `namespace_name` opposed to `namespace.name` as "xmlns"
          # is not a registered namespace.
          if attr.name == XMLNS_PREFIX || attr.namespace_name == XMLNS_PREFIX
            flush = true

            # Ensures we only flush the cache once instead of flushing it on
            # every register_namespace call.
            register_namespace(attr.name, attr.value, false)
          end
        end

        flush_namespaces_cache if flush
      end

      ##
      # Links all attributes to the current element.
      #
      def link_attributes
        attributes.each do |attr|
          attr.element = self
        end
      end

      ##
      # Splits a (possibly namespaced) name on ":" and returns the last segment
      # as the local name and the segment before it, if any, as the namespace.
      #
      # @param [String|Symbol] name
      # @return [Array] `[local_name, namespace_or_nil]`
      #
      def split_name(name)
        segments = name.to_s.split(':')

        [segments.pop, segments.pop]
      end

      ##
      # Checks whether an attribute has the given name and namespace. Without
      # a namespace the attribute only matches when it has no namespace itself.
      #
      # @param [Oga::XML::Attribute] attr
      # @param [String] ns
      # @param [String] name
      # @return [TrueClass|FalseClass]
      #
      def attribute_matches?(attr, ns, name)
        name_matches = attr.name == name
        ns_matches   = false

        if ns
          ns_matches = attr.namespace.to_s == ns

        elsif name_matches && !attr.namespace
          ns_matches = true
        end

        name_matches && ns_matches
      end
    end # Element
  end # XML
end # Oga