rubyjedi-oga 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +362 -0
  4. data/README.md +317 -0
  5. data/doc/css/common.css +77 -0
  6. data/doc/css_selectors.md +935 -0
  7. data/doc/manually_creating_documents.md +67 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/doc/xml_namespaces.md +63 -0
  10. data/ext/c/extconf.rb +11 -0
  11. data/ext/c/lexer.c +2595 -0
  12. data/ext/c/lexer.h +16 -0
  13. data/ext/c/lexer.rl +198 -0
  14. data/ext/c/liboga.c +6 -0
  15. data/ext/c/liboga.h +11 -0
  16. data/ext/java/Liboga.java +14 -0
  17. data/ext/java/org/liboga/xml/Lexer.java +1363 -0
  18. data/ext/java/org/liboga/xml/Lexer.rl +223 -0
  19. data/ext/ragel/base_lexer.rl +633 -0
  20. data/lib/oga.rb +57 -0
  21. data/lib/oga/blacklist.rb +40 -0
  22. data/lib/oga/css/lexer.rb +743 -0
  23. data/lib/oga/css/parser.rb +976 -0
  24. data/lib/oga/entity_decoder.rb +21 -0
  25. data/lib/oga/html/entities.rb +2150 -0
  26. data/lib/oga/html/parser.rb +25 -0
  27. data/lib/oga/html/sax_parser.rb +18 -0
  28. data/lib/oga/lru.rb +160 -0
  29. data/lib/oga/oga.rb +57 -0
  30. data/lib/oga/version.rb +3 -0
  31. data/lib/oga/whitelist.rb +20 -0
  32. data/lib/oga/xml/attribute.rb +136 -0
  33. data/lib/oga/xml/cdata.rb +17 -0
  34. data/lib/oga/xml/character_node.rb +37 -0
  35. data/lib/oga/xml/comment.rb +17 -0
  36. data/lib/oga/xml/default_namespace.rb +13 -0
  37. data/lib/oga/xml/doctype.rb +82 -0
  38. data/lib/oga/xml/document.rb +108 -0
  39. data/lib/oga/xml/element.rb +428 -0
  40. data/lib/oga/xml/entities.rb +122 -0
  41. data/lib/oga/xml/html_void_elements.rb +15 -0
  42. data/lib/oga/xml/lexer.rb +550 -0
  43. data/lib/oga/xml/namespace.rb +48 -0
  44. data/lib/oga/xml/node.rb +219 -0
  45. data/lib/oga/xml/node_set.rb +333 -0
  46. data/lib/oga/xml/parser.rb +631 -0
  47. data/lib/oga/xml/processing_instruction.rb +37 -0
  48. data/lib/oga/xml/pull_parser.rb +175 -0
  49. data/lib/oga/xml/querying.rb +56 -0
  50. data/lib/oga/xml/sax_parser.rb +192 -0
  51. data/lib/oga/xml/text.rb +66 -0
  52. data/lib/oga/xml/traversal.rb +50 -0
  53. data/lib/oga/xml/xml_declaration.rb +65 -0
  54. data/lib/oga/xpath/evaluator.rb +1798 -0
  55. data/lib/oga/xpath/lexer.rb +1958 -0
  56. data/lib/oga/xpath/parser.rb +622 -0
  57. data/oga.gemspec +45 -0
  58. metadata +227 -0
@@ -0,0 +1,17 @@
module Oga
  module XML
    ##
    # Node type representing an XML comment (`<!-- ... -->`).
    #
    # The comment body is read through `text`, which is presumably supplied by
    # the CharacterNode superclass (not visible here -- confirm).
    #
    class Comment < CharacterNode
      ##
      # Serializes the comment back to XML by wrapping its text in the comment
      # delimiters.
      #
      # @return [String]
      #
      def to_xml
        '<!--' + text.to_s + '-->'
      end
    end # Comment
  end # XML
end # Oga
@@ -0,0 +1,13 @@
module Oga
  module XML
    ##
    # Frozen namespace object used as the default XML namespace.
    #
    # NOTE(review): the URI below is the one the "Namespaces in XML"
    # specification reserves for the `xml` prefix, while the name used here is
    # `xmlns`. The value is kept exactly as-is; confirm this pairing is
    # intentional before relying on it.
    #
    # @return [Oga::XML::Namespace]
    #
    DEFAULT_NAMESPACE = Namespace.new(
      :uri  => 'http://www.w3.org/XML/1998/namespace',
      :name => 'xmlns'
    ).freeze
  end # XML
end # Oga
@@ -0,0 +1,82 @@
module Oga
  module XML
    ##
    # Class used for storing information about Doctypes.
    #
    # A doctype consists of a name plus an optional type, public ID, system ID
    # and set of inline rules. Every part is a plain accessor and may be nil.
    #
    class Doctype
      # The name of the doctype (e.g. "HTML").
      # @return [String]
      attr_accessor :name

      # The type of the doctype (e.g. "PUBLIC").
      # @return [String]
      attr_accessor :type

      # The public ID of the doctype.
      # @return [String]
      attr_accessor :public_id

      # The system ID of the doctype.
      # @return [String]
      attr_accessor :system_id

      # The inline doctype rules.
      # @return [String]
      attr_accessor :inline_rules

      ##
      # @example
      #  dtd = Doctype.new(:name => 'html', :type => 'PUBLIC')
      #
      # @param [Hash] options
      #
      # @option options [String] :name
      # @option options [String] :type
      # @option options [String] :public_id
      # @option options [String] :system_id
      # @option options [String] :inline_rules
      #
      def initialize(options = {})
        @name         = options[:name]
        @type         = options[:type]
        @public_id    = options[:public_id]
        @system_id    = options[:system_id]
        @inline_rules = options[:inline_rules]
      end

      ##
      # Converts the doctype back to XML.
      #
      # The name is always emitted; the type, the quoted public and system IDs
      # and the bracketed inline rules are appended (in that order) only when
      # they are set.
      #
      # @return [String]
      #
      def to_xml
        xml = "<!DOCTYPE #{name}"

        xml << " #{type}" if type
        xml << %Q{ "#{public_id}"} if public_id
        xml << %Q{ "#{system_id}"} if system_id
        xml << " [#{inline_rules}]" if inline_rules

        xml << '>'
      end

      ##
      # Inspects the doctype.
      #
      # Only attributes that are set and non-empty are listed.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :type, :public_id, :system_id, :inline_rules].each do |attr|
          value = send(attr)

          # Skip unset values, and empty ones when the value can tell us it is
          # empty (values that do not respond to #empty? are always shown).
          next if !value || (value.respond_to?(:empty?) && value.empty?)

          segments << "#{attr}: #{value.inspect}"
        end

        "Doctype(#{segments.join(' ')})"
      end
    end # Doctype
  end # XML
end # Oga
@@ -0,0 +1,108 @@
1
+ module Oga
2
+ module XML
3
+ ##
4
+ # Class used for storing information about an entire XML or HTML document.
5
+ # This includes the doctype, the XML declaration and the top-level child nodes.
6
+ # Querying and traversal behaviour is mixed in from the Querying and Traversal modules.
7
+ class Document
8
+ include Querying
9
+ include Traversal
10
+
11
+ # @return [Oga::XML::Doctype] the doctype, or nil when none is set
12
+ attr_accessor :doctype
13
+
14
+ # @return [Oga::XML::XmlDeclaration] the XML declaration, or nil when none is set
15
+ attr_accessor :xml_declaration
16
+
17
+ # The document type, either `:xml` or `:html`.
18
+ # @return [Symbol] defaults to `:xml`
19
+ attr_reader :type
20
+
21
+ ##
22
+ # @param [Hash] options
23
+ #
24
+ # @option options [Oga::XML::NodeSet|Array] :children
25
+ # @option options [Oga::XML::Doctype] :doctype
26
+ # @option options [Oga::XML::XmlDeclaration] :xml_declaration
27
+ # @option options [Symbol] :type Defaults to `:xml`.
28
+ #
29
+ def initialize(options = {})
30
+ @doctype = options[:doctype]
31
+ @xml_declaration = options[:xml_declaration]
32
+ @type = options[:type] || :xml
33
+
34
+ self.children = options[:children] if options[:children]
35
+ end
36
+
37
+ ##
38
+ # @return [Oga::XML::NodeSet] the child nodes; an empty NodeSet is created and memoized on first access
39
+ #
40
+ def children
41
+ @children ||= NodeSet.new([], self)
42
+ end
43
+
44
+ ##
45
+ # Sets the child nodes of the document.
46
+ # A NodeSet is stored as-is; any other value is wrapped in a new NodeSet.
47
+ # @param [Oga::XML::NodeSet|Array] nodes
48
+ #
49
+ def children=(nodes)
50
+ if nodes.is_a?(NodeSet)
51
+ @children = nodes
52
+ else
53
+ @children = NodeSet.new(nodes, self)
54
+ end
55
+ end
56
+
57
+ ##
58
+ # Converts the document and its child nodes to XML.
59
+ # When present, the XML declaration and the doctype are emitted first (in that order), each on its own line.
60
+ # @return [String]
61
+ #
62
+ def to_xml
63
+ xml = children.map(&:to_xml).join('')
64
+
65
+ if doctype
66
+ xml = doctype.to_xml + "\n" + xml.strip
67
+ end
68
+
69
+ if xml_declaration
70
+ xml = xml_declaration.to_xml + "\n" + xml.strip
71
+ end
72
+
73
+ xml
74
+ end
75
+
76
+ ##
77
+ # @return [TrueClass|FalseClass] true when the document type is `:html`
78
+ #
79
+ def html?
80
+ type.equal?(:html)
81
+ end
82
+
83
+ ##
84
+ # Inspects the document, listing the doctype, XML declaration and child
85
+ # nodes (each only when set) on separate lines.
86
+ #
87
+ # @return [String]
88
+ #
89
+ def inspect
90
+ segments = []
91
+
92
+ [:doctype, :xml_declaration, :children].each do |attr|
93
+ value = send(attr)
94
+
95
+ if value
96
+ segments << "#{attr}: #{value.inspect}"
97
+ end
98
+ end
99
+
100
+ <<-EOF.strip
101
+ Document(
102
+ #{segments.join("\n ")}
103
+ )
104
+ EOF
105
+ end
106
+ end # Document
107
+ end # XML
108
+ end # Oga
@@ -0,0 +1,428 @@
module Oga
  module XML
    ##
    # Class that contains information about an XML element such as the name,
    # attributes and child nodes.
    #
    class Element < Node
      include Querying

      # The namespace prefix of the element, if any.
      # @return [String]
      attr_reader :namespace_name

      # @return [String]
      attr_accessor :name

      # @return [Array<Oga::XML::Attribute>]
      attr_accessor :attributes

      # @return [Hash]
      attr_writer :namespaces

      ##
      # The attribute prefix/namespace used for registering element namespaces.
      #
      # @return [String]
      #
      XMLNS_PREFIX = 'xmlns'.freeze

      ##
      # @param [Hash] options
      #
      # @option options [String] :name The name of the element.
      #
      # @option options [String] :namespace_name The name of the namespace.
      #
      # @option options [Array<Oga::XML::Attribute>] :attributes The attributes
      #  of the element as an Array.
      #
      # @option options [Hash] :namespaces Namespaces to register on the
      #  element up front. Defaults to an empty Hash.
      #
      def initialize(options = {})
        super

        @name           = options[:name]
        @namespace_name = options[:namespace_name]
        @attributes     = options[:attributes] || []
        @namespaces     = options[:namespaces] || {}

        link_attributes
        register_namespaces_from_attributes
      end

      ##
      # Sets the namespace prefix and drops the memoized namespace so that it
      # is looked up again on the next call to {#namespace}.
      #
      # @param [String] name
      #
      def namespace_name=(name)
        @namespace_name = name
        @namespace      = nil
      end

      ##
      # Returns an attribute matching the given name (with or without the
      # namespace).
      #
      # @example
      #  # find an attribute that only has the name "foo"
      #  attribute('foo')
      #
      #  # find an attribute with namespace "foo" and name "bar"
      #  attribute('foo:bar')
      #
      # @param [String|Symbol] name The name (with or without the namespace)
      #  of the attribute.
      #
      # @return [Oga::XML::Attribute] The first matching attribute, or nil.
      #
      def attribute(name)
        name, ns = split_name(name)

        attributes.find { |attr| attribute_matches?(attr, ns, name) }
      end

      alias_method :attr, :attribute

      ##
      # Returns the value of the given attribute, or nil when the attribute
      # does not exist.
      #
      # @example
      #  element.get('class') # => "container"
      #
      # @see [#attribute]
      #
      def get(name)
        found = attribute(name)

        found ? found.value : nil
      end

      ##
      # Adds a new attribute to the element and links it to this element.
      #
      # @param [Oga::XML::Attribute] attribute
      #
      def add_attribute(attribute)
        attribute.element = self

        attributes << attribute
      end

      ##
      # Sets the value of an attribute to the given value. If the attribute does
      # not exist it is created automatically.
      #
      # @param [String|Symbol] name The name of the attribute, optionally
      #  including the namespace.
      #
      # @param [String] value The new value of the attribute.
      #
      def set(name, value)
        found = attribute(name)

        if found
          found.value = value
        else
          # Normalise to a String first: #attribute accepts Symbols, but a
          # Symbol does not respond to #include? or #split.
          name = name.to_s

          if name.include?(':')
            ns, name = name.split(':')
          else
            ns = nil
          end

          attr = Attribute.new(
            :name           => name,
            :namespace_name => ns,
            :value          => value
          )

          add_attribute(attr)
        end
      end

      ##
      # Removes an attribute from the element.
      #
      # @param [String] name The name (optionally including namespace prefix)
      #  of the attribute to remove.
      #
      # @return [Oga::XML::Attribute] The removed attribute, or nil when no
      #  attribute matched.
      #
      def unset(name)
        found = attribute(name)

        attributes.delete(found) if found
      end

      ##
      # Returns the namespace of the element. The element's own prefix is
      # looked up first, falling back to whatever is registered under "xmlns".
      #
      # @return [Oga::XML::Namespace]
      #
      def namespace
        unless @namespace
          available  = available_namespaces
          @namespace = available[namespace_name] || available[XMLNS_PREFIX]
        end

        @namespace
      end

      ##
      # Returns the namespaces registered on this element, or an empty Hash in
      # case of an HTML element.
      #
      # @return [Hash]
      #
      def namespaces
        html? ? {} : @namespaces
      end

      ##
      # Returns true if the current element resides in the default XML
      # namespace, or has no namespace at all.
      #
      # @return [TrueClass|FalseClass]
      #
      def default_namespace?
        namespace == DEFAULT_NAMESPACE || namespace.nil?
      end

      ##
      # Returns the text of all child nodes joined together.
      #
      # @return [String]
      #
      def text
        children.text
      end

      ##
      # Returns the text of the current element only.
      #
      # @return [String]
      #
      def inner_text
        text = ''

        text_nodes.each do |node|
          text << node.text
        end

        text
      end

      ##
      # Returns any {Oga::XML::Text} nodes that are a direct child of this
      # element.
      #
      # @return [Oga::XML::NodeSet]
      #
      def text_nodes
        nodes = NodeSet.new

        children.each do |child|
          nodes << child if child.is_a?(Text)
        end

        nodes
      end

      ##
      # Sets the inner text of the current element to the given String. All
      # existing child nodes are replaced by a single text node.
      #
      # @param [String] text
      #
      def inner_text=(text)
        text_node = XML::Text.new(:text => text)
        @children = NodeSet.new([text_node], self)
      end

      ##
      # Converts the element and its child elements to XML.
      #
      # @return [String]
      #
      def to_xml
        if namespace_name
          full_name = "#{namespace_name}:#{name}"
        else
          full_name = name
        end

        body  = children.map(&:to_xml).join('')
        attrs = ''

        attributes.each do |attr|
          attrs << " #{attr.to_xml}"
        end

        if self_closing?
          "<#{full_name}#{attrs} />"
        else
          "<#{full_name}#{attrs}>#{body}</#{full_name}>"
        end
      end

      ##
      # Inspects the element, listing only the parts that are set and
      # non-empty.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :attributes, :children].each do |attr|
          value = send(attr)

          next if !value || (value.respond_to?(:empty?) && value.empty?)

          segments << "#{attr}: #{value.inspect}"
        end

        "Element(#{segments.join(' ')})"
      end

      ##
      # Registers a new namespace for the current element and its child
      # elements.
      #
      # For HTML elements {#namespaces} returns a fresh Hash on every call, so
      # the registration below is not retained for them.
      #
      # @param [String] name
      # @param [String] uri
      # @param [TrueClass|FalseClass] flush Whether to flush the namespaces
      #  cache after registering.
      # @raise [ArgumentError] When the namespace is already registered.
      # @see [Oga::XML::Namespace#initialize]
      #
      def register_namespace(name, uri, flush = true)
        if namespaces[name]
          raise ArgumentError, "The namespace #{name.inspect} already exists"
        end

        namespaces[name] = Namespace.new(:name => name, :uri => uri)

        flush_namespaces_cache if flush
      end

      ##
      # Returns a Hash containing all the namespaces available to the current
      # element. Namespaces of the element itself take precedence over those
      # of its ancestors; the result is memoized until the cache is flushed.
      #
      # @return [Hash]
      #
      def available_namespaces
        # HTML(5) completely ignores namespaces
        unless @available_namespaces
          if html?
            @available_namespaces = {}
          else
            merged = namespaces.dup
            node   = parent

            # Walk up the ancestors, stopping at the first one that does not
            # expose namespaces.
            while node && node.respond_to?(:namespaces)
              node.namespaces.each do |prefix, ns|
                merged[prefix] ||= ns
              end

              node = node.parent
            end

            @available_namespaces = merged
          end
        end

        @available_namespaces
      end

      ##
      # Returns `true` if the element is a self-closing element. An element
      # without children is self-closing, except in an HTML document where
      # only the elements allowed by HTML_VOID_ELEMENTS may be.
      #
      # @return [TrueClass|FalseClass]
      #
      def self_closing?
        self_closing = children.empty?
        root         = root_node

        if root.is_a?(Document) && root.html? &&
             !HTML_VOID_ELEMENTS.allow?(name)
          self_closing = false
        end

        self_closing
      end

      ##
      # Flushes the namespaces cache of the current element and all its child
      # elements.
      #
      def flush_namespaces_cache
        @available_namespaces = nil
        @namespace            = nil

        children.each do |child|
          child.flush_namespaces_cache if child.is_a?(Element)
        end
      end

      private

      ##
      # Registers namespaces based on any "xmlns" attributes.
      #
      def register_namespaces_from_attributes
        flush = false

        attributes.each do |attr|
          # We're using `namespace_name` opposed to `namespace.name` as "xmlns"
          # is not a registered namespace.
          if attr.name == XMLNS_PREFIX || attr.namespace_name == XMLNS_PREFIX
            flush = true

            # Ensures we only flush the cache once instead of flushing it on
            # every register_namespace call.
            register_namespace(attr.name, attr.value, false)
          end
        end

        flush_namespaces_cache if flush
      end

      ##
      # Links all attributes to the current element.
      #
      def link_attributes
        attributes.each do |attr|
          attr.element = self
        end
      end

      ##
      # Splits a (possibly prefixed) name on ":".
      #
      # @param [String] name
      # @return [Array] The local name followed by the prefix (nil when the
      #  name has no prefix).
      #
      def split_name(name)
        segments = name.to_s.split(':')

        [segments.pop, segments.pop]
      end

      ##
      # @param [Oga::XML::Attribute] attr
      # @param [String] ns
      # @param [String] name
      # @return [TrueClass|FalseClass]
      #
      def attribute_matches?(attr, ns, name)
        # A different name can never match, so don't bother resolving the
        # attribute's namespace in that case.
        return false unless attr.name == name

        if ns
          attr.namespace.to_s == ns
        else
          !attr.namespace
        end
      end
    end # Element
  end # XML
end # Oga