oga 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +19 -0
  4. data/README.md +171 -0
  5. data/doc/DCO.md +25 -0
  6. data/doc/changelog.md +7 -0
  7. data/doc/css/common.css +76 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/ext/c/extconf.rb +13 -0
  10. data/ext/c/lexer.c +1518 -0
  11. data/ext/c/lexer.h +8 -0
  12. data/ext/c/lexer.rl +121 -0
  13. data/ext/c/liboga.c +6 -0
  14. data/ext/c/liboga.h +11 -0
  15. data/ext/java/Liboga.java +14 -0
  16. data/ext/java/org/liboga/xml/Lexer.java +829 -0
  17. data/ext/java/org/liboga/xml/Lexer.rl +151 -0
  18. data/ext/ragel/base_lexer.rl +323 -0
  19. data/lib/oga.rb +43 -0
  20. data/lib/oga/html/parser.rb +25 -0
  21. data/lib/oga/oga.rb +27 -0
  22. data/lib/oga/version.rb +3 -0
  23. data/lib/oga/xml/attribute.rb +111 -0
  24. data/lib/oga/xml/cdata.rb +24 -0
  25. data/lib/oga/xml/character_node.rb +39 -0
  26. data/lib/oga/xml/comment.rb +24 -0
  27. data/lib/oga/xml/doctype.rb +91 -0
  28. data/lib/oga/xml/document.rb +99 -0
  29. data/lib/oga/xml/element.rb +340 -0
  30. data/lib/oga/xml/lexer.rb +399 -0
  31. data/lib/oga/xml/namespace.rb +42 -0
  32. data/lib/oga/xml/node.rb +175 -0
  33. data/lib/oga/xml/node_set.rb +313 -0
  34. data/lib/oga/xml/parser.rb +556 -0
  35. data/lib/oga/xml/processing_instruction.rb +39 -0
  36. data/lib/oga/xml/pull_parser.rb +166 -0
  37. data/lib/oga/xml/querying.rb +32 -0
  38. data/lib/oga/xml/text.rb +16 -0
  39. data/lib/oga/xml/traversal.rb +48 -0
  40. data/lib/oga/xml/xml_declaration.rb +76 -0
  41. data/lib/oga/xpath/evaluator.rb +1748 -0
  42. data/lib/oga/xpath/lexer.rb +2043 -0
  43. data/lib/oga/xpath/node.rb +10 -0
  44. data/lib/oga/xpath/parser.rb +535 -0
  45. data/oga.gemspec +45 -0
  46. metadata +221 -0
@@ -0,0 +1,340 @@
1
+ module Oga
2
+ module XML
3
+ ##
4
+ # Class that contains information about an XML element such as the name,
5
+ # attributes and child nodes.
6
+ #
7
+ # @!attribute [rw] name
8
+ # The name of the element.
9
+ # @return [String]
10
+ #
11
+ # @!attribute [rw] namespace_name
12
+ # The name of the namespace.
13
+ # @return [String]
14
+ #
15
+ # @!attribute [rw] attributes
16
+ # The attributes of the element.
17
+ # @return [Array<Oga::XML::Attribute>]
18
+ #
19
+ # @!attribute [rw] namespaces
20
+ # The registered namespaces.
21
+ # @return [Hash]
22
+ #
23
+ class Element < Node
24
+ include Querying
25
+
26
+ attr_accessor :name, :namespace_name, :attributes, :namespaces
27
+
28
+ ##
29
+ # The attribute prefix/namespace used for registering element namespaces.
30
+ #
31
+ # @return [String]
32
+ #
33
+ XMLNS_PREFIX = 'xmlns'.freeze
34
+
35
##
# Builds a new element from an options Hash.
#
# The options are first handed to the parent class' constructor. Afterwards
# every attribute is linked to this element and any attribute that uses the
# "xmlns" prefix is converted into a registered namespace.
#
# @param [Hash] options
#
# @option options [String] :name The name of the element.
#
# @option options [String] :namespace_name The name of the namespace.
#
# @option options [Array<Oga::XML::Attribute>] :attributes The attributes
#  of the element as an Array. Defaults to an empty Array.
#
# @option options [Hash] :namespaces The namespaces registered on the
#  element. Defaults to an empty Hash.
#
def initialize(options = {})
  super

  @attributes     = options[:attributes] || []
  @namespaces     = options[:namespaces] || {}
  @name           = options[:name]
  @namespace_name = options[:namespace_name]

  link_attributes
  register_namespaces_from_attributes
end
56
+
57
##
# Looks up the first attribute that matches the given name. The name may
# carry a namespace prefix separated by a colon.
#
# @example
#  # find an attribute that only has the name "foo"
#  attribute('foo')
#
#  # find an attribute with namespace "foo" and name "bar"
#  attribute('foo:bar')
#
# @param [String|Symbol] name The name (with or without the namespace)
#  of the attribute.
#
# @return [Oga::XML::Attribute] The matching attribute, or `nil` when no
#  attribute matches.
#
def attribute(name)
  local_name, prefix = split_name(name)

  attributes.find do |candidate|
    attribute_matches?(candidate, prefix, local_name)
  end
end
82
+
83
+ alias_method :attr, :attribute
84
+
85
##
# Returns the value of the attribute with the given name, or `nil` when the
# element has no such attribute.
#
# @example
#  element.get('class') # => "container"
#
# @param [String|Symbol] name The name of the attribute, see {#attribute}.
#
# @see #attribute
#
def get(name)
  match = attribute(name)

  return nil unless match

  match.value
end
98
+
99
##
# Appends an attribute to the element and makes this element the owner of
# the attribute.
#
# @param [Oga::XML::Attribute] attribute
#
def add_attribute(attribute)
  attributes << attribute.tap { |added| added.element = self }
end
109
+
110
##
# Sets the value of an attribute to the given value. If the attribute does
# not exist it is created automatically.
#
# The name is split using the same helper as {#attribute}, so a name that
# can be looked up can also be created. This also means Symbols are
# accepted, just like in {#attribute}.
#
# @param [String|Symbol] name The name of the attribute, optionally
#  including the namespace (e.g. "xml:lang").
#
# @param [String] value The new value of the attribute.
#
def set(name, value)
  found = attribute(name)

  if found
    found.value = value
  else
    # Use split_name instead of a hand rolled String#split so that lookup
    # and creation agree on what the local name and the prefix are.
    local_name, ns = split_name(name)

    attr = Attribute.new(
      :name           => local_name,
      :namespace_name => ns,
      :value          => value
    )

    add_attribute(attr)
  end
end
140
+
141
##
# Returns the namespace of the element, looked up by `namespace_name` in the
# namespaces available to this element. A successful lookup is cached in
# `@namespace`.
#
# @return [Oga::XML::Namespace]
#
def namespace
  @namespace = available_namespaces[namespace_name] unless @namespace

  @namespace
end
149
+
150
##
# Returns the text of the child nodes by delegating to the `text` method of
# the child node set.
#
# @return [String]
#
def text
  child_nodes = children

  child_nodes.text
end
158
+
159
##
# Returns the text of the current element only, built by concatenating the
# text of every node returned by {#text_nodes}.
#
# @return [String]
#
def inner_text
  buffer = ''

  text_nodes.each { |text_node| buffer << text_node.text }

  buffer
end
173
+
174
##
# Collects the {Oga::XML::Text} nodes that are a direct child of this
# element into a new node set.
#
# @return [Oga::XML::NodeSet]
#
def text_nodes
  found = NodeSet.new

  children.each do |candidate|
    next unless candidate.is_a?(Text)

    found << candidate
  end

  found
end
189
+
190
##
# Sets the inner text of the current element to the given String. Existing
# text nodes that are a direct child of the element are removed, after which
# a new text node is appended to the children.
#
# @param [String] text
#
def inner_text=(text)
  stale = []

  # Collect first and remove afterwards. `remove` presumably deletes the
  # node from `children` (not visible in this file); doing that while
  # iterating `children` itself would skip the node that follows every
  # removed node, leaving adjacent text nodes behind.
  children.each do |child|
    stale << child if child.is_a?(Text)
  end

  stale.each do |node|
    node.remove
  end

  children << XML::Text.new(:text => text)
end
202
+
203
##
# Converts the element and its child elements to XML.
#
# The tag name is prefixed with the namespace (followed by a colon) when the
# element has one. Attributes are separated from the tag name and from each
# other by a single space.
#
# @return [String]
#
def to_xml
  ns   = namespace ? "#{namespace}:" : ''
  body = children.map(&:to_xml).join('')

  # Join with a space so multiple attributes don't run into each other
  # (`a="1"b="2"`). This assumes Attribute#to_xml returns a bare
  # `name="value"` pair without surrounding whitespace.
  attrs = attributes.map(&:to_xml).join(' ')
  attrs = " #{attrs}" unless attrs.empty?

  "<#{ns}#{name}#{attrs}>#{body}</#{ns}#{name}>"
end
221
+
222
##
# Returns a human readable description of the element. Only the name,
# namespace, attributes and children that are set and non empty are
# included.
#
# @return [String]
#
def inspect
  described = [:name, :namespace, :attributes, :children].map do |field|
    current = send(field)
    blank   = !current || (current.respond_to?(:empty?) && current.empty?)

    blank ? nil : "#{field}: #{current.inspect}"
  end

  "Element(#{described.compact.join(' ')})"
end
240
+
241
##
# Returns the type of this node, which is always `:element`.
#
# @return [Symbol]
#
def node_type
  :element
end
247
+
248
##
# Registers a new namespace on the current element, stored under the given
# name in {#namespaces}.
#
# @param [String] name
# @param [String] uri
#
# @raise [ArgumentError] When a namespace with the same name is already
#  registered on this element.
#
# @see Oga::XML::Namespace#initialize
#
def register_namespace(name, uri)
  existing = namespaces[name]

  raise ArgumentError, "The namespace #{name.inspect} already exists" if existing

  namespaces[name] = Namespace.new(:name => name, :uri => uri)
end
263
+
264
##
# Returns a Hash containing all the namespaces available to the current
# element: the namespaces registered on the element itself plus those of
# every ancestor that responds to `namespaces`.
#
# When the same name is registered on multiple levels the registration
# closest to this element wins, matching the scoping rules of XML
# namespaces. The Hashes of the element and its ancestors are not modified.
#
# @return [Hash]
#
def available_namespaces
  merged = namespaces
  node   = parent

  while node && node.respond_to?(:namespaces)
    # Hash#merge lets the argument win on duplicate keys, so the namespaces
    # gathered so far (which are closer to this element) are passed as the
    # argument instead of being used as the receiver.
    merged = node.namespaces.merge(merged)
    node   = node.parent
  end

  merged
end
281
+
282
+ private
283
+
284
##
# Registers a namespace for every attribute that uses the "xmlns" prefix,
# using the attribute name as the namespace name and the attribute value as
# the URI. These attributes are then dropped from the attributes list.
#
def register_namespaces_from_attributes
  declarations, regular = attributes.partition do |attr|
    # `namespace_name` is compared instead of `namespace.name` since "xmlns"
    # itself is not a registered namespace.
    attr.namespace_name && attr.namespace_name == XMLNS_PREFIX
  end

  declarations.each do |declaration|
    register_namespace(declaration.name, declaration.value)
  end

  self.attributes = regular
end
299
+
300
##
# Makes the current element the owner of each of its attributes.
#
def link_attributes
  attributes.each { |owned| owned.element = self }
end
308
+
309
##
# Splits a (possibly prefixed) name on colons and returns the last segment
# followed by the segment before it. A segment that is missing is returned
# as `nil`.
#
# @example
#  split_name('foo:bar') # => ["bar", "foo"]
#  split_name('bar')     # => ["bar", nil]
#
# @param [String|Symbol] name
# @return [Array] The local name followed by the namespace prefix.
#
def split_name(name)
  parts = name.to_s.split(':')

  [parts[-1], parts[-2]]
end
318
+
319
##
# Checks whether an attribute has the given name and namespace. When no
# namespace is given the attribute only matches if it has no namespace
# either.
#
# @param [Oga::XML::Attribute] attr
# @param [String] ns The namespace name to match, or `nil`.
# @param [String] name
# @return [TrueClass|FalseClass]
#
def attribute_matches?(attr, ns, name)
  same_name = attr.name == name

  same_ns =
    if ns
      attr.namespace.to_s == ns
    else
      same_name && !attr.namespace
    end

  same_name && same_ns
end
338
+ end # Element
339
+ end # XML
340
+ end # Oga
@@ -0,0 +1,399 @@
1
+ module Oga
2
+ module XML
3
+ ##
4
+ # Low level lexer that supports both XML and HTML (using an extra option).
5
+ # To lex HTML input set the `:html` option to `true` when creating an
6
+ # instance of the lexer:
7
+ #
8
+ lexer = Oga::XML::Lexer.new('<p>...</p>', :html => true)
9
+ #
10
+ # This lexer can process both String and IO instances. IO instances are
11
+ # processed on a line by line basis. This can greatly reduce memory usage
12
+ # in exchange for a slightly slower runtime.
13
+ #
14
+ # ## Thread Safety
15
+ #
16
+ # Since this class keeps track of an internal state you can not use the
17
+ # same instance between multiple threads at the same time. For example, the
18
+ # following will not work reliably:
19
+ #
20
+ # # Don't do this!
21
+ # lexer = Oga::XML::Lexer.new('....')
22
+ # threads = []
23
+ #
24
+ # 2.times do
25
+ # threads << Thread.new do
26
+ # lexer.advance do |*args|
27
+ # p args
28
+ # end
29
+ # end
30
+ # end
31
+ #
32
+ # threads.each(&:join)
33
+ #
34
+ # However, it is perfectly safe to use different instances per thread.
35
+ # There is no _global_ state used by this lexer.
36
+ #
37
+ # @!attribute [r] html
38
+ # @return [TrueClass|FalseClass]
39
+ #
40
+ class Lexer
41
+ attr_reader :html
42
+
43
##
# Names of the HTML void elements. When HTML lexing is enabled these
# elements are closed as soon as their opening tag ends.
#
# @return [Set]
#
HTML_VOID_ELEMENTS = Set.new(%w[
  area base br col command embed hr img input keygen link meta param source
  track wbr
])
67
+
68
##
# Stores the input and the HTML flag, then resets the lexer state.
#
# @param [String|IO] data The data to lex. This can either be a String or
#  an IO instance.
#
# @param [Hash] options
#
# @option options [TrueClass|FalseClass] :html When set to `true` the lexer
#  will treat the input as HTML instead of SGML/XML. This makes it possible
#  to lex HTML void elements such as `<link href="">`.
#
def initialize(data, options = {})
  @html = options[:html]
  @data = data

  reset
end
84
+
85
##
# Resets the internal state of the lexer: the line counter goes back to 1,
# the stack of open elements is emptied, IO input is rewound and the native
# lexer state is reset. Typically you don't need to call this method
# yourself as it's called by #lex after lexing the input.
#
def reset
  @line, @elements = 1, []

  @data.rewind if io_input?

  reset_native
end
98
+
99
##
# Yields the data to lex to the supplied block. IO input is yielded one line
# at a time, any other input is yielded in one go.
#
# @yieldparam [String] chunk
#
def read_data
  # Checking for #each_line is not an option as String defines it too, and
  # feeding a String line by line has no benefit over lexing it at once.
  return yield(@data) unless io_input?

  @data.each_line { |line| yield line }
end
117
+
118
##
# Returns `true` if the input is an instance of IO or StringIO, `false`
# otherwise.
#
# @return [TrueClass|FalseClass]
#
def io_input?
  return true if @data.is_a?(IO)

  @data.is_a?(StringIO)
end
126
+
127
##
# Gathers all the tokens for the input and returns them as an Array of
# `[type, value, line]` Arrays.
#
# This method resets the internal state of the lexer after consuming the
# input.
#
# @see #advance
# @return [Array]
#
def lex
  collected = []

  advance { |type, value, line| collected.push([type, value, line]) }

  reset

  collected
end
147
+
148
##
# Advances through the input and generates the corresponding tokens. Every
# chunk produced by #read_data is handed to `advance_native` (defined
# outside of this file).
#
# Each token is yielded to the supplied block as three separate arguments:
# the type (a Symbol), the value (either nil or a String) and the line
# number.
#
# The supplied block is stored in `@block` for the duration of the call and
# cleared again afterwards, also when an error is raised.
#
# This method does *not* reset the internal state of the lexer.
#
# @yieldparam [Symbol] type
# @yieldparam [String] value
# @yieldparam [Fixnum] line
#
def advance(&block)
  @block = block

  read_data { |chunk| advance_native(chunk) }
ensure
  @block = nil
end
176
+
177
##
# Returns the `html` option as a strict boolean.
#
# @return [TrueClass|FalseClass]
#
def html?
  html ? true : false
end
183
+
184
+ private
185
+
186
##
# Increments the current line number.
#
# @param [Fixnum] amount The amount of lines to advance.
# @return [Fixnum] The new line number.
#
def advance_line(amount = 1)
  @line = @line + amount
end
192
+
193
##
# Calls the block stored in `@block` with the information of the current
# token and the current line number.
#
# @param [Symbol] type The token type.
# @param [String] value The token value.
#
# @yieldparam [Symbol] type
# @yieldparam [String] value
# @yieldparam [Fixnum] line
#
def add_token(type, value = nil)
  current_line = @line

  @block.call(type, value, current_line)
end
206
+
207
##
# Returns the name of the element we're currently in, which is the last
# name on the stack of open elements (`nil` when the stack is empty).
#
# @return [String]
#
def current_element
  @elements[-1]
end
215
+
216
##
# Emits a `T_STRING` token. Called when processing single/double quoted
# strings.
#
# @param [String] value The data between the quotes.
#
def on_string(value)
  add_token(:T_STRING, value)
end
224
+
225
##
# Emits a `T_DOCTYPE_START` token without a value. Called when a doctype
# starts.
#
def on_doctype_start
  add_token(:T_DOCTYPE_START, nil)
end
231
+
232
##
# Emits a `T_DOCTYPE_TYPE` token. Called on the identifier specifying the
# type of the doctype.
#
# @param [String] value
#
def on_doctype_type(value)
  add_token(:T_DOCTYPE_TYPE, value)
end
240
+
241
##
# Emits a `T_DOCTYPE_NAME` token. Called on the identifier specifying the
# name of the doctype.
#
# @param [String] value
#
def on_doctype_name(value)
  add_token(:T_DOCTYPE_NAME, value)
end
249
+
250
##
# Emits a `T_DOCTYPE_END` token without a value. Called on the end of a
# doctype.
#
def on_doctype_end
  add_token(:T_DOCTYPE_END, nil)
end
256
+
257
##
# Emits a `T_DOCTYPE_INLINE` token. Called on an inline doctype block.
#
# @param [String] value
#
def on_doctype_inline(value)
  add_token(:T_DOCTYPE_INLINE, value)
end
265
+
266
##
# Emits a `T_CDATA` token. Called on a CDATA tag.
#
# @param [String] value
#
def on_cdata(value)
  add_token(:T_CDATA, value)
end
272
+
273
##
# Emits a `T_COMMENT` token. Called on a comment.
#
# @param [String] value
#
def on_comment(value)
  add_token(:T_COMMENT, value)
end
281
+
282
##
# Emits a `T_XML_DECL_START` token without a value. Called on the start of
# an XML declaration tag.
#
def on_xml_decl_start
  add_token(:T_XML_DECL_START, nil)
end
288
+
289
##
# Emits a `T_XML_DECL_END` token without a value. Called on the end of an
# XML declaration tag.
#
def on_xml_decl_end
  add_token(:T_XML_DECL_END, nil)
end
295
+
296
##
# Emits a `T_ELEM_START` token without a value. Called on the start of an
# element.
#
def on_element_start
  add_token(:T_ELEM_START, nil)
end
302
+
303
##
# Emits a `T_PROC_INS_START` token without a value. Called on the start of
# a processing instruction.
#
def on_proc_ins_start
  add_token(:T_PROC_INS_START, nil)
end
309
+
310
##
# Emits a `T_PROC_INS_NAME` token. Called on a processing instruction name.
#
# @param [String] value
#
def on_proc_ins_name(value)
  add_token(:T_PROC_INS_NAME, value)
end
318
+
319
##
# Emits a `T_PROC_INS_END` token without a value. Called on the end of a
# processing instruction.
#
def on_proc_ins_end
  add_token(:T_PROC_INS_END, nil)
end
325
+
326
##
# Emits a `T_ELEM_NAME` token. Called on the name of an element. When lexing
# HTML the name is also pushed onto the stack of open elements.
#
# @param [String] name The name of the element, including namespace.
#
def on_element_name(name)
  @elements.push(name) if html?

  add_token(:T_ELEM_NAME, name)
end
336
+
337
##
# Emits a `T_ELEM_NS` token. Called on the element namespace.
#
# @param [String] namespace
#
def on_element_ns(namespace)
  add_token(:T_ELEM_NS, namespace)
end
345
+
346
##
# Called on the closing `>` of the open tag of an element. When lexing HTML
# and the current element is a void element, the element is closed right
# away: a `T_ELEM_END` token is emitted and the element is popped from the
# stack of open elements.
#
def on_element_open_end
  return unless html? && HTML_VOID_ELEMENTS.include?(current_element)

  add_token(:T_ELEM_END)

  @elements.pop
end
355
+
356
##
# Emits a `T_ELEM_END` token. Called on the closing tag of an element. When
# lexing HTML the element is also popped from the stack of open elements.
#
def on_element_end
  add_token(:T_ELEM_END, nil)

  if html?
    @elements.pop
  end
end
364
+
365
##
# Called on regular text values. Empty values are ignored. For any other
# value a `T_TEXT` token is emitted, after which the line number is
# advanced by the number of newlines in the value.
#
# @param [String] value
#
def on_text(value)
  return if value.empty?

  add_token(:T_TEXT, value)

  newlines = value.count("\n")

  advance_line(newlines) if newlines > 0
end
379
+
380
##
# Emits a `T_ATTR_NS` token. Called on attribute namespaces.
#
# @param [String] value
#
def on_attribute_ns(value)
  add_token(:T_ATTR_NS, value)
end
388
+
389
##
# Emits a `T_ATTR` token. Called on tag attributes.
#
# @param [String] value
#
def on_attribute(value)
  add_token(:T_ATTR, value)
end
397
+ end # Lexer
398
+ end # XML
399
+ end # Oga