oga 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +19 -0
  4. data/README.md +171 -0
  5. data/doc/DCO.md +25 -0
  6. data/doc/changelog.md +7 -0
  7. data/doc/css/common.css +76 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/ext/c/extconf.rb +13 -0
  10. data/ext/c/lexer.c +1518 -0
  11. data/ext/c/lexer.h +8 -0
  12. data/ext/c/lexer.rl +121 -0
  13. data/ext/c/liboga.c +6 -0
  14. data/ext/c/liboga.h +11 -0
  15. data/ext/java/Liboga.java +14 -0
  16. data/ext/java/org/liboga/xml/Lexer.java +829 -0
  17. data/ext/java/org/liboga/xml/Lexer.rl +151 -0
  18. data/ext/ragel/base_lexer.rl +323 -0
  19. data/lib/oga.rb +43 -0
  20. data/lib/oga/html/parser.rb +25 -0
  21. data/lib/oga/oga.rb +27 -0
  22. data/lib/oga/version.rb +3 -0
  23. data/lib/oga/xml/attribute.rb +111 -0
  24. data/lib/oga/xml/cdata.rb +24 -0
  25. data/lib/oga/xml/character_node.rb +39 -0
  26. data/lib/oga/xml/comment.rb +24 -0
  27. data/lib/oga/xml/doctype.rb +91 -0
  28. data/lib/oga/xml/document.rb +99 -0
  29. data/lib/oga/xml/element.rb +340 -0
  30. data/lib/oga/xml/lexer.rb +399 -0
  31. data/lib/oga/xml/namespace.rb +42 -0
  32. data/lib/oga/xml/node.rb +175 -0
  33. data/lib/oga/xml/node_set.rb +313 -0
  34. data/lib/oga/xml/parser.rb +556 -0
  35. data/lib/oga/xml/processing_instruction.rb +39 -0
  36. data/lib/oga/xml/pull_parser.rb +166 -0
  37. data/lib/oga/xml/querying.rb +32 -0
  38. data/lib/oga/xml/text.rb +16 -0
  39. data/lib/oga/xml/traversal.rb +48 -0
  40. data/lib/oga/xml/xml_declaration.rb +76 -0
  41. data/lib/oga/xpath/evaluator.rb +1748 -0
  42. data/lib/oga/xpath/lexer.rb +2043 -0
  43. data/lib/oga/xpath/node.rb +10 -0
  44. data/lib/oga/xpath/parser.rb +535 -0
  45. data/oga.gemspec +45 -0
  46. metadata +221 -0
@@ -0,0 +1,340 @@
1
+ module Oga
2
+ module XML
3
+ ##
4
+ # Class that contains information about an XML element such as the name,
5
+ # attributes and child nodes.
6
+ #
7
+ # @!attribute [rw] name
8
+ # The name of the element.
9
+ # @return [String]
10
+ #
11
+ # @!attribute [rw] namespace_name
12
+ # The name of the namespace.
13
+ # @return [String]
14
+ #
15
+ # @!attribute [rw] attributes
16
+ # The attributes of the element.
17
+ # @return [Array<Oga::XML::Attribute>]
18
+ #
19
+ # @!attribute [rw] namespaces
20
+ # The registered namespaces.
21
+ # @return [Hash]
22
+ #
23
+ class Element < Node
24
+ include Querying
25
+
26
+ attr_accessor :name, :namespace_name, :attributes, :namespaces
27
+
28
+ ##
29
+ # The attribute prefix/namespace used for registering element namespaces.
30
+ #
31
+ # @return [String]
32
+ #
33
+ XMLNS_PREFIX = 'xmlns'.freeze
34
+
35
##
# Builds a new element from an options Hash. The same options are handed
# to the parent initializer through `super`.
#
# @param [Hash] options
#
# @option options [String] :name The name of the element.
#
# @option options [String] :namespace_name The name of the namespace.
#
# @option options [Array<Oga::XML::Attribute>] :attributes The attributes
#  of the element as an Array, defaults to an empty Array.
#
# @option options [Hash] :namespaces The namespaces registered on the
#  element, defaults to an empty Hash.
#
def initialize(options = {})
  super

  @name, @namespace_name = options.values_at(:name, :namespace_name)
  @attributes = options[:attributes] || []
  @namespaces = options[:namespaces] || {}

  # Link the attributes to this element first, then turn any "xmlns"
  # attributes into registered namespaces.
  link_attributes
  register_namespaces_from_attributes
end
56
+
57
##
# Looks up the first attribute whose name (and, when given, namespace
# prefix) matches the supplied name.
#
# @example
#  # find an attribute that only has the name "foo"
#  attribute('foo')
#
#  # find an attribute with namespace "foo" and name "bar"
#  attribute('foo:bar')
#
# @param [String|Symbol] name The name (with or without the namespace)
#  of the attribute.
#
# @return [Oga::XML::Attribute] The matching attribute, or nil when there
#  is none.
#
def attribute(name)
  local_name, prefix = split_name(name)

  attributes.find do |candidate|
    attribute_matches?(candidate, prefix, local_name)
  end
end
82
+
83
+ alias_method :attr, :attribute
84
+
85
##
# Returns the value of the attribute with the given name, or nil when no
# such attribute exists.
#
# @example
#  element.get('class') # => "container"
#
# @param [String|Symbol] name The name of the attribute, looked up using
#  {#attribute}.
#
# @return [String]
#
# @see #attribute
#
def get(name)
  match = attribute(name)

  return unless match

  match.value
end
98
+
99
##
# Appends an attribute to the element and makes the attribute point back
# at this element.
#
# @param [Oga::XML::Attribute] attribute
#
# @return [Array<Oga::XML::Attribute>] The attribute list after appending.
#
def add_attribute(attribute)
  attribute.element = self

  attributes.push(attribute)
end
109
+
110
##
# Sets the value of an attribute to the given value. If the attribute does
# not exist it is created automatically.
#
# The name is split with {#split_name}, the same helper {#attribute} uses
# for the lookup, so the attribute that gets created is the one a later
# lookup with the same name will find. This also allows Symbol names.
#
# @param [String|Symbol] name The name of the attribute, optionally
#  including the namespace.
#
# @param [String] value The new value of the attribute.
#
def set(name, value)
  found = attribute(name)

  if found
    found.value = value
  else
    local_name, ns = split_name(name)

    attr = Attribute.new(
      :name           => local_name,
      :namespace_name => ns,
      :value          => value
    )

    add_attribute(attr)
  end
end
140
+
141
##
# Returns the namespace of the element, looked up by `namespace_name` in
# the namespaces available to it. A truthy result is cached in
# `@namespace`.
#
# @return [Oga::XML::Namespace]
#
def namespace
  @namespace = available_namespaces[namespace_name] unless @namespace

  @namespace
end
149
+
150
##
# Returns the text of the child nodes, as produced by calling `text` on
# the element's children.
#
# @return [String]
#
def text
  nodes = children

  nodes.text
end
158
+
159
##
# Returns the text of the current element only: the text of every node
# returned by {#text_nodes}, concatenated in order.
#
# @return [String]
#
def inner_text
  buffer = ''

  text_nodes.each { |text_node| buffer.concat(text_node.text) }

  buffer
end
173
+
174
##
# Collects every direct child that is an {Oga::XML::Text} node into a new
# node set, keeping the order of the children.
#
# @return [Oga::XML::NodeSet]
#
def text_nodes
  found = NodeSet.new

  children.each do |child|
    next unless child.is_a?(Text)

    found << child
  end

  found
end
189
+
190
##
# Sets the inner text of the current element to the given String. All
# existing text nodes that are a direct child of the element are removed
# and a single new text node is appended to the children.
#
# @param [String] text
#
def inner_text=(text)
  # Collect the text nodes before removing any of them. Removing a node
  # while iterating over `children` shifts the remaining nodes, which
  # makes the iteration skip the node following each removed one.
  stale = []

  children.each do |child|
    stale << child if child.is_a?(Text)
  end

  stale.each do |node|
    node.remove
  end

  children << XML::Text.new(:text => text)
end
202
+
203
##
# Converts the element and its child elements to XML.
#
# The output consists of the opening tag (optionally prefixed with the
# namespace and followed by the attributes), the XML of every child node
# and the closing tag.
#
# @return [String]
#
def to_xml
  ns = namespace
  prefix = ns ? "#{ns}:" : ''
  body = children.map(&:to_xml).join('')

  # Attributes are separated by a single space so that multiple attributes
  # do not run together. NOTE(review): this assumes Attribute#to_xml emits
  # no leading/trailing whitespace of its own - confirm.
  attrs = attributes.map(&:to_xml).join(' ')
  attrs = " #{attrs}" unless attrs.empty?

  return "<#{prefix}#{name}#{attrs}>#{body}</#{prefix}#{name}>"
end
221
+
222
##
# Returns a description of the element listing its name, namespace,
# attributes and children. Values that are nil/false or empty are left
# out.
#
# @return [String]
#
def inspect
  parts = [:name, :namespace, :attributes, :children].map do |field|
    value = send(field)
    blank = !value || (value.respond_to?(:empty?) && value.empty?)

    "#{field}: #{value.inspect}" unless blank
  end

  "Element(#{parts.compact.join(' ')})"
end
240
+
241
##
# Returns the type of this node, which is always `:element`.
#
# @return [Symbol]
#
def node_type
  :element
end
247
+
248
##
# Registers a new namespace for the current element and its child
# elements.
#
# @param [String] name
# @param [String] uri
#
# @return [Oga::XML::Namespace] The namespace that was registered.
#
# @raise [ArgumentError] When a namespace with the given name is already
#  registered on this element.
#
# @see Oga::XML::Namespace#initialize
#
def register_namespace(name, uri)
  taken = namespaces[name]

  raise ArgumentError, "The namespace #{name.inspect} already exists" if taken

  namespaces[name] = Namespace.new(:name => name, :uri => uri)
end
263
+
264
##
# Returns a Hash containing all the namespaces available to the current
# element: its own namespaces plus those of every ancestor that responds
# to `namespaces`.
#
# When the same name is registered on several levels the declaration
# closest to the current element wins, so an element's own namespaces
# always take precedence over those of its ancestors.
#
# The returned Hash is a new object; changing it does not change the
# namespaces registered on the element.
#
# @return [Hash]
#
def available_namespaces
  merged = namespaces.dup
  node = parent

  while node && node.respond_to?(:namespaces)
    # Hash#merge lets the argument win on duplicate keys. The namespaces
    # gathered so far are therefore passed as the argument so that they
    # shadow the ones of the (more distant) ancestor.
    merged = node.namespaces.merge(merged)
    node = node.parent
  end

  return merged
end
281
+
282
+ private
283
+
284
##
# Registers a namespace for every attribute that uses the "xmlns" prefix
# and drops those attributes from the attribute list afterwards.
#
# The prefix is read from `namespace_name` (not `namespace.name`) because
# "xmlns" itself is not a registered namespace.
#
def register_namespaces_from_attributes
  declarations, remaining = attributes.partition do |attr|
    attr.namespace_name == XMLNS_PREFIX
  end

  declarations.each do |declaration|
    register_namespace(declaration.name, declaration.value)
  end

  self.attributes = remaining
end
299
+
300
##
# Points the `element` of every attribute at the current element.
#
def link_attributes
  owner = self

  attributes.each { |attribute| attribute.element = owner }
end
308
+
309
##
# Splits a name on ":" and returns its last segment followed by the
# segment before it (nil when there is none).
#
# @example
#  split_name('foo:bar') # => ["bar", "foo"]
#  split_name('bar')     # => ["bar", nil]
#
# @param [String] name
# @return [Array] The local name followed by the namespace prefix.
#
def split_name(name)
  local_name, prefix = name.to_s.split(':').reverse

  [local_name, prefix]
end
318
+
319
##
# Checks if an attribute has the given name and namespace. Without a
# namespace to match, only attributes that have no namespace qualify.
#
# @param [Oga::XML::Attribute] attr
# @param [String] ns The namespace to match, compared with the String
#  form of the attribute's namespace.
# @param [String] name
# @return [TrueClass|FalseClass]
#
def attribute_matches?(attr, ns, name)
  same_name = attr.name == name

  same_ns =
    if ns
      attr.namespace.to_s == ns
    else
      same_name && !attr.namespace
    end

  same_name && same_ns
end
338
+ end # Element
339
+ end # XML
340
+ end # Oga
@@ -0,0 +1,399 @@
1
+ module Oga
2
+ module XML
3
##
# Low level lexer that supports both XML and HTML (using an extra option).
# To lex HTML input set the `:html` option to `true` when creating an
# instance of the lexer:
#
#     lexer = Oga::XML::Lexer.new('<br>', :html => true)
#
# This lexer can process both String and IO like instances. IO like
# instances are processed on a line by line basis. This can greatly reduce
# memory usage in exchange for a slightly slower runtime.
#
# ## Thread Safety
#
# Since this class keeps track of an internal state you can not use the
# same instance between multiple threads at the same time. For example, the
# following will not work reliably:
#
#     # Don't do this!
#     lexer   = Oga::XML::Lexer.new('....')
#     threads = []
#
#     2.times do
#       threads << Thread.new do
#         lexer.advance do |*args|
#           p args
#         end
#       end
#     end
#
#     threads.each(&:join)
#
# However, it is perfectly safe to use different instances per thread.
# There is no _global_ state used by this lexer.
#
# @!attribute [r] html
#  @return [TrueClass|FalseClass]
#
class Lexer
  attr_reader :html

  ##
  # Lower case names of the HTML void elements that should be handled when
  # HTML lexing is enabled.
  #
  # @return [Set]
  #
  HTML_VOID_ELEMENTS = Set.new([
    'area',
    'base',
    'br',
    'col',
    'command',
    'embed',
    'hr',
    'img',
    'input',
    'keygen',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr'
  ]).freeze

  ##
  # @param [String|IO] data The data to lex. This can either be a String or
  #  an IO like instance (see {#io_input?}).
  #
  # @param [Hash] options
  #
  # @option options [TrueClass|FalseClass] :html When set to `true` the
  #  lexer will treat the input as HTML instead of SGML/XML. This makes it
  #  possible to lex HTML void elements such as `<link href="">`.
  #
  def initialize(data, options = {})
    @data = data
    @html = options[:html]

    reset
  end

  ##
  # Resets the internal state of the lexer. Typically you don't need to
  # call this method yourself as it's called by #lex after lexing a given
  # String.
  #
  # IO like input is rewound when it supports rewinding.
  #
  def reset
    @line     = 1
    @elements = []

    @data.rewind if io_input? && @data.respond_to?(:rewind)

    reset_native
  end

  ##
  # Yields the data to lex to the supplied block. IO like input is yielded
  # one line at a time, any other input is yielded in one go.
  #
  # @yieldparam [String]
  #
  def read_data
    if io_input?
      @data.each_line do |line|
        yield line
      end
    else
      yield @data
    end
  end

  ##
  # Returns `true` if the input is an IO like object, false otherwise.
  #
  # An object is considered IO like when it is not a String and responds
  # to `each_line`. Strings are excluded explicitly as they also define
  # `each_line`, and lexing a String line by line has no benefit over
  # lexing it in one go.
  #
  # @return [TrueClass|FalseClass]
  #
  def io_input?
    return !@data.is_a?(String) && @data.respond_to?(:each_line)
  end

  ##
  # Gathers all the tokens for the input and returns them as an Array.
  #
  # This method resets the internal state of the lexer after consuming the
  # input.
  #
  # @see #advance
  # @return [Array] An Array of `[TYPE, VALUE, LINE]` Arrays.
  #
  def lex
    tokens = []

    advance do |type, value, line|
      tokens << [type, value, line]
    end

    reset

    return tokens
  end

  ##
  # Advances through the input and generates the corresponding tokens. Each
  # token is yielded to the supplied block as three separate values:
  #
  #     TYPE, VALUE, LINE
  #
  # The type is a symbol, the value is either nil or a String and the line
  # is the line number of the token.
  #
  # This method stores the supplied block in `@block` and resets it after
  # the lexer loop has finished.
  #
  # This method does *not* reset the internal state of the lexer.
  #
  # @yieldparam [Symbol] type
  # @yieldparam [String] value
  # @yieldparam [Fixnum] line
  #
  def advance(&block)
    @block = block

    read_data do |chunk|
      advance_native(chunk)
    end
  ensure
    @block = nil
  end

  ##
  # @return [TrueClass|FalseClass]
  #
  def html?
    return !!html
  end

  private

  ##
  # @param [Fixnum] amount The amount of lines to advance.
  #
  def advance_line(amount = 1)
    @line += amount
  end

  ##
  # Calls the block stored by {#advance} with the information of the
  # current token.
  #
  # @param [Symbol] type The token type.
  # @param [String] value The token value.
  #
  # @yieldparam [Symbol] type
  # @yieldparam [String] value
  # @yieldparam [Fixnum] line
  #
  def add_token(type, value = nil)
    @block.call(type, value, @line)
  end

  ##
  # Returns the name of the element we're currently in. Element names are
  # only tracked when HTML lexing is enabled.
  #
  # @return [String]
  #
  def current_element
    return @elements.last
  end

  ##
  # Called when processing single/double quoted strings.
  #
  # @param [String] value The data between the quotes.
  #
  def on_string(value)
    add_token(:T_STRING, value)
  end

  ##
  # Called when a doctype starts.
  #
  def on_doctype_start
    add_token(:T_DOCTYPE_START)
  end

  ##
  # Called on the identifier specifying the type of the doctype.
  #
  # @param [String] value
  #
  def on_doctype_type(value)
    add_token(:T_DOCTYPE_TYPE, value)
  end

  ##
  # Called on the identifier specifying the name of the doctype.
  #
  # @param [String] value
  #
  def on_doctype_name(value)
    add_token(:T_DOCTYPE_NAME, value)
  end

  ##
  # Called on the end of a doctype.
  #
  def on_doctype_end
    add_token(:T_DOCTYPE_END)
  end

  ##
  # Called on an inline doctype block.
  #
  # @param [String] value
  #
  def on_doctype_inline(value)
    add_token(:T_DOCTYPE_INLINE, value)
  end

  ##
  # Called on a CDATA tag.
  #
  # @param [String] value
  #
  def on_cdata(value)
    add_token(:T_CDATA, value)
  end

  ##
  # Called on a comment.
  #
  # @param [String] value
  #
  def on_comment(value)
    add_token(:T_COMMENT, value)
  end

  ##
  # Called on the start of an XML declaration tag.
  #
  def on_xml_decl_start
    add_token(:T_XML_DECL_START)
  end

  ##
  # Called on the end of an XML declaration tag.
  #
  def on_xml_decl_end
    add_token(:T_XML_DECL_END)
  end

  ##
  # Called on the start of an element.
  #
  def on_element_start
    add_token(:T_ELEM_START)
  end

  ##
  # Called on the start of a processing instruction.
  #
  def on_proc_ins_start
    add_token(:T_PROC_INS_START)
  end

  ##
  # Called on a processing instruction name.
  #
  # @param [String] value
  #
  def on_proc_ins_name(value)
    add_token(:T_PROC_INS_NAME, value)
  end

  ##
  # Called on the end of a processing instruction.
  #
  def on_proc_ins_end
    add_token(:T_PROC_INS_END)
  end

  ##
  # Called on the name of an element. When lexing HTML the name is also
  # pushed onto the stack of open elements.
  #
  # @param [String] name The name of the element.
  #
  def on_element_name(name)
    @elements << name if html?

    add_token(:T_ELEM_NAME, name)
  end

  ##
  # Called on the element namespace.
  #
  # @param [String] namespace
  #
  def on_element_ns(namespace)
    add_token(:T_ELEM_NS, namespace)
  end

  ##
  # Called on the closing `>` of the open tag of an element.
  #
  # When lexing HTML, void elements are closed right away. HTML element
  # names are case-insensitive so the name is lower cased before looking
  # it up; this way `<BR>` is treated the same as `<br>`.
  #
  def on_element_open_end
    name = current_element

    if html? && name && HTML_VOID_ELEMENTS.include?(name.downcase)
      add_token(:T_ELEM_END)
      @elements.pop
    end
  end

  ##
  # Called on the closing tag of an element.
  #
  def on_element_end
    add_token(:T_ELEM_END)

    @elements.pop if html?
  end

  ##
  # Called on regular text values. Empty values are ignored. The line
  # counter is advanced by the number of newlines in the text *after* the
  # token has been added, so the token carries the line it starts on.
  #
  # @param [String] value
  #
  def on_text(value)
    unless value.empty?
      add_token(:T_TEXT, value)

      lines = value.count("\n")

      advance_line(lines) if lines > 0
    end
  end

  ##
  # Called on attribute namespaces.
  #
  # @param [String] value
  #
  def on_attribute_ns(value)
    add_token(:T_ATTR_NS, value)
  end

  ##
  # Called on tag attributes.
  #
  # @param [String] value
  #
  def on_attribute(value)
    add_token(:T_ATTR, value)
  end
end # Lexer
398
+ end # XML
399
+ end # Oga