oga 0.1.1-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +13 -0
  3. data/LICENSE +19 -0
  4. data/README.md +179 -0
  5. data/doc/DCO.md +25 -0
  6. data/doc/changelog.md +20 -0
  7. data/doc/css/common.css +76 -0
  8. data/doc/migrating_from_nokogiri.md +169 -0
  9. data/ext/c/extconf.rb +13 -0
  10. data/ext/c/lexer.c +1518 -0
  11. data/ext/c/lexer.h +8 -0
  12. data/ext/c/lexer.rl +121 -0
  13. data/ext/c/liboga.c +6 -0
  14. data/ext/c/liboga.h +11 -0
  15. data/ext/java/Liboga.java +14 -0
  16. data/ext/java/org/liboga/xml/Lexer.java +829 -0
  17. data/ext/java/org/liboga/xml/Lexer.rl +151 -0
  18. data/ext/ragel/base_lexer.rl +323 -0
  19. data/lib/liboga.jar +0 -0
  20. data/lib/oga.rb +43 -0
  21. data/lib/oga/html/parser.rb +25 -0
  22. data/lib/oga/oga.rb +27 -0
  23. data/lib/oga/version.rb +3 -0
  24. data/lib/oga/xml/attribute.rb +111 -0
  25. data/lib/oga/xml/cdata.rb +17 -0
  26. data/lib/oga/xml/character_node.rb +39 -0
  27. data/lib/oga/xml/comment.rb +17 -0
  28. data/lib/oga/xml/doctype.rb +84 -0
  29. data/lib/oga/xml/document.rb +99 -0
  30. data/lib/oga/xml/element.rb +331 -0
  31. data/lib/oga/xml/lexer.rb +399 -0
  32. data/lib/oga/xml/namespace.rb +42 -0
  33. data/lib/oga/xml/node.rb +168 -0
  34. data/lib/oga/xml/node_set.rb +313 -0
  35. data/lib/oga/xml/parser.rb +556 -0
  36. data/lib/oga/xml/processing_instruction.rb +39 -0
  37. data/lib/oga/xml/pull_parser.rb +180 -0
  38. data/lib/oga/xml/querying.rb +32 -0
  39. data/lib/oga/xml/text.rb +11 -0
  40. data/lib/oga/xml/traversal.rb +48 -0
  41. data/lib/oga/xml/xml_declaration.rb +69 -0
  42. data/lib/oga/xpath/evaluator.rb +1748 -0
  43. data/lib/oga/xpath/lexer.rb +2043 -0
  44. data/lib/oga/xpath/node.rb +10 -0
  45. data/lib/oga/xpath/parser.rb +537 -0
  46. data/oga.gemspec +45 -0
  47. metadata +221 -0
@@ -0,0 +1,331 @@
1
module Oga
  module XML
    ##
    # Class that contains information about an XML element such as the name,
    # attributes and child nodes.
    #
    # @!attribute [rw] name
    #  The name of the element.
    #  @return [String]
    #
    # @!attribute [rw] namespace_name
    #  The name of the namespace.
    #  @return [String]
    #
    # @!attribute [rw] attributes
    #  The attributes of the element.
    #  @return [Array<Oga::XML::Attribute>]
    #
    # @!attribute [rw] namespaces
    #  The registered namespaces.
    #  @return [Hash]
    #
    class Element < Node
      include Querying

      attr_accessor :name, :namespace_name, :attributes, :namespaces

      ##
      # The attribute prefix/namespace used for registering element namespaces.
      #
      # @return [String]
      #
      XMLNS_PREFIX = 'xmlns'.freeze

      ##
      # Builds the element, links every supplied attribute back to it and
      # turns any "xmlns:*" attributes into registered namespaces.
      #
      # @param [Hash] options Forwarded untouched to the parent constructor
      #  (bare `super`) before the element specific keys below are read.
      #
      # @option options [String] :name The name of the element.
      #
      # @option options [String] :namespace_name The name of the namespace.
      #
      # @option options [Array<Oga::XML::Attribute>] :attributes The attributes
      #  of the element as an Array. Defaults to an empty Array.
      #
      # @option options [Hash] :namespaces Namespaces that are already
      #  registered for this element. Defaults to an empty Hash.
      #
      # @raise [ArgumentError] When an "xmlns:*" attribute names a namespace
      #  that is already registered (see {#register_namespace}).
      #
      def initialize(options = {})
        super

        @name           = options[:name]
        @namespace_name = options[:namespace_name]
        @attributes     = options[:attributes] || []
        @namespaces     = options[:namespaces] || {}

        link_attributes
        register_namespaces_from_attributes
      end

      ##
      # Returns an attribute matching the given name (with or without the
      # namespace).
      #
      # @example
      #  # find an attribute that only has the name "foo"
      #  attribute('foo')
      #
      #  # find an attribute with namespace "foo" and name "bar"
      #  attribute('foo:bar')
      #
      # @param [String|Symbol] name The name (with or without the namespace)
      #  of the attribute.
      #
      # @return [Oga::XML::Attribute] The first matching attribute, or `nil`
      #  when there is none.
      #
      def attribute(name)
        name, ns = split_name(name)

        attributes.find { |attr| attribute_matches?(attr, ns, name) }
      end

      alias_method :attr, :attribute

      ##
      # Returns the value of the given attribute, or `nil` when the attribute
      # does not exist.
      #
      # @example
      #  element.get('class') # => "container"
      #
      # @see [#attribute]
      #
      def get(name)
        found = attribute(name)

        found ? found.value : nil
      end

      ##
      # Adds a new attribute to the element and makes the attribute point
      # back to this element.
      #
      # @param [Oga::XML::Attribute] attribute
      #
      def add_attribute(attribute)
        attribute.element = self

        attributes << attribute
      end

      ##
      # Sets the value of an attribute to the given value. If the attribute does
      # not exist it is created automatically.
      #
      # The name is split with the same helper that {#attribute} uses for the
      # lookup, so an attribute created here is always found again under the
      # name it was created with.
      #
      # @param [String|Symbol] name The name of the attribute, optionally
      #  including the namespace.
      #
      # @param [String] value The new value of the attribute.
      #
      def set(name, value)
        found = attribute(name)

        if found
          found.value = value
        else
          local_name, ns = split_name(name)

          add_attribute(
            Attribute.new(
              :name           => local_name,
              :namespace_name => ns,
              :value          => value
            )
          )
        end
      end

      ##
      # Returns the namespace of the element. The lookup is performed once
      # and the result is cached in `@namespace`.
      #
      # @return [Oga::XML::Namespace]
      #
      def namespace
        @namespace ||= available_namespaces[namespace_name]
      end

      ##
      # Returns the text of all child nodes joined together.
      #
      # @return [String]
      #
      def text
        children.text
      end

      ##
      # Returns the text of the current element only, that is the text of its
      # direct {Oga::XML::Text} children.
      #
      # @return [String]
      #
      def inner_text
        buffer = ''

        text_nodes.each { |node| buffer << node.text }

        buffer
      end

      ##
      # Returns any {Oga::XML::Text} nodes that are a direct child of this
      # element.
      #
      # @return [Oga::XML::NodeSet]
      #
      def text_nodes
        nodes = NodeSet.new

        children.each do |child|
          nodes << child if child.is_a?(Text)
        end

        nodes
      end

      ##
      # Sets the inner text of the current element to the given String. All
      # existing direct text children are removed and a single new text node
      # is appended.
      #
      # @param [String] text
      #
      def inner_text=(text)
        # Collect first, remove afterwards: removing a node while `children`
        # is being iterated would change the collection mid-iteration.
        stale = []

        children.each do |child|
          stale << child if child.is_a?(Text)
        end

        stale.each { |node| node.remove }

        children << XML::Text.new(:text => text)
      end

      ##
      # Converts the element and its child elements to XML.
      #
      # The tag prefix is the String form of the resolved {#namespace}
      # object; no prefix is written when the namespace can not be resolved.
      #
      # @return [String]
      #
      def to_xml
        prefix = namespace ? "#{namespace}:" : ''
        body   = children.map(&:to_xml).join('')
        attrs  = ''

        attributes.each do |attr|
          attrs << " #{attr.to_xml}"
        end

        "<#{prefix}#{name}#{attrs}>#{body}</#{prefix}#{name}>"
      end

      ##
      # Returns a compact description that only lists the fields that are
      # set and not empty.
      #
      # @return [String]
      #
      def inspect
        segments = []

        [:name, :namespace, :attributes, :children].each do |attr|
          value = send(attr)

          next if !value || (value.respond_to?(:empty?) && value.empty?)

          segments << "#{attr}: #{value.inspect}"
        end

        "Element(#{segments.join(' ')})"
      end

      ##
      # Registers a new namespace for the current element and its child
      # elements.
      #
      # @param [String] name
      # @param [String] uri
      # @raise [ArgumentError] When a namespace with the same name is already
      #  registered on this element.
      # @see [Oga::XML::Namespace#initialize]
      #
      def register_namespace(name, uri)
        if namespaces[name]
          raise ArgumentError, "The namespace #{name.inspect} already exists"
        end

        namespaces[name] = Namespace.new(:name => name, :uri => uri)
      end

      ##
      # Returns a Hash containing all the namespaces available to the current
      # element: its own namespaces plus those of every ancestor that
      # responds to `namespaces`.
      #
      # When the same name is registered on several levels the registration
      # closest to this element wins, matching XML namespace scoping where an
      # inner declaration overrides an outer one.
      #
      # @return [Hash]
      #
      def available_namespaces
        merged = namespaces
        node   = parent

        while node && node.respond_to?(:namespaces)
          # The ancestor only contributes names that are not known yet.
          merged = node.namespaces.merge(merged)
          node   = node.parent
        end

        merged
      end

      private

      ##
      # Registers namespaces based on any "xmlns" attributes. Once a namespace
      # has been registered the corresponding attribute is removed.
      #
      def register_namespaces_from_attributes
        self.attributes = attributes.reject do |attr|
          # We're using `namespace_name` opposed to `namespace.name` as "xmlns"
          # is not a registered namespace.
          declaration = attr.namespace_name == XMLNS_PREFIX

          register_namespace(attr.name, attr.value) if declaration

          declaration
        end
      end

      ##
      # Links all attributes to the current element.
      #
      def link_attributes
        attributes.each do |attr|
          attr.element = self
        end
      end

      ##
      # Splits a (possibly qualified) name on ":".
      #
      # @param [String|Symbol] name
      # @return [Array] The local name followed by the namespace name. The
      #  namespace name is `nil` when the input contains no prefix.
      #
      def split_name(name)
        segments = name.to_s.split(':')
        local    = segments.pop
        prefix   = segments.pop

        return local, prefix
      end

      ##
      # Checks if an attribute has the given local name and namespace. When
      # no namespace is given only attributes without a namespace match.
      #
      # @param [Oga::XML::Attribute] attr
      # @param [String] ns
      # @param [String] name
      # @return [TrueClass|FalseClass]
      #
      def attribute_matches?(attr, ns, name)
        return false unless attr.name == name

        if ns
          attr.namespace.to_s == ns
        else
          !attr.namespace
        end
      end
    end # Element
  end # XML
end # Oga
@@ -0,0 +1,399 @@
1
module Oga
  module XML
    ##
    # Low level lexer that supports both XML and HTML (using an extra option).
    # To lex HTML input set the `:html` option to `true` when creating an
    # instance of the lexer:
    #
    #     lexer = Oga::XML::Lexer.new('<br>', :html => true)
    #
    # This lexer can process both String and IO instances. IO instances are
    # processed on a line by line basis. This can greatly reduce memory usage
    # in exchange for a slightly slower runtime.
    #
    # ## Thread Safety
    #
    # Since this class keeps track of an internal state you can not use the
    # same instance between multiple threads at the same time. For example, the
    # following will not work reliably:
    #
    #     # Don't do this!
    #     lexer   = Oga::XML::Lexer.new('....')
    #     threads = []
    #
    #     2.times do
    #       threads << Thread.new do
    #         lexer.advance do |*args|
    #           p args
    #         end
    #       end
    #     end
    #
    #     threads.each(&:join)
    #
    # However, it is perfectly safe to use different instances per thread.
    # There is no _global_ state used by this lexer.
    #
    # @!attribute [r] html
    #  @return [TrueClass|FalseClass]
    #
    class Lexer
      attr_reader :html

      ##
      # Names of the HTML void elements that should be handled when HTML lexing
      # is enabled. The names are stored in lowercase.
      #
      # @return [Set]
      #
      HTML_VOID_ELEMENTS = Set.new(%w{
        area
        base
        br
        col
        command
        embed
        hr
        img
        input
        keygen
        link
        meta
        param
        source
        track
        wbr
      }).freeze

      ##
      # @param [String|IO] data The data to lex. This can either be a String or
      #  an IO instance.
      #
      # @param [Hash] options
      #
      # @option options [TrueClass|FalseClass] :html When set to `true` the
      #  lexer will treat the input as HTML instead of SGML/XML. This makes it
      #  possible to lex HTML void elements such as `<link href="">`.
      #
      def initialize(data, options = {})
        @data = data
        @html = options[:html]

        reset
      end

      ##
      # Resets the internal state of the lexer: the line counter, the stack of
      # open elements, the position of IO input and the native lexer state.
      # Typically you don't need to call this method yourself as it's called
      # by #lex after lexing the input.
      #
      def reset
        @line     = 1
        @elements = []

        @data.rewind if io_input?

        reset_native
      end

      ##
      # Yields the data to lex to the supplied block: IO input is yielded
      # line by line, any other input is yielded once as a whole.
      #
      # @yieldparam [String]
      #
      def read_data
        # We can't check for #each_line since String also defines that. Using
        # String#each_line has no benefit over just lexing the String in one
        # go.
        if io_input?
          @data.each_line { |line| yield line }
        else
          yield @data
        end
      end

      ##
      # Returns `true` if the input is an IO like object, false otherwise.
      #
      # @return [TrueClass|FalseClass]
      #
      def io_input?
        @data.is_a?(IO) || @data.is_a?(StringIO)
      end

      ##
      # Gathers all the tokens for the input and returns them as an Array of
      # `[type, value, line]` Arrays.
      #
      # This method resets the internal state of the lexer after consuming the
      # input. The reset also happens when lexing raises, so that a failed run
      # does not leak line numbers or open elements into the next one.
      #
      # @see #advance
      # @return [Array]
      #
      def lex
        tokens = []

        begin
          advance do |type, value, line|
            tokens << [type, value, line]
          end
        ensure
          reset
        end

        tokens
      end

      ##
      # Advances through the input and generates the corresponding tokens. Each
      # token is yielded to the supplied block as three separate arguments:
      #
      #     TYPE, VALUE, LINE
      #
      # The type is a symbol, the value is either nil or a String and the line
      # is the line number the token was found on.
      #
      # This method stores the supplied block in `@block` and resets it after
      # the lexer loop has finished.
      #
      # This method does *not* reset the internal state of the lexer.
      #
      # @yieldparam [Symbol] type
      # @yieldparam [String] value
      # @yieldparam [Fixnum] line
      #
      def advance(&block)
        @block = block

        read_data do |chunk|
          advance_native(chunk)
        end
      ensure
        @block = nil
      end

      ##
      # @return [TrueClass|FalseClass]
      #
      def html?
        !!html
      end

      private

      ##
      # @param [Fixnum] amount The amount of lines to advance.
      #
      def advance_line(amount = 1)
        @line += amount
      end

      ##
      # Calls the block stored by #advance with the information of the current
      # token.
      #
      # @param [Symbol] type The token type.
      # @param [String] value The token value.
      #
      # @yieldparam [Symbol] type
      # @yieldparam [String] value
      # @yieldparam [Fixnum] line
      #
      def add_token(type, value = nil)
        @block.call(type, value, @line)
      end

      ##
      # Returns the name of the element we're currently in. Element names are
      # only tracked when HTML lexing is enabled.
      #
      # @return [String]
      #
      def current_element
        @elements.last
      end

      ##
      # Called when processing single/double quoted strings.
      #
      # @param [String] value The data between the quotes.
      #
      def on_string(value)
        add_token(:T_STRING, value)
      end

      ##
      # Called when a doctype starts.
      #
      def on_doctype_start
        add_token(:T_DOCTYPE_START)
      end

      ##
      # Called on the identifier specifying the type of the doctype.
      #
      # @param [String] value
      #
      def on_doctype_type(value)
        add_token(:T_DOCTYPE_TYPE, value)
      end

      ##
      # Called on the identifier specifying the name of the doctype.
      #
      # @param [String] value
      #
      def on_doctype_name(value)
        add_token(:T_DOCTYPE_NAME, value)
      end

      ##
      # Called on the end of a doctype.
      #
      def on_doctype_end
        add_token(:T_DOCTYPE_END)
      end

      ##
      # Called on an inline doctype block.
      #
      # @param [String] value
      #
      def on_doctype_inline(value)
        add_token(:T_DOCTYPE_INLINE, value)
      end

      ##
      # Called on a CDATA tag.
      #
      # @param [String] value
      #
      def on_cdata(value)
        add_token(:T_CDATA, value)
      end

      ##
      # Called on a comment.
      #
      # @param [String] value
      #
      def on_comment(value)
        add_token(:T_COMMENT, value)
      end

      ##
      # Called on the start of an XML declaration tag.
      #
      def on_xml_decl_start
        add_token(:T_XML_DECL_START)
      end

      ##
      # Called on the end of an XML declaration tag.
      #
      def on_xml_decl_end
        add_token(:T_XML_DECL_END)
      end

      ##
      # Called on the start of an element.
      #
      def on_element_start
        add_token(:T_ELEM_START)
      end

      ##
      # Called on the start of a processing instruction.
      #
      def on_proc_ins_start
        add_token(:T_PROC_INS_START)
      end

      ##
      # Called on a processing instruction name.
      #
      # @param [String] value
      #
      def on_proc_ins_name(value)
        add_token(:T_PROC_INS_NAME, value)
      end

      ##
      # Called on the end of a processing instruction.
      #
      def on_proc_ins_end
        add_token(:T_PROC_INS_END)
      end

      ##
      # Called on the name of an element. In HTML mode the name is also pushed
      # on the stack of open elements.
      #
      # @param [String] name The name of the element, including namespace.
      #
      def on_element_name(name)
        @elements << name if html?

        add_token(:T_ELEM_NAME, name)
      end

      ##
      # Called on the element namespace.
      #
      # @param [String] namespace
      #
      def on_element_ns(namespace)
        add_token(:T_ELEM_NS, namespace)
      end

      ##
      # Called on the closing `>` of the open tag of an element. In HTML mode
      # a void element is closed right away as it never has a closing tag.
      #
      # HTML tag names are case-insensitive, so the current name is lowercased
      # before it is looked up (`<BR>` is handled the same way as `<br>`).
      #
      def on_element_open_end
        return unless html?

        name = current_element

        if name && HTML_VOID_ELEMENTS.include?(name.downcase)
          add_token(:T_ELEM_END)
          @elements.pop
        end
      end

      ##
      # Called on the closing tag of an element.
      #
      def on_element_end
        add_token(:T_ELEM_END)

        @elements.pop if html?
      end

      ##
      # Called on regular text values. Empty values are ignored. The token is
      # emitted with the line the text starts on, after that the line counter
      # is advanced by the number of newlines in the text.
      #
      # @param [String] value
      #
      def on_text(value)
        unless value.empty?
          add_token(:T_TEXT, value)

          newlines = value.count("\n")

          advance_line(newlines) if newlines > 0
        end
      end

      ##
      # Called on attribute namespaces.
      #
      # @param [String] value
      #
      def on_attribute_ns(value)
        add_token(:T_ATTR_NS, value)
      end

      ##
      # Called on tag attributes.
      #
      # @param [String] value
      #
      def on_attribute(value)
        add_token(:T_ATTR, value)
      end
    end # Lexer
  end # XML
end # Oga