oga 0.1.1-java → 0.1.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -292,6 +292,14 @@
292
292
  allowed_text = any* -- terminate_text;
293
293
 
294
294
  text := |*
295
+ # Input such as just "</" or "<?". This rule takes precedence over the
296
+ # rules below, but only if those don't match.
297
+ terminate_text => {
298
+ callback("on_text", data, encoding, ts, te);
299
+
300
+ fnext main;
301
+ };
302
+
295
303
  # Text followed by a special tag, such as "foo<!--"
296
304
  allowed_text @{ mark = p; } terminate_text => {
297
305
  callback("on_text", data, encoding, ts, mark);
data/lib/liboga.jar CHANGED
Binary file
data/lib/oga.rb CHANGED
@@ -1,3 +1,5 @@
1
+ gem 'racc'
2
+
1
3
  require 'ast'
2
4
  require 'set'
3
5
  require 'stringio'
@@ -18,6 +20,7 @@ if RUBY_PLATFORM == 'java'
18
20
  end
19
21
  #:nocov:
20
22
 
23
+ require_relative 'oga/xml/html_void_elements'
21
24
  require_relative 'oga/xml/querying'
22
25
  require_relative 'oga/xml/traversal'
23
26
  require_relative 'oga/xml/node'
@@ -34,8 +37,11 @@ require_relative 'oga/xml/attribute'
34
37
  require_relative 'oga/xml/element'
35
38
  require_relative 'oga/xml/node_set'
36
39
 
40
+ require_relative 'oga/xml/sax_parser'
37
41
  require_relative 'oga/xml/pull_parser'
42
+
38
43
  require_relative 'oga/html/parser'
44
+ require_relative 'oga/html/sax_parser'
39
45
 
40
46
  require_relative 'oga/xpath/node'
41
47
  require_relative 'oga/xpath/lexer'
@@ -0,0 +1,18 @@
1
+ module Oga
2
+ module HTML
3
+ ##
4
+ # SAX parser for HTML documents. See the documentation of
5
+ # {Oga::XML::SaxParser} for more information.
6
+ #
7
+ class SaxParser < XML::SaxParser
8
+ ##
9
+ # @see [Oga::XML::SaxParser#initialize]
10
+ #
11
+ def initialize(handler, data, options = {})
12
+ options = options.merge(:html => true)
13
+
14
+ super(handler, data, options)
15
+ end
16
+ end # SaxParser
17
+ end # HTML
18
+ end # Oga
data/lib/oga/oga.rb CHANGED
@@ -24,4 +24,34 @@ module Oga
24
24
  def self.parse_html(html)
25
25
  return HTML::Parser.new(html).parse
26
26
  end
27
+
28
+ ##
29
+ # Parses the given XML document using the SAX parser.
30
+ #
31
+ # @example
32
+ # handler = SomeSaxHandler.new
33
+ #
34
+ # Oga.sax_parse_xml(handler, '<root>Hello</root>')
35
+ #
36
+ # @param [Object] handler The SAX handler for the parser.
37
+ # @param [String|IO] xml The XML to parse.
38
+ #
39
+ def self.sax_parse_xml(handler, xml)
40
+ XML::SaxParser.new(handler, xml).parse
41
+ end
42
+
43
+ ##
44
+ # Parses the given HTML document using the SAX parser.
45
+ #
46
+ # @example
47
+ # handler = SomeSaxHandler.new
48
+ #
49
+ # Oga.sax_parse_html(handler, '<script>foo()</script>')
50
+ #
51
+ # @param [Object] handler The SAX handler for the parser.
52
+ # @param [String|IO] html The HTML to parse.
53
+ #
54
+ def self.sax_parse_html(handler, html)
55
+ HTML::SaxParser.new(handler, html).parse
56
+ end
27
57
  end # Oga
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.1.1'
2
+ VERSION = '0.1.2'
3
3
  end # Oga
@@ -12,11 +12,15 @@ module Oga
12
12
  # The XML declaration of the document.
13
13
  # @return [Oga::XML::XmlDeclaration]
14
14
  #
15
+ # @!attribute [rw] type
16
+ # The document type, either `:xml` or `:html`.
17
+ # @return [Symbol]
18
+ #
15
19
  class Document
16
20
  include Querying
17
21
  include Traversal
18
22
 
19
- attr_accessor :doctype, :xml_declaration
23
+ attr_accessor :doctype, :xml_declaration, :type
20
24
 
21
25
  ##
22
26
  # @param [Hash] options
@@ -24,10 +28,12 @@ module Oga
24
28
  # @option options [Oga::XML::NodeSet] :children
25
29
  # @option options [Oga::XML::Doctype] :doctype
26
30
  # @option options [Oga::XML::XmlDeclaration] :xml_declaration
31
+ # @option options [Symbol] :type
27
32
  #
28
33
  def initialize(options = {})
29
34
  @doctype = options[:doctype]
30
35
  @xml_declaration = options[:xml_declaration]
36
+ @type = options[:type] || :xml
31
37
 
32
38
  self.children = options[:children] if options[:children]
33
39
  end
@@ -144,7 +144,12 @@ module Oga
144
144
  # @return [Oga::XML::Namespace]
145
145
  #
146
146
  def namespace
147
- return @namespace ||= available_namespaces[namespace_name]
147
+ unless @namespace
148
+ available = available_namespaces
149
+ @namespace = available[namespace_name] || available[XMLNS_PREFIX]
150
+ end
151
+
152
+ return @namespace
148
153
  end
149
154
 
150
155
  ##
@@ -206,7 +211,12 @@ module Oga
206
211
  # @return [String]
207
212
  #
208
213
  def to_xml
209
- ns = namespace ? "#{namespace}:" : ''
214
+ if namespace_name
215
+ full_name = "#{namespace_name}:#{name}"
216
+ else
217
+ full_name = name
218
+ end
219
+
210
220
  body = children.map(&:to_xml).join('')
211
221
  attrs = ''
212
222
 
@@ -214,7 +224,11 @@ module Oga
214
224
  attrs << " #{attr.to_xml}"
215
225
  end
216
226
 
217
- return "<#{ns}#{name}#{attrs}>#{body}</#{ns}#{name}>"
227
+ if self_closing?
228
+ return "<#{full_name}#{attrs} />"
229
+ else
230
+ return "<#{full_name}#{attrs}>#{body}</#{full_name}>"
231
+ end
218
232
  end
219
233
 
220
234
  ##
@@ -263,13 +277,33 @@ module Oga
263
277
  node = parent
264
278
 
265
279
  while node && node.respond_to?(:namespaces)
266
- merged = merged.merge(node.namespaces)
267
- node = node.parent
280
+ node.namespaces.each do |prefix, ns|
281
+ merged[prefix] = ns unless merged[prefix]
282
+ end
283
+
284
+ node = node.parent
268
285
  end
269
286
 
270
287
  return merged
271
288
  end
272
289
 
290
+ ##
291
+ # Returns `true` if the element is a self-closing element.
292
+ #
293
+ # @return [TrueClass|FalseClass]
294
+ #
295
+ def self_closing?
296
+ self_closing = children.empty?
297
+ root = root_node
298
+
299
+ if root.is_a?(Document) and root.type == :html \
300
+ and !HTML_VOID_ELEMENTS.include?(name)
301
+ self_closing = false
302
+ end
303
+
304
+ return self_closing
305
+ end
306
+
273
307
  private
274
308
 
275
309
  ##
@@ -277,14 +311,12 @@ module Oga
277
311
  # has been registered the corresponding attribute is removed.
278
312
  #
279
313
  def register_namespaces_from_attributes
280
- self.attributes = attributes.reject do |attr|
314
+ attributes.each do |attr|
281
315
  # We're using `namespace_name` as opposed to `namespace.name` as "xmlns"
282
316
  # is not a registered namespace.
283
- remove = attr.namespace_name && attr.namespace_name == XMLNS_PREFIX
284
-
285
- register_namespace(attr.name, attr.value) if remove
286
-
287
- remove
317
+ if attr.name == XMLNS_PREFIX or attr.namespace_name == XMLNS_PREFIX
318
+ register_namespace(attr.name, attr.value)
319
+ end
288
320
  end
289
321
  end
290
322
 
@@ -0,0 +1,28 @@
1
+ module Oga
2
+ module XML
3
+ ##
4
+ # Names of the HTML void elements that should be handled when HTML lexing
5
+ # is enabled.
6
+ #
7
+ # @return [Set]
8
+ #
9
+ HTML_VOID_ELEMENTS = Set.new([
10
+ 'area',
11
+ 'base',
12
+ 'br',
13
+ 'col',
14
+ 'command',
15
+ 'embed',
16
+ 'hr',
17
+ 'img',
18
+ 'input',
19
+ 'keygen',
20
+ 'link',
21
+ 'meta',
22
+ 'param',
23
+ 'source',
24
+ 'track',
25
+ 'wbr'
26
+ ])
27
+ end # XML
28
+ end # Oga
data/lib/oga/xml/lexer.rb CHANGED
@@ -40,31 +40,6 @@ module Oga
40
40
  class Lexer
41
41
  attr_reader :html
42
42
 
43
- ##
44
- # Names of the HTML void elements that should be handled when HTML lexing
45
- # is enabled.
46
- #
47
- # @return [Set]
48
- #
49
- HTML_VOID_ELEMENTS = Set.new([
50
- 'area',
51
- 'base',
52
- 'br',
53
- 'col',
54
- 'command',
55
- 'embed',
56
- 'hr',
57
- 'img',
58
- 'input',
59
- 'keygen',
60
- 'link',
61
- 'meta',
62
- 'param',
63
- 'source',
64
- 'track',
65
- 'wbr'
66
- ])
67
-
68
43
  ##
69
44
  # @param [String|IO] data The data to lex. This can either be a String or
70
45
  # an IO instance.
@@ -347,7 +322,7 @@ module Oga
347
322
  # Called on the closing `>` of the open tag of an element.
348
323
  #
349
324
  def on_element_open_end
350
- if html? and HTML_VOID_ELEMENTS.include?(current_element)
325
+ if html? and HTML_VOID_ELEMENTS.include?(current_element.downcase)
351
326
  add_token(:T_ELEM_END)
352
327
  @elements.pop
353
328
  end
@@ -9,6 +9,35 @@ module Oga
9
9
  module XML
10
10
  class Parser < Racc::Parser
11
11
 
12
+ ##
13
+ # Hash mapping token types to dedicated error labels.
14
+ #
15
+ # @return [Hash]
16
+ #
17
+ TOKEN_ERROR_MAPPING = {
18
+ 'T_STRING' => 'string',
19
+ 'T_TEXT' => 'text',
20
+ 'T_DOCTYPE_START' => 'doctype start',
21
+ 'T_DOCTYPE_END' => 'doctype closing tag',
22
+ 'T_DOCTYPE_TYPE' => 'doctype type',
23
+ 'T_DOCTYPE_NAME' => 'doctype name',
24
+ 'T_DOCTYPE_INLINE' => 'inline doctype rules',
25
+ 'T_CDATA' => 'CDATA',
26
+ 'T_COMMENT' => 'comment',
27
+ 'T_ELEM_START' => 'element start',
28
+ 'T_ELEM_NAME' => 'element name',
29
+ 'T_ELEM_NS' => 'element namespace',
30
+ 'T_ELEM_END' => 'element closing tag',
31
+ 'T_ATTR' => 'attribute',
32
+ 'T_ATTR_NS' => 'attribute namespace',
33
+ 'T_XML_DECL_START' => 'XML declaration start',
34
+ 'T_XML_DECL_END' => 'XML declaration end',
35
+ 'T_PROC_INS_START' => 'processing-instruction start',
36
+ 'T_PROC_INS_NAME' => 'processing-instruction name',
37
+ 'T_PROC_INS_END' => 'processing-instruction closing tag',
38
+ '$end' => 'end of input'
39
+ }
40
+
12
41
  ##
13
42
  # @param [String|IO] data The input to parse.
14
43
  # @param [Hash] options
@@ -53,6 +82,7 @@ module Oga
53
82
  #
54
83
  def on_error(type, value, stack)
55
84
  name = token_to_str(type)
85
+ name = TOKEN_ERROR_MAPPING[name] || name
56
86
  index = @line - 1
57
87
  index_range = (index - 5)..(index + 5)
58
88
  code = ''
@@ -84,7 +114,7 @@ module Oga
84
114
  end
85
115
 
86
116
  raise Racc::ParseError, <<-EOF.strip
87
- Unexpected #{name} with value #{value.inspect} on line #{@line}:
117
+ Unexpected #{name} on line #{@line}:
88
118
 
89
119
  #{code}
90
120
  EOF
@@ -112,7 +142,9 @@ Unexpected #{name} with value #{value.inspect} on line #{@line}:
112
142
  # @return [Oga::XML::Document]
113
143
  #
114
144
  def on_document(children = [])
115
- document = Document.new
145
+ document = Document.new(
146
+ :type => @lexer.html ? :html : :xml
147
+ )
116
148
 
117
149
  children.each do |child|
118
150
  if child.is_a?(Doctype)
@@ -0,0 +1,63 @@
1
+ module Oga
2
+ module XML
3
+ ##
4
+ # The SaxParser class provides the basic interface for writing custom SAX
5
+ # parsers. All callback methods defined in {Oga::XML::Parser} are delegated
6
+ # to a dedicated handler class.
7
+ #
8
+ # To write a custom handler for the SAX parser, create a class that
9
+ # implements one (or many) of the following callback methods:
10
+ #
11
+ # * `on_document`
12
+ # * `on_doctype`
13
+ # * `on_cdata`
14
+ # * `on_comment`
15
+ # * `on_proc_ins`
16
+ # * `on_xml_decl`
17
+ # * `on_text`
18
+ # * `on_element`
19
+ # * `on_element_children`
20
+ # * `after_element`
21
+ #
22
+ # For example:
23
+ #
24
+ # class SaxHandler
25
+ # def on_element(namespace, name, attrs = {})
26
+ # puts name
27
+ # end
28
+ # end
29
+ #
30
+ # You can then use it as follows:
31
+ #
32
+ # handler = SaxHandler.new
33
+ # parser = Oga::XML::SaxParser.new(handler, '<foo />')
34
+ #
35
+ # parser.parse
36
+ #
37
+ # For information on the callback arguments see the documentation of the
38
+ # corresponding methods in {Oga::XML::Parser}.
39
+ #
40
+ class SaxParser < Parser
41
+ ##
42
+ # @param [Object] handler The SAX handler to delegate callbacks to.
43
+ # @see [Oga::XML::Parser#initialize]
44
+ #
45
+ def initialize(handler, *args)
46
+ @handler = handler
47
+
48
+ super(*args)
49
+ end
50
+
51
+ # Delegate all callbacks to the handler object.
52
+ instance_methods.grep(/^(on_|after_)/).each do |method|
53
+ eval <<-EOF, nil, __FILE__, __LINE__ + 1
54
+ def #{method}(*args)
55
+ @handler.#{method}(*args) if @handler.respond_to?(:#{method})
56
+
57
+ return
58
+ end
59
+ EOF
60
+ end
61
+ end # SaxParser
62
+ end # XML
63
+ end # Oga
@@ -131,7 +131,8 @@ module Oga
131
131
  context = XML::NodeSet.new([@document])
132
132
  end
133
133
 
134
- return on_path(ast_node, context)
134
+ # If the expression is just "/" we'll just return the current context.
135
+ return ast_node.children.empty? ? context : on_path(ast_node, context)
135
136
  end
136
137
 
137
138
  ##
@@ -1188,7 +1189,7 @@ module Oga
1188
1189
  # This function call returns the substring of the 1st argument that occurs
1189
1190
  # after the string given in the 2nd argument. For example:
1190
1191
  #
1191
- # substring-before("2014-08-25", "-")
1192
+ # substring-after("2014-08-25", "-")
1192
1193
  #
1193
1194
  # This would return "08-25" as it occurs after the first "-".
1194
1195
  #