oga 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -292,6 +292,14 @@
292
292
  allowed_text = any* -- terminate_text;
293
293
 
294
294
  text := |*
295
+ # Input such as just "</" or "<?". This rule takes precedence over the
296
+ # rules below, but only if those don't match.
297
+ terminate_text => {
298
+ callback("on_text", data, encoding, ts, te);
299
+
300
+ fnext main;
301
+ };
302
+
295
303
  # Text followed by a special tag, such as "foo<!--"
296
304
  allowed_text @{ mark = p; } terminate_text => {
297
305
  callback("on_text", data, encoding, ts, mark);
data/lib/oga.rb CHANGED
@@ -1,3 +1,5 @@
1
+ gem 'racc'
2
+
1
3
  require 'ast'
2
4
  require 'set'
3
5
  require 'stringio'
@@ -9,7 +11,6 @@ require_relative 'oga/oga'
9
11
  # Oga::XML namespace.
10
12
  require_relative 'oga/xml/lexer'
11
13
  require_relative 'oga/xml/parser'
12
- require_relative 'oga/xml/pull_parser'
13
14
 
14
15
  require_relative 'liboga'
15
16
 
@@ -19,6 +20,7 @@ if RUBY_PLATFORM == 'java'
19
20
  end
20
21
  #:nocov:
21
22
 
23
+ require_relative 'oga/xml/html_void_elements'
22
24
  require_relative 'oga/xml/querying'
23
25
  require_relative 'oga/xml/traversal'
24
26
  require_relative 'oga/xml/node'
@@ -35,7 +37,11 @@ require_relative 'oga/xml/attribute'
35
37
  require_relative 'oga/xml/element'
36
38
  require_relative 'oga/xml/node_set'
37
39
 
40
+ require_relative 'oga/xml/sax_parser'
41
+ require_relative 'oga/xml/pull_parser'
42
+
38
43
  require_relative 'oga/html/parser'
44
+ require_relative 'oga/html/sax_parser'
39
45
 
40
46
  require_relative 'oga/xpath/node'
41
47
  require_relative 'oga/xpath/lexer'
@@ -0,0 +1,18 @@
1
+ module Oga
2
+ module HTML
3
+ ##
4
+ # SAX parser for HTML documents. See the documentation of
5
+ # {Oga::XML::SaxParser} for more information.
6
+ #
7
+ class SaxParser < XML::SaxParser
8
+ ##
9
+ # @see [Oga::XML::SaxParser#initialize]
10
+ #
11
+ def initialize(handler, data, options = {})
12
+ options = options.merge(:html => true)
13
+
14
+ super(handler, data, options)
15
+ end
16
+ end # SaxParser
17
+ end # HTML
18
+ end # Oga
data/lib/oga/oga.rb CHANGED
@@ -24,4 +24,34 @@ module Oga
24
24
  def self.parse_html(html)
25
25
  return HTML::Parser.new(html).parse
26
26
  end
27
+
28
+ ##
29
+ # Parses the given XML document using the SAX parser.
30
+ #
31
+ # @example
32
+ # handler = SomeSaxHandler.new
33
+ #
34
+ # Oga.sax_parse_xml(handler, '<root>Hello</root>')
35
+ #
36
+ # @param [Object] handler The SAX handler for the parser.
37
+ # @param [String|IO] xml The XML to parse.
38
+ #
39
+ def self.sax_parse_xml(handler, xml)
40
+ XML::SaxParser.new(handler, xml).parse
41
+ end
42
+
43
+ ##
44
+ # Parses the given HTML document using the SAX parser.
45
+ #
46
+ # @example
47
+ # handler = SomeSaxHandler.new
48
+ #
49
+ # Oga.sax_parse_html(handler, '<script>foo()</script>')
50
+ #
51
+ # @param [Object] handler The SAX handler for the parser.
52
+ # @param [String|IO] html The HTML to parse.
53
+ #
54
+ def self.sax_parse_html(handler, html)
55
+ HTML::SaxParser.new(handler, html).parse
56
+ end
27
57
  end # Oga
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.1.1'
2
+ VERSION = '0.1.2'
3
3
  end # Oga
data/lib/oga/xml/cdata.rb CHANGED
@@ -12,13 +12,6 @@ module Oga
12
12
  def to_xml
13
13
  return "<![CDATA[#{text}]]>"
14
14
  end
15
-
16
- ##
17
- # @return [Symbol]
18
- #
19
- def node_type
20
- return :cdata
21
- end
22
15
  end # Cdata
23
16
  end # XML
24
17
  end # Oga
@@ -12,13 +12,6 @@ module Oga
12
12
  def to_xml
13
13
  return "<!--#{text}-->"
14
14
  end
15
-
16
- ##
17
- # @return [Symbol]
18
- #
19
- def node_type
20
- return :comment
21
- end
22
15
  end # Comment
23
16
  end # XML
24
17
  end # Oga
@@ -79,13 +79,6 @@ module Oga
79
79
 
80
80
  return "Doctype(#{segments.join(' ')})"
81
81
  end
82
-
83
- ##
84
- # @return [Symbol]
85
- #
86
- def node_type
87
- return :doctype
88
- end
89
82
  end # Doctype
90
83
  end # XML
91
84
  end # Oga
@@ -12,11 +12,15 @@ module Oga
12
12
  # The XML declaration of the document.
13
13
  # @return [Oga::XML::XmlDeclaration]
14
14
  #
15
+ # @!attribute [rw] type
16
+ # The document type, either `:xml` or `:html`.
17
+ # @return [Symbol]
18
+ #
15
19
  class Document
16
20
  include Querying
17
21
  include Traversal
18
22
 
19
- attr_accessor :doctype, :xml_declaration
23
+ attr_accessor :doctype, :xml_declaration, :type
20
24
 
21
25
  ##
22
26
  # @param [Hash] options
@@ -24,10 +28,12 @@ module Oga
24
28
  # @option options [Oga::XML::NodeSet] :children
25
29
  # @option options [Oga::XML::Doctype] :doctype
26
30
  # @option options [Oga::XML::XmlDeclaration] :xml_declaration
31
+ # @option options [Symbol] :type
27
32
  #
28
33
  def initialize(options = {})
29
34
  @doctype = options[:doctype]
30
35
  @xml_declaration = options[:xml_declaration]
36
+ @type = options[:type] || :xml
31
37
 
32
38
  self.children = options[:children] if options[:children]
33
39
  end
@@ -144,7 +144,12 @@ module Oga
144
144
  # @return [Oga::XML::Namespace]
145
145
  #
146
146
  def namespace
147
- return @namespace ||= available_namespaces[namespace_name]
147
+ unless @namespace
148
+ available = available_namespaces
149
+ @namespace = available[namespace_name] || available[XMLNS_PREFIX]
150
+ end
151
+
152
+ return @namespace
148
153
  end
149
154
 
150
155
  ##
@@ -206,7 +211,12 @@ module Oga
206
211
  # @return [String]
207
212
  #
208
213
  def to_xml
209
- ns = namespace ? "#{namespace}:" : ''
214
+ if namespace_name
215
+ full_name = "#{namespace_name}:#{name}"
216
+ else
217
+ full_name = name
218
+ end
219
+
210
220
  body = children.map(&:to_xml).join('')
211
221
  attrs = ''
212
222
 
@@ -214,7 +224,11 @@ module Oga
214
224
  attrs << " #{attr.to_xml}"
215
225
  end
216
226
 
217
- return "<#{ns}#{name}#{attrs}>#{body}</#{ns}#{name}>"
227
+ if self_closing?
228
+ return "<#{full_name}#{attrs} />"
229
+ else
230
+ return "<#{full_name}#{attrs}>#{body}</#{full_name}>"
231
+ end
218
232
  end
219
233
 
220
234
  ##
@@ -236,13 +250,6 @@ module Oga
236
250
  return "Element(#{segments.join(' ')})"
237
251
  end
238
252
 
239
- ##
240
- # @return [Symbol]
241
- #
242
- def node_type
243
- return :element
244
- end
245
-
246
253
  ##
247
254
  # Registers a new namespace for the current element and its child
248
255
  # elements.
@@ -270,13 +277,33 @@ module Oga
270
277
  node = parent
271
278
 
272
279
  while node && node.respond_to?(:namespaces)
273
- merged = merged.merge(node.namespaces)
274
- node = node.parent
280
+ node.namespaces.each do |prefix, ns|
281
+ merged[prefix] = ns unless merged[prefix]
282
+ end
283
+
284
+ node = node.parent
275
285
  end
276
286
 
277
287
  return merged
278
288
  end
279
289
 
290
+ ##
291
+ # Returns `true` if the element is a self-closing element.
292
+ #
293
+ # @return [TrueClass|FalseClass]
294
+ #
295
+ def self_closing?
296
+ self_closing = children.empty?
297
+ root = root_node
298
+
299
+ if root.is_a?(Document) and root.type == :html \
300
+ and !HTML_VOID_ELEMENTS.include?(name)
301
+ self_closing = false
302
+ end
303
+
304
+ return self_closing
305
+ end
306
+
280
307
  private
281
308
 
282
309
  ##
@@ -284,14 +311,12 @@ module Oga
284
311
  # has been registered the corresponding attribute is removed.
285
312
  #
286
313
  def register_namespaces_from_attributes
287
- self.attributes = attributes.reject do |attr|
314
+ attributes.each do |attr|
288
315
  # We're using `namespace_name` as opposed to `namespace.name` as "xmlns"
289
316
  # is not a registered namespace.
290
- remove = attr.namespace_name && attr.namespace_name == XMLNS_PREFIX
291
-
292
- register_namespace(attr.name, attr.value) if remove
293
-
294
- remove
317
+ if attr.name == XMLNS_PREFIX or attr.namespace_name == XMLNS_PREFIX
318
+ register_namespace(attr.name, attr.value)
319
+ end
295
320
  end
296
321
  end
297
322
 
@@ -0,0 +1,28 @@
1
+ module Oga
2
+ module XML
3
+ ##
4
+ # Names of the HTML void elements that should be handled when HTML lexing
5
+ # is enabled.
6
+ #
7
+ # @return [Set]
8
+ #
9
+ HTML_VOID_ELEMENTS = Set.new([
10
+ 'area',
11
+ 'base',
12
+ 'br',
13
+ 'col',
14
+ 'command',
15
+ 'embed',
16
+ 'hr',
17
+ 'img',
18
+ 'input',
19
+ 'keygen',
20
+ 'link',
21
+ 'meta',
22
+ 'param',
23
+ 'source',
24
+ 'track',
25
+ 'wbr'
26
+ ])
27
+ end # XML
28
+ end # Oga
data/lib/oga/xml/lexer.rb CHANGED
@@ -40,31 +40,6 @@ module Oga
40
40
  class Lexer
41
41
  attr_reader :html
42
42
 
43
- ##
44
- # Names of the HTML void elements that should be handled when HTML lexing
45
- # is enabled.
46
- #
47
- # @return [Set]
48
- #
49
- HTML_VOID_ELEMENTS = Set.new([
50
- 'area',
51
- 'base',
52
- 'br',
53
- 'col',
54
- 'command',
55
- 'embed',
56
- 'hr',
57
- 'img',
58
- 'input',
59
- 'keygen',
60
- 'link',
61
- 'meta',
62
- 'param',
63
- 'source',
64
- 'track',
65
- 'wbr'
66
- ])
67
-
68
43
  ##
69
44
  # @param [String|IO] data The data to lex. This can either be a String or
70
45
  # an IO instance.
@@ -347,7 +322,7 @@ module Oga
347
322
  # Called on the closing `>` of the open tag of an element.
348
323
  #
349
324
  def on_element_open_end
350
- if html? and HTML_VOID_ELEMENTS.include?(current_element)
325
+ if html? and HTML_VOID_ELEMENTS.include?(current_element.downcase)
351
326
  add_token(:T_ELEM_END)
352
327
  @elements.pop
353
328
  end
data/lib/oga/xml/node.rb CHANGED
@@ -163,13 +163,6 @@ module Oga
163
163
 
164
164
  node_set.insert(index, other)
165
165
  end
166
-
167
- ##
168
- # @return [Symbol]
169
- #
170
- def node_type
171
- return :node
172
- end
173
166
  end # Node
174
167
  end # XML
175
168
  end # Oga
@@ -9,6 +9,35 @@ module Oga
9
9
  module XML
10
10
  class Parser < Racc::Parser
11
11
 
12
+ ##
13
+ # Hash mapping token types to dedicated error labels.
14
+ #
15
+ # @return [Hash]
16
+ #
17
+ TOKEN_ERROR_MAPPING = {
18
+ 'T_STRING' => 'string',
19
+ 'T_TEXT' => 'text',
20
+ 'T_DOCTYPE_START' => 'doctype start',
21
+ 'T_DOCTYPE_END' => 'doctype closing tag',
22
+ 'T_DOCTYPE_TYPE' => 'doctype type',
23
+ 'T_DOCTYPE_NAME' => 'doctype name',
24
+ 'T_DOCTYPE_INLINE' => 'inline doctype rules',
25
+ 'T_CDATA' => 'CDATA',
26
+ 'T_COMMENT' => 'comment',
27
+ 'T_ELEM_START' => 'element start',
28
+ 'T_ELEM_NAME' => 'element name',
29
+ 'T_ELEM_NS' => 'element namespace',
30
+ 'T_ELEM_END' => 'element closing tag',
31
+ 'T_ATTR' => 'attribute',
32
+ 'T_ATTR_NS' => 'attribute namespace',
33
+ 'T_XML_DECL_START' => 'XML declaration start',
34
+ 'T_XML_DECL_END' => 'XML declaration end',
35
+ 'T_PROC_INS_START' => 'processing-instruction start',
36
+ 'T_PROC_INS_NAME' => 'processing-instruction name',
37
+ 'T_PROC_INS_END' => 'processing-instruction closing tag',
38
+ '$end' => 'end of input'
39
+ }
40
+
12
41
  ##
13
42
  # @param [String|IO] data The input to parse.
14
43
  # @param [Hash] options
@@ -53,6 +82,7 @@ module Oga
53
82
  #
54
83
  def on_error(type, value, stack)
55
84
  name = token_to_str(type)
85
+ name = TOKEN_ERROR_MAPPING[name] || name
56
86
  index = @line - 1
57
87
  index_range = (index - 5)..(index + 5)
58
88
  code = ''
@@ -84,7 +114,7 @@ module Oga
84
114
  end
85
115
 
86
116
  raise Racc::ParseError, <<-EOF.strip
87
- Unexpected #{name} with value #{value.inspect} on line #{@line}:
117
+ Unexpected #{name} on line #{@line}:
88
118
 
89
119
  #{code}
90
120
  EOF
@@ -112,7 +142,9 @@ Unexpected #{name} with value #{value.inspect} on line #{@line}:
112
142
  # @return [Oga::XML::Document]
113
143
  #
114
144
  def on_document(children = [])
115
- document = Document.new
145
+ document = Document.new(
146
+ :type => @lexer.html ? :html : :xml
147
+ )
116
148
 
117
149
  children.each do |child|
118
150
  if child.is_a?(Doctype)
@@ -50,6 +50,21 @@ module Oga
50
50
  :on_proc_ins
51
51
  ]
52
52
 
53
+ ##
54
+ # Returns the shorthands that can be used for various node classes.
55
+ #
56
+ # @return [Hash]
57
+ #
58
+ NODE_SHORTHANDS = {
59
+ :text => XML::Text,
60
+ :node => XML::Node,
61
+ :cdata => XML::Cdata,
62
+ :element => XML::Element,
63
+ :doctype => XML::Doctype,
64
+ :comment => XML::Comment,
65
+ :xml_declaration => XML::XmlDeclaration
66
+ }
67
+
53
68
  ##
54
69
  # @see Oga::XML::Parser#reset
55
70
  #
@@ -89,8 +104,7 @@ module Oga
89
104
  # Instead of this:
90
105
  #
91
106
  # parser.parse do |node|
92
- # if node.node_type == :text \
93
- # and parser.nesting == %w{people person name}
107
+ # if node.is_a?(Oga::XML::Text) and parser.nesting == %w{people person name}
94
108
  # puts node.text
95
109
  # end
96
110
  # end
@@ -113,7 +127,7 @@ module Oga
113
127
  # @param [Array] nesting The element name nesting to act upon.
114
128
  #
115
129
  def on(type, nesting = [])
116
- if node.node_type == type
130
+ if node.is_a?(NODE_SHORTHANDS[type])
117
131
  if nesting.empty? or nesting == self.nesting
118
132
  yield
119
133
  end