rubyjedi-oga 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +362 -0
- data/README.md +317 -0
- data/doc/css/common.css +77 -0
- data/doc/css_selectors.md +935 -0
- data/doc/manually_creating_documents.md +67 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/doc/xml_namespaces.md +63 -0
- data/ext/c/extconf.rb +11 -0
- data/ext/c/lexer.c +2595 -0
- data/ext/c/lexer.h +16 -0
- data/ext/c/lexer.rl +198 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +1363 -0
- data/ext/java/org/liboga/xml/Lexer.rl +223 -0
- data/ext/ragel/base_lexer.rl +633 -0
- data/lib/oga.rb +57 -0
- data/lib/oga/blacklist.rb +40 -0
- data/lib/oga/css/lexer.rb +743 -0
- data/lib/oga/css/parser.rb +976 -0
- data/lib/oga/entity_decoder.rb +21 -0
- data/lib/oga/html/entities.rb +2150 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/lru.rb +160 -0
- data/lib/oga/oga.rb +57 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/whitelist.rb +20 -0
- data/lib/oga/xml/attribute.rb +136 -0
- data/lib/oga/xml/cdata.rb +17 -0
- data/lib/oga/xml/character_node.rb +37 -0
- data/lib/oga/xml/comment.rb +17 -0
- data/lib/oga/xml/default_namespace.rb +13 -0
- data/lib/oga/xml/doctype.rb +82 -0
- data/lib/oga/xml/document.rb +108 -0
- data/lib/oga/xml/element.rb +428 -0
- data/lib/oga/xml/entities.rb +122 -0
- data/lib/oga/xml/html_void_elements.rb +15 -0
- data/lib/oga/xml/lexer.rb +550 -0
- data/lib/oga/xml/namespace.rb +48 -0
- data/lib/oga/xml/node.rb +219 -0
- data/lib/oga/xml/node_set.rb +333 -0
- data/lib/oga/xml/parser.rb +631 -0
- data/lib/oga/xml/processing_instruction.rb +37 -0
- data/lib/oga/xml/pull_parser.rb +175 -0
- data/lib/oga/xml/querying.rb +56 -0
- data/lib/oga/xml/sax_parser.rb +192 -0
- data/lib/oga/xml/text.rb +66 -0
- data/lib/oga/xml/traversal.rb +50 -0
- data/lib/oga/xml/xml_declaration.rb +65 -0
- data/lib/oga/xpath/evaluator.rb +1798 -0
- data/lib/oga/xpath/lexer.rb +1958 -0
- data/lib/oga/xpath/parser.rb +622 -0
- data/oga.gemspec +45 -0
- metadata +227 -0
@@ -0,0 +1,37 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Class used for storing information about a single processing instruction.
    #
    # The instruction's body is read through `text`, which is not defined in
    # this class (presumably inherited from CharacterNode).
    #
    class ProcessingInstruction < CharacterNode
      # The name of the instruction.
      #
      # @return [String]
      attr_accessor :name

      ##
      # @param [Hash] options
      #
      # @option options [String] :name The name of the instruction.
      # @see Oga::XML::CharacterNode#initialize
      #
      def initialize(options = {})
        # A bare `super` forwards `options` unchanged to the parent class.
        super

        @name = options[:name]
      end

      ##
      # Serializes the instruction as `<?`, the name, the text and `?>`. No
      # separator is inserted between the name and the text.
      #
      # @return [String]
      #
      def to_xml
        "<?#{name}#{text}?>"
      end

      ##
      # Returns a human readable summary of the name and text.
      #
      # @return [String]
      #
      def inspect
        "ProcessingInstruction(name: #{name.inspect} text: #{text.inspect})"
      end
    end # ProcessingInstruction
  end # XML
end # Oga
|
@@ -0,0 +1,175 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # The PullParser class can be used to parse an XML document incrementally
    # instead of parsing it as a whole. This results in lower memory usage and
    # potentially faster parsing times. The downside is that pull parsers are
    # typically more difficult to use compared to DOM parsers.
    #
    # Basic parsing using this class works as follows:
    #
    #     parser = Oga::XML::PullParser.new('... xml here ...')
    #
    #     parser.parse do |node|
    #       if node.is_a?(Oga::XML::Element)
    #
    #       end
    #     end
    #
    # This parser yields proper XML instances such as {Oga::XML::Element}.
    # Doctypes and XML declarations are ignored by this parser.
    #
    class PullParser < Parser
      # The node most recently produced by one of the callbacks below.
      #
      # @return [Oga::XML::Node]
      attr_reader :node

      # Array containing the names of the currently nested elements.
      #
      # @return [Array]
      attr_reader :nesting

      ##
      # Callbacks that are replaced with methods that do nothing and return
      # nil.
      #
      # @return [Array]
      #
      DISABLED_CALLBACKS = [
        :on_document,
        :on_doctype,
        :on_xml_decl,
        :on_element_children
      ]

      ##
      # Callbacks whose resulting node is stored in `@node` and then passed to
      # the block given to {#parse}.
      #
      # @return [Array]
      #
      BLOCK_CALLBACKS = [
        :on_cdata,
        :on_comment,
        :on_text,
        :on_proc_ins
      ]

      ##
      # Returns the shorthands that can be used for various node classes.
      #
      # @return [Hash]
      #
      NODE_SHORTHANDS = {
        :text            => XML::Text,
        :node            => XML::Node,
        :cdata           => XML::Cdata,
        :element         => XML::Element,
        :doctype         => XML::Doctype,
        :comment         => XML::Comment,
        :xml_declaration => XML::XmlDeclaration
      }

      ##
      # Resets the parent state and clears the stored block, the element
      # nesting and the current node.
      #
      # @see Oga::XML::Parser#reset
      #
      def reset
        super

        @block   = nil
        @nesting = []
        @node    = nil
      end

      ##
      # Parses the input and yields every node to the supplied block.
      #
      # The block is stored before the parent's `parse` runs because the
      # callbacks defined below invoke it. This method always returns nil.
      #
      # @yieldparam [Oga::XML::Node]
      #
      def parse(&block)
        @block = block

        super

        return
      end

      ##
      # Calls the supplied block if the current node type and optionally the
      # nesting match. This method allows you to write this:
      #
      #     parser.parse do |node|
      #       parser.on(:text, %w{people person name}) do
      #         puts node.text
      #       end
      #     end
      #
      # Instead of this:
      #
      #     parser.parse do |node|
      #       if node.is_a?(Oga::XML::Text) and parser.nesting == %w{people person name}
      #         puts node.text
      #       end
      #     end
      #
      # When calling this method you can specify the following node types:
      #
      # * `:cdata`
      # * `:comment`
      # * `:element`
      # * `:text`
      #
      # @example
      #  parser.on(:element, %w{people person name}) do
      #
      #  end
      #
      # @param [Symbol] type The type of node to act upon. This is a symbol as
      #  returned by {Oga::XML::Node#node_type}. It is looked up in
      #  {NODE_SHORTHANDS} to find the class to match against.
      #
      # @param [Array] nesting The element name nesting to act upon. An empty
      #  Array matches any nesting.
      #
      def on(type, nesting = [])
        return unless node.is_a?(NODE_SHORTHANDS[type])

        yield if nesting.empty? || nesting == self.nesting
      end

      # eval is a heck of a lot faster than define_method on both Rubinius and
      # JRuby.
      DISABLED_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          return
        end
        EOF
      end

      BLOCK_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          @node = super
          @block.call(@node)
          return
        end
        EOF
      end

      ##
      # Stores the element built by the parent class, pushes its name onto the
      # nesting stack and passes the element to the block. Returns nil.
      #
      # @see Oga::XML::Parser#on_element
      #
      def on_element(*args)
        @node = super

        nesting << @node.name

        @block.call(@node)

        return
      end

      ##
      # Pops the most recently opened element name off the nesting stack.
      # Returns nil.
      #
      # @see Oga::XML::Parser#after_element
      #
      def after_element(*args)
        nesting.pop

        return
      end
    end # PullParser
  end # XML
end # Oga
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # The Querying module provides methods that make it easy to run XPath/CSS
    # queries on XML documents/elements.
    #
    # The including object itself (`self`) is handed to the evaluator as the
    # query context.
    #
    module Querying
      ##
      # Evaluates the given XPath expression.
      #
      # @param [String] expression The XPath expression to run.
      # @param [Hash] variables Variables to bind.
      # @see Oga::XPath::Evaluator#initialize
      #
      def xpath(expression, variables = {})
        XPath::Evaluator.new(self, variables).evaluate(expression)
      end

      ##
      # Evaluates the given XPath expression and returns the first node in the
      # set. Results that are not an {Oga::XML::NodeSet} are returned as-is.
      #
      # @see #xpath
      #
      def at_xpath(*args)
        result = xpath(*args)

        result.is_a?(XML::NodeSet) ? result.first : result
      end

      ##
      # Evaluates the given CSS expression.
      #
      # The expression is parsed via `CSS::Parser.parse_with_cache` and the
      # resulting AST is evaluated by an XPath evaluator.
      #
      # @param [String] expression The CSS expression to run.
      # @see Oga::XPath::Evaluator#initialize
      #
      def css(expression)
        ast = CSS::Parser.parse_with_cache(expression)

        XPath::Evaluator.new(self).evaluate_ast(ast)
      end

      ##
      # Evaluates the given CSS expression and returns the first node in the
      # set. Results that are not an {Oga::XML::NodeSet} are returned as-is.
      #
      # @see #css
      #
      def at_css(*args)
        result = css(*args)

        result.is_a?(XML::NodeSet) ? result.first : result
      end
    end # Querying
  end # XML
end # Oga
|
@@ -0,0 +1,192 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # The SaxParser class provides the basic interface for writing custom SAX
    # parsers. All callback methods defined in {Oga::XML::Parser} are delegated
    # to a dedicated handler class.
    #
    # To write a custom handler for the SAX parser, create a class that
    # implements one (or many) of the following callback methods:
    #
    # * `on_document`
    # * `on_doctype`
    # * `on_cdata`
    # * `on_comment`
    # * `on_proc_ins`
    # * `on_xml_decl`
    # * `on_text`
    # * `on_element`
    # * `on_element_children`
    # * `on_attribute`
    # * `on_attributes`
    # * `after_element`
    #
    # For example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         puts name
    #       end
    #     end
    #
    # You can then use it as follows:
    #
    #     handler = SaxHandler.new
    #     parser  = Oga::XML::SaxParser.new(handler, '<foo />')
    #
    #     parser.parse
    #
    # For information on the callback arguments see the documentation of the
    # corresponding methods in {Oga::XML::Parser}.
    #
    # ## Element Callbacks
    #
    # The SAX parser changes the behaviour of both `on_element` and
    # `after_element`. The latter in the regular parser only takes a
    # {Oga::XML::Element} instance. In the SAX parser it will instead take a
    # namespace name and the element name. This eases the process of figuring
    # out what element a callback is associated with.
    #
    # An example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         # ...
    #       end
    #
    #       def after_element(namespace, name)
    #         puts name # => "foo", "bar", etc
    #       end
    #     end
    #
    # ## Attributes
    #
    # Attributes returned by `on_attribute` are passed as a Hash as the 3rd
    # argument of the `on_element` callback. The keys of this Hash are the
    # attribute names (optionally prefixed by their namespace) and their values.
    # You can overwrite `on_attribute` to control individual attributes and
    # `on_attributes` to control the final set.
    #
    class SaxParser < Parser
      ##
      # @param [Object] handler The SAX handler to delegate callbacks to.
      # @see Oga::XML::Parser#initialize
      #
      def initialize(handler, *args)
        # The handler is stored before calling the parent constructor, which
        # receives only the remaining arguments.
        @handler = handler

        super(*args)
      end

      # Delegate all callbacks to the handler object.
      #
      # This runs while the class body is being evaluated, before the manual
      # overrides below are defined; those later definitions replace the
      # generated ones for their respective callbacks. Every generated method
      # discards the handler's return value and returns nil.
      instance_methods.grep(/\A(on_|after_)/).each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          run_callback(:#{method}, *args)

          return
        end
        EOF
      end

      ##
      # Manually overwrite `on_element` so we can ensure that `after_element`
      # always receives the namespace and name.
      #
      # `attrs` defaults to an empty Hash, matching the Hash documented for the
      # handler's `on_element` callback and built by {#on_attributes}.
      #
      # @see Oga::XML::Parser#on_element
      # @return [Array] The namespace and name, in that order.
      #
      def on_element(namespace, name, attrs = {})
        run_callback(:on_element, namespace, name, attrs)

        [namespace, name]
      end

      ##
      # Manually overwrite `after_element` so it can take a namespace and name.
      # This differs a bit from the regular `after_element` which only takes an
      # {Oga::XML::Element} instance.
      #
      # @param [Array] namespace_with_name The pair returned by {#on_element};
      #  it is splatted into the handler's `after_element` arguments.
      #
      def after_element(namespace_with_name)
        run_callback(:after_element, *namespace_with_name)

        return
      end

      ##
      # Manually overwrite this method since for this one we _do_ want the
      # return value so it can be passed to `on_element`.
      #
      # When the handler implements `on_attribute` its return value is used
      # as-is. Otherwise a single-pair Hash is built: the key is the name,
      # prefixed with `ns:` when a namespace is given, and the value is passed
      # through `EntityDecoder.try_decode` unless it is nil/false.
      #
      # @see Oga::XML::Parser#on_attribute
      #
      def on_attribute(name, ns = nil, value = nil)
        if @handler.respond_to?(:on_attribute)
          return run_callback(:on_attribute, name, ns, value)
        end

        key = ns ? "#{ns}:#{name}" : name

        if value
          value = EntityDecoder.try_decode(value, @lexer.html?)
        end

        {key => value}
      end

      ##
      # Merges the attributes together into a Hash.
      #
      # When the handler implements `on_attributes` its return value is used
      # instead.
      #
      # @param [Array] attrs
      # @return [Hash]
      #
      def on_attributes(attrs)
        if @handler.respond_to?(:on_attributes)
          return run_callback(:on_attributes, attrs)
        end

        merged = {}

        attrs.each do |pair|
          # Hash#merge requires an extra allocation, this doesn't.
          pair.each { |key, value| merged[key] = value }
        end

        merged
      end

      ##
      # Passes text to the handler's `on_text` callback, if it has one. The
      # text is run through `EntityDecoder.try_decode` first, except when the
      # lexer reports being inside an HTML script or style element. Always
      # returns nil.
      #
      # @param [String] text
      #
      def on_text(text)
        if @handler.respond_to?(:on_text)
          unless inside_literal_html?
            text = EntityDecoder.try_decode(text, @lexer.html?)
          end

          run_callback(:on_text, text)
        end

        return
      end

      private

      ##
      # @return [TrueClass|FalseClass]
      #
      def inside_literal_html?
        @lexer.html_script? || @lexer.html_style?
      end

      ##
      # Calls the given method on the handler, but only if the handler
      # responds to it. Returns the handler's result, or nil when the callback
      # is not implemented.
      #
      # @param [Symbol] method
      # @param [Array] args
      #
      def run_callback(method, *args)
        @handler.send(method, *args) if @handler.respond_to?(method)
      end
    end # SaxParser
  end # XML
end # Oga
|