rubyjedi-oga 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +13 -0
- data/LICENSE +362 -0
- data/README.md +317 -0
- data/doc/css/common.css +77 -0
- data/doc/css_selectors.md +935 -0
- data/doc/manually_creating_documents.md +67 -0
- data/doc/migrating_from_nokogiri.md +169 -0
- data/doc/xml_namespaces.md +63 -0
- data/ext/c/extconf.rb +11 -0
- data/ext/c/lexer.c +2595 -0
- data/ext/c/lexer.h +16 -0
- data/ext/c/lexer.rl +198 -0
- data/ext/c/liboga.c +6 -0
- data/ext/c/liboga.h +11 -0
- data/ext/java/Liboga.java +14 -0
- data/ext/java/org/liboga/xml/Lexer.java +1363 -0
- data/ext/java/org/liboga/xml/Lexer.rl +223 -0
- data/ext/ragel/base_lexer.rl +633 -0
- data/lib/oga.rb +57 -0
- data/lib/oga/blacklist.rb +40 -0
- data/lib/oga/css/lexer.rb +743 -0
- data/lib/oga/css/parser.rb +976 -0
- data/lib/oga/entity_decoder.rb +21 -0
- data/lib/oga/html/entities.rb +2150 -0
- data/lib/oga/html/parser.rb +25 -0
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/lru.rb +160 -0
- data/lib/oga/oga.rb +57 -0
- data/lib/oga/version.rb +3 -0
- data/lib/oga/whitelist.rb +20 -0
- data/lib/oga/xml/attribute.rb +136 -0
- data/lib/oga/xml/cdata.rb +17 -0
- data/lib/oga/xml/character_node.rb +37 -0
- data/lib/oga/xml/comment.rb +17 -0
- data/lib/oga/xml/default_namespace.rb +13 -0
- data/lib/oga/xml/doctype.rb +82 -0
- data/lib/oga/xml/document.rb +108 -0
- data/lib/oga/xml/element.rb +428 -0
- data/lib/oga/xml/entities.rb +122 -0
- data/lib/oga/xml/html_void_elements.rb +15 -0
- data/lib/oga/xml/lexer.rb +550 -0
- data/lib/oga/xml/namespace.rb +48 -0
- data/lib/oga/xml/node.rb +219 -0
- data/lib/oga/xml/node_set.rb +333 -0
- data/lib/oga/xml/parser.rb +631 -0
- data/lib/oga/xml/processing_instruction.rb +37 -0
- data/lib/oga/xml/pull_parser.rb +175 -0
- data/lib/oga/xml/querying.rb +56 -0
- data/lib/oga/xml/sax_parser.rb +192 -0
- data/lib/oga/xml/text.rb +66 -0
- data/lib/oga/xml/traversal.rb +50 -0
- data/lib/oga/xml/xml_declaration.rb +65 -0
- data/lib/oga/xpath/evaluator.rb +1798 -0
- data/lib/oga/xpath/lexer.rb +1958 -0
- data/lib/oga/xpath/parser.rb +622 -0
- data/oga.gemspec +45 -0
- metadata +227 -0
@@ -0,0 +1,37 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Node holding the data of a single processing instruction: its name plus
    # the text inherited from {Oga::XML::CharacterNode}.
    #
    class ProcessingInstruction < CharacterNode
      # The name of the instruction.
      #
      # @return [String]
      attr_accessor :name

      ##
      # Builds the node. The options Hash is handed to the superclass
      # initializer as-is, after which the `:name` option is stored.
      #
      # @param [Hash] options
      #
      # @option options [String] :name The name of the instruction.
      # @see Oga::XML::CharacterNode#initialize
      #
      def initialize(options = {})
        super(options)

        @name = options[:name]
      end

      ##
      # Serializes the instruction as `<?` + name + text + `?>`. No separator
      # is inserted between the name and the text.
      #
      # @return [String]
      #
      def to_xml
        target  = name
        content = text

        "<?#{target}#{content}?>"
      end

      ##
      # Human readable representation showing the inspected name and text.
      #
      # @return [String]
      #
      def inspect
        shown_name = name.inspect
        shown_text = text.inspect

        "ProcessingInstruction(name: #{shown_name} text: #{shown_text})"
      end
    end # ProcessingInstruction
  end # XML
end # Oga
|
@@ -0,0 +1,175 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # The PullParser class can be used to parse an XML document incrementally
    # instead of parsing it as a whole. This results in lower memory usage and
    # potentially faster parsing times. The downside is that pull parsers are
    # typically more difficult to use compared to DOM parsers.
    #
    # Basic parsing using this class works as following:
    #
    #     parser = Oga::XML::PullParser.new('... xml here ...')
    #
    #     parser.parse do |node|
    #       if node.is_a?(Oga::XML::Element)
    #         # ...
    #       end
    #     end
    #
    # This parser yields proper XML instances such as {Oga::XML::Element}.
    # Doctypes and XML declarations are ignored by this parser.
    #
    class PullParser < Parser
      # The node most recently produced by one of the parser callbacks.
      #
      # @return [Oga::XML::Node]
      attr_reader :node

      # Array containing the names of the currently nested elements.
      # @return [Array]
      attr_reader :nesting

      ##
      # Parser callbacks that are turned into no-ops returning `nil`.
      #
      # @return [Array]
      #
      DISABLED_CALLBACKS = [
        :on_document,
        :on_doctype,
        :on_xml_decl,
        :on_element_children
      ]

      ##
      # Parser callbacks whose resulting node is stored in {#node} and then
      # passed to the block given to {#parse}.
      #
      # @return [Array]
      #
      BLOCK_CALLBACKS = [
        :on_cdata,
        :on_comment,
        :on_text,
        :on_proc_ins
      ]

      ##
      # Returns the shorthands that can be used for various node classes.
      #
      # @return [Hash]
      #
      NODE_SHORTHANDS = {
        :text            => XML::Text,
        :node            => XML::Node,
        :cdata           => XML::Cdata,
        :element         => XML::Element,
        :doctype         => XML::Doctype,
        :comment         => XML::Comment,
        :xml_declaration => XML::XmlDeclaration
      }

      ##
      # Resets the parent parser and clears the stored block, the nesting
      # stack and the current node.
      #
      # @see Oga::XML::Parser#reset
      #
      def reset
        super

        @block   = nil
        @nesting = []
        @node    = nil
      end

      ##
      # Parses the input and yields every node to the supplied block. Always
      # returns `nil`.
      #
      # @yieldparam [Oga::XML::Node]
      #
      def parse(&block)
        @block = block

        super

        return
      end

      ##
      # Calls the supplied block if the current node type and optionally the
      # nesting match. This method allows you to write this:
      #
      #     parser.parse do |node|
      #       parser.on(:text, %w{people person name}) do
      #         puts node.text
      #       end
      #     end
      #
      # Instead of this:
      #
      #     parser.parse do |node|
      #       if node.is_a?(Oga::XML::Text) and parser.nesting == %w{people person name}
      #         puts node.text
      #       end
      #     end
      #
      # When calling this method you can specify the following node types:
      #
      # * `:cdata`
      # * `:comment`
      # * `:element`
      # * `:text`
      #
      # Any other key of {NODE_SHORTHANDS} is accepted as well.
      #
      # @example
      #  parser.on(:element, %w{people person name}) do
      #
      #  end
      #
      # @param [Symbol] type The type of node to act upon, this must be one
      #  of the keys of {NODE_SHORTHANDS}.
      #
      # @param [Array] nesting The element name nesting to act upon. An empty
      #  Array (the default) matches any nesting.
      #
      # @raise [ArgumentError] When the type is not a key of
      #  {NODE_SHORTHANDS}.
      #
      # @return [Object] The return value of the block, or `nil` when the
      #  block was not called.
      #
      def on(type, nesting = [])
        # Looking the class up with Hash#[] would hand `nil` to `is_a?` for an
        # unknown type, which fails with an unhelpful "class or module
        # required" TypeError.
        klass = NODE_SHORTHANDS.fetch(type) do
          raise(
            ArgumentError,
            "Unknown node type #{type.inspect}, " \
              "expected one of #{NODE_SHORTHANDS.keys.inspect}"
          )
        end

        return unless node.is_a?(klass)

        yield if nesting.empty? || nesting == self.nesting
      end

      # eval is a heck of a lot faster than define_method on both Rubinius and
      # JRuby.
      DISABLED_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          return
        end
        EOF
      end

      BLOCK_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          @node = super
          @block.call(@node)
          return
        end
        EOF
      end

      ##
      # Stores the element built by the parent parser, pushes its name onto
      # the nesting stack and then passes the element to the block. Always
      # returns `nil`.
      #
      # @see Oga::XML::Parser#on_element
      #
      def on_element(*args)
        @node = super

        nesting << @node.name

        @block.call(@node)

        return
      end

      ##
      # Pops the name of the element that was just closed off the nesting
      # stack. Always returns `nil`.
      #
      # @see Oga::XML::Parser#after_element
      #
      def after_element(*args)
        nesting.pop

        return
      end
    end # PullParser
  end # XML
end # Oga
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Mixin adding XPath and CSS query helpers. Every query is evaluated with
    # the including object (`self`) as the evaluator's context.
    #
    module Querying
      ##
      # Runs an XPath expression against the receiver.
      #
      # @param [String] expression The XPath expression to run.
      # @param [Hash] variables Variables to bind.
      # @return [Object] Whatever the evaluator returns for the expression.
      # @see Oga::XPath::Evaluator#initialize
      #
      def xpath(expression, variables = {})
        evaluator = XPath::Evaluator.new(self, variables)

        evaluator.evaluate(expression)
      end

      ##
      # Same as {#xpath}, except that when the outcome is a node set only its
      # first node is returned. Any other kind of result is returned as-is.
      #
      # @see #xpath
      #
      def at_xpath(*args)
        found = xpath(*args)

        return found.first if found.is_a?(XML::NodeSet)

        found
      end

      ##
      # Runs a CSS expression against the receiver. The expression is parsed
      # using `CSS::Parser.parse_with_cache` and the resulting AST is given to
      # an XPath evaluator.
      #
      # @param [String] expression The CSS expression to run.
      # @return [Object] Whatever the evaluator returns for the AST.
      # @see Oga::XPath::Evaluator#evaluate_ast
      #
      def css(expression)
        parsed    = CSS::Parser.parse_with_cache(expression)
        evaluator = XPath::Evaluator.new(self)

        evaluator.evaluate_ast(parsed)
      end

      ##
      # Same as {#css}, except that when the outcome is a node set only its
      # first node is returned. Any other kind of result is returned as-is.
      #
      # @see #css
      #
      def at_css(*args)
        found = css(*args)

        return found.first if found.is_a?(XML::NodeSet)

        found
      end
    end # Querying
  end # XML
end # Oga
|
@@ -0,0 +1,192 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # The SaxParser class provides the basic interface for writing custom SAX
    # parsers. All callback methods defined in {Oga::XML::Parser} are delegated
    # to a dedicated handler class.
    #
    # To write a custom handler for the SAX parser, create a class that
    # implements one (or many) of the following callback methods:
    #
    # * `on_document`
    # * `on_doctype`
    # * `on_cdata`
    # * `on_comment`
    # * `on_proc_ins`
    # * `on_xml_decl`
    # * `on_text`
    # * `on_element`
    # * `on_element_children`
    # * `on_attribute`
    # * `on_attributes`
    # * `after_element`
    #
    # For example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         puts name
    #       end
    #     end
    #
    # You can then use it as following:
    #
    #     handler = SaxHandler.new
    #     parser  = Oga::XML::SaxParser.new(handler, '<foo />')
    #
    #     parser.parse
    #
    # For information on the callback arguments see the documentation of the
    # corresponding methods in {Oga::XML::Parser}.
    #
    # ## Element Callbacks
    #
    # The SAX parser changes the behaviour of both `on_element` and
    # `after_element`. The latter in the regular parser only takes a
    # {Oga::XML::Element} instance. In the SAX parser it will instead take a
    # namespace name and the element name. This eases the process of figuring
    # out what element a callback is associated with.
    #
    # An example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         # ...
    #       end
    #
    #       def after_element(namespace, name)
    #         puts name # => "foo", "bar", etc
    #       end
    #     end
    #
    # ## Attributes
    #
    # Attributes returned by `on_attribute` are passed as a Hash as the 3rd
    # argument of the `on_element` callback. The keys of this Hash are the
    # attribute names (optionally prefixed by their namespace) and their values.
    # You can overwrite `on_attribute` to control individual attributes and
    # `on_attributes` to control the final set.
    #
    class SaxParser < Parser
      ##
      # @param [Object] handler The SAX handler to delegate callbacks to.
      # @see Oga::XML::Parser#initialize
      #
      def initialize(handler, *args)
        @handler = handler

        super(*args)
      end

      # Callbacks that get a dedicated implementation further down. Generating
      # a delegator for these would only have it overwritten right away (and
      # trigger "method redefined" warnings when running with `ruby -w`).
      hand_written = [
        :on_element,
        :after_element,
        :on_attribute,
        :on_attributes,
        :on_text
      ]

      # Delegate all remaining callbacks to the handler object. The generated
      # methods discard the handler's return value and return `nil`.
      instance_methods.grep(/^(on_|after_)/).each do |method|
        next if hand_written.include?(method.to_sym)

        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          run_callback(:#{method}, *args)

          return
        end
        EOF
      end

      ##
      # Manually overwrite `on_element` so we can ensure that `after_element`
      # always receives the namespace and name.
      #
      # @param [String] namespace
      # @param [String] name
      # @param [Hash|Array] attrs The attributes forwarded to the handler.
      #  Defaults to an empty Array when omitted.
      #
      # @see Oga::XML::Parser#on_element
      # @return [Array] The namespace and name, in that order.
      #
      def on_element(namespace, name, attrs = [])
        run_callback(:on_element, namespace, name, attrs)

        [namespace, name]
      end

      ##
      # Manually overwrite `after_element` so it can take a namespace and name.
      # This differs a bit from the regular `after_element` which only takes an
      # {Oga::XML::Element} instance.
      #
      # @param [Array] namespace_with_name The pair returned by
      #  {#on_element}; it is splatted into the handler's arguments.
      #
      def after_element(namespace_with_name)
        run_callback(:after_element, *namespace_with_name)

        return
      end

      ##
      # Manually overwrite this method since for this one we _do_ want the
      # return value so it can be passed to `on_element`.
      #
      # When the handler implements `on_attribute` its return value is used
      # as-is. Otherwise a single pair Hash is returned: the key is the name,
      # prefixed with "namespace:" when a namespace is given, and the value
      # (when present) is run through `EntityDecoder.try_decode`.
      #
      # @see Oga::XML::Parser#on_attribute
      # @return [Hash|Object]
      #
      def on_attribute(name, ns = nil, value = nil)
        if @handler.respond_to?(:on_attribute)
          return run_callback(:on_attribute, name, ns, value)
        end

        key = ns ? "#{ns}:#{name}" : name

        if value
          value = EntityDecoder.try_decode(value, @lexer.html?)
        end

        {key => value}
      end

      ##
      # Merges the attributes together into a Hash, unless the handler
      # implements `on_attributes` in which case its return value is used.
      #
      # @param [Array] attrs
      # @return [Hash]
      #
      def on_attributes(attrs)
        if @handler.respond_to?(:on_attributes)
          return run_callback(:on_attributes, attrs)
        end

        merged = {}

        attrs.each do |pair|
          # Hash#merge requires an extra allocation, this doesn't.
          pair.each { |key, value| merged[key] = value }
        end

        merged
      end

      ##
      # Passes text to the handler's `on_text` callback, if it has one. The
      # text is first run through `EntityDecoder.try_decode`, except when the
      # lexer reports being inside an HTML script or style element. Always
      # returns `nil`.
      #
      # @param [String] text
      #
      def on_text(text)
        if @handler.respond_to?(:on_text)
          unless inside_literal_html?
            text = EntityDecoder.try_decode(text, @lexer.html?)
          end

          run_callback(:on_text, text)
        end

        return
      end

      private

      ##
      # @return [TrueClass|FalseClass]
      #
      def inside_literal_html?
        @lexer.html_script? || @lexer.html_style?
      end

      ##
      # Calls the given method on the handler, but only when the handler
      # responds to it.
      #
      # @param [Symbol] method
      # @param [Array] args
      # @return [Object] The handler's return value, `nil` when the handler
      #  does not respond to the method.
      #
      def run_callback(method, *args)
        @handler.send(method, *args) if @handler.respond_to?(method)
      end
    end # SaxParser
  end # XML
end # Oga
|