oga 1.0.2-java → 1.0.3-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -0
- data/ext/c/lexer.c +394 -312
- data/ext/c/lexer.rl +3 -3
- data/ext/java/org/liboga/xml/Lexer.java +216 -172
- data/ext/java/org/liboga/xml/Lexer.rl +1 -1
- data/ext/ragel/base_lexer.rl +30 -11
- data/lib/liboga.jar +0 -0
- data/lib/oga/blacklist.rb +2 -2
- data/lib/oga/css/parser.rb +26 -28
- data/lib/oga/entity_decoder.rb +2 -2
- data/lib/oga/html/entities.rb +1 -1
- data/lib/oga/lru.rb +6 -6
- data/lib/oga/oga.rb +14 -14
- data/lib/oga/version.rb +1 -1
- data/lib/oga/whitelist.rb +2 -2
- data/lib/oga/xml/attribute.rb +16 -18
- data/lib/oga/xml/cdata.rb +1 -1
- data/lib/oga/xml/character_node.rb +3 -5
- data/lib/oga/xml/comment.rb +1 -1
- data/lib/oga/xml/doctype.rb +21 -23
- data/lib/oga/xml/document.rb +11 -17
- data/lib/oga/xml/element.rb +19 -29
- data/lib/oga/xml/entities.rb +3 -3
- data/lib/oga/xml/lexer.rb +34 -15
- data/lib/oga/xml/namespace.rb +8 -10
- data/lib/oga/xml/node.rb +8 -10
- data/lib/oga/xml/node_set.rb +16 -18
- data/lib/oga/xml/parser.rb +1 -1
- data/lib/oga/xml/processing_instruction.rb +3 -5
- data/lib/oga/xml/pull_parser.rb +6 -9
- data/lib/oga/xml/querying.rb +4 -4
- data/lib/oga/xml/sax_parser.rb +4 -4
- data/lib/oga/xml/text.rb +4 -4
- data/lib/oga/xml/xml_declaration.rb +11 -15
- data/lib/oga/xpath/evaluator.rb +81 -81
- metadata +66 -66
data/lib/oga/xml/lexer.rb
CHANGED
@@ -34,12 +34,20 @@ module Oga
|
|
34
34
|
# However, it is perfectly safe to use different instances per thread.
|
35
35
|
# There is no _global_ state used by this lexer.
|
36
36
|
#
|
37
|
-
#
|
38
|
-
#
|
37
|
+
# ## Strict Mode
|
38
|
+
#
|
39
|
+
# By default the lexer is rather permissive regarding the input. For
|
40
|
+
# example, missing closing tags are inserted by default. To disable this
|
41
|
+
# behaviour the lexer can be run in "strict mode" by setting `:strict` to
|
42
|
+
# `true`:
|
43
|
+
#
|
44
|
+
# lexer = Oga::XML::Lexer.new('...', :strict => true)
|
45
|
+
#
|
46
|
+
# Strict mode only applies to XML documents.
|
47
|
+
#
|
48
|
+
# @private
|
39
49
|
#
|
40
50
|
class Lexer
|
41
|
-
attr_reader :html
|
42
|
-
|
43
51
|
# These are all constant/frozen to remove the need for String allocations
|
44
52
|
# every time they are referenced in the lexer.
|
45
53
|
HTML_SCRIPT = 'script'.freeze
|
@@ -99,13 +107,17 @@ module Oga
|
|
99
107
|
#
|
100
108
|
# @param [Hash] options
|
101
109
|
#
|
102
|
-
# @option options [
|
103
|
-
# the input as HTML instead of
|
104
|
-
# HTML void elements such as `<link href="">`.
|
110
|
+
# @option options [TrueClass|FalseClass] :html When set to `true` the
|
111
|
+
# lexer will treat the input as HTML instead of XML. This makes it
|
112
|
+
# possible to lex HTML void elements such as `<link href="">`.
|
113
|
+
#
|
114
|
+
# @option options [TrueClass|FalseClass] :strict Enables/disables strict
|
115
|
+
# parsing of XML documents, disabled by default.
|
105
116
|
#
|
106
117
|
def initialize(data, options = {})
|
107
|
-
@data
|
108
|
-
@html
|
118
|
+
@data = data
|
119
|
+
@html = options[:html]
|
120
|
+
@strict = options[:strict] || false
|
109
121
|
|
110
122
|
reset
|
111
123
|
end
|
@@ -163,7 +175,7 @@ module Oga
|
|
163
175
|
|
164
176
|
reset
|
165
177
|
|
166
|
-
|
178
|
+
tokens
|
167
179
|
end
|
168
180
|
|
169
181
|
##
|
@@ -193,7 +205,7 @@ module Oga
|
|
193
205
|
end
|
194
206
|
|
195
207
|
# Add any missing closing tags
|
196
|
-
|
208
|
+
if !strict? and !@elements.empty?
|
197
209
|
@elements.length.times { on_element_end }
|
198
210
|
end
|
199
211
|
ensure
|
@@ -204,21 +216,28 @@ module Oga
|
|
204
216
|
# @return [TrueClass|FalseClass]
|
205
217
|
#
|
206
218
|
def html?
|
207
|
-
|
219
|
+
@html == true
|
220
|
+
end
|
221
|
+
|
222
|
+
##
|
223
|
+
# @return [TrueClass|FalseClass]
|
224
|
+
#
|
225
|
+
def strict?
|
226
|
+
@strict
|
208
227
|
end
|
209
228
|
|
210
229
|
##
|
211
230
|
# @return [TrueClass|FalseClass]
|
212
231
|
#
|
213
232
|
def html_script?
|
214
|
-
|
233
|
+
html? && current_element == HTML_SCRIPT
|
215
234
|
end
|
216
235
|
|
217
236
|
##
|
218
237
|
# @return [TrueClass|FalseClass]
|
219
238
|
#
|
220
239
|
def html_style?
|
221
|
-
|
240
|
+
html? && current_element == HTML_STYLE
|
222
241
|
end
|
223
242
|
|
224
243
|
private
|
@@ -250,7 +269,7 @@ module Oga
|
|
250
269
|
# @return [String]
|
251
270
|
#
|
252
271
|
def current_element
|
253
|
-
|
272
|
+
@elements.last
|
254
273
|
end
|
255
274
|
|
256
275
|
##
|
data/lib/oga/xml/namespace.rb
CHANGED
@@ -4,14 +4,12 @@ module Oga
|
|
4
4
|
# The Namespace class contains information about XML namespaces such as the
|
5
5
|
# name and URI.
|
6
6
|
#
|
7
|
-
# @!attribute [r] name
|
8
|
-
# @return [String]
|
9
|
-
#
|
10
|
-
# @!attribute [r] uri
|
11
|
-
# @return [String]
|
12
|
-
#
|
13
7
|
class Namespace
|
14
|
-
|
8
|
+
# @return [String]
|
9
|
+
attr_accessor :name
|
10
|
+
|
11
|
+
# @return [String]
|
12
|
+
attr_accessor :uri
|
15
13
|
|
16
14
|
##
|
17
15
|
# @param [Hash] options
|
@@ -28,14 +26,14 @@ module Oga
|
|
28
26
|
# @return [String]
|
29
27
|
#
|
30
28
|
def to_s
|
31
|
-
|
29
|
+
name.to_s
|
32
30
|
end
|
33
31
|
|
34
32
|
##
|
35
33
|
# @return [String]
|
36
34
|
#
|
37
35
|
def inspect
|
38
|
-
|
36
|
+
"Namespace(name: #{name.inspect} uri: #{uri.inspect})"
|
39
37
|
end
|
40
38
|
|
41
39
|
##
|
@@ -43,7 +41,7 @@ module Oga
|
|
43
41
|
# @return [TrueClass|FalseClass]
|
44
42
|
#
|
45
43
|
def ==(other)
|
46
|
-
|
44
|
+
other.is_a?(self.class) && name == other.name && uri == other.uri
|
47
45
|
end
|
48
46
|
end # Namespace
|
49
47
|
end # XML
|
data/lib/oga/xml/node.rb
CHANGED
@@ -5,12 +5,10 @@ module Oga
|
|
5
5
|
# {Oga::XML::NodeSet} and can be used to query surrounding and parent
|
6
6
|
# nodes.
|
7
7
|
#
|
8
|
-
# @!attribute [r] node_set
|
9
|
-
# @return [Oga::XML::NodeSet]
|
10
|
-
#
|
11
8
|
class Node
|
12
9
|
include Traversal
|
13
10
|
|
11
|
+
# @return [Oga::XML::NodeSet]
|
14
12
|
attr_reader :node_set
|
15
13
|
|
16
14
|
##
|
@@ -42,7 +40,7 @@ module Oga
|
|
42
40
|
# @return [Oga::XML::NodeSet]
|
43
41
|
#
|
44
42
|
def children
|
45
|
-
|
43
|
+
@children ||= NodeSet.new([], self)
|
46
44
|
end
|
47
45
|
|
48
46
|
##
|
@@ -65,7 +63,7 @@ module Oga
|
|
65
63
|
# @return [Oga::XML::Node]
|
66
64
|
#
|
67
65
|
def parent
|
68
|
-
|
66
|
+
node_set ? node_set.owner : nil
|
69
67
|
end
|
70
68
|
|
71
69
|
##
|
@@ -76,7 +74,7 @@ module Oga
|
|
76
74
|
def previous
|
77
75
|
index = node_set.index(self) - 1
|
78
76
|
|
79
|
-
|
77
|
+
index >= 0 ? node_set[index] : nil
|
80
78
|
end
|
81
79
|
|
82
80
|
##
|
@@ -88,7 +86,7 @@ module Oga
|
|
88
86
|
index = node_set.index(self) + 1
|
89
87
|
length = node_set.length
|
90
88
|
|
91
|
-
|
89
|
+
index <= length ? node_set[index] : nil
|
92
90
|
end
|
93
91
|
|
94
92
|
##
|
@@ -142,7 +140,7 @@ module Oga
|
|
142
140
|
@root_node = node
|
143
141
|
end
|
144
142
|
|
145
|
-
|
143
|
+
@root_node
|
146
144
|
end
|
147
145
|
|
148
146
|
##
|
@@ -186,14 +184,14 @@ module Oga
|
|
186
184
|
@html_p = root.is_a?(Document) && root.html?
|
187
185
|
end
|
188
186
|
|
189
|
-
|
187
|
+
@html_p
|
190
188
|
end
|
191
189
|
|
192
190
|
##
|
193
191
|
# @return [TrueClass|FalseClass]
|
194
192
|
#
|
195
193
|
def xml?
|
196
|
-
|
194
|
+
!html?
|
197
195
|
end
|
198
196
|
end # Element
|
199
197
|
end # XML
|
data/lib/oga/xml/node_set.rb
CHANGED
@@ -31,12 +31,10 @@ module Oga
|
|
31
31
|
# If ownership was not handled then you'd have to manually set the
|
32
32
|
# `element` variable's `node_set` attribute after pushing it into a set.
|
33
33
|
#
|
34
|
-
# @!attribute [rw] owner
|
35
|
-
# @return [Oga::XML::Node]
|
36
|
-
#
|
37
34
|
class NodeSet
|
38
35
|
include Enumerable
|
39
36
|
|
37
|
+
# @return [Oga::XML::Node]
|
40
38
|
attr_accessor :owner
|
41
39
|
|
42
40
|
##
|
@@ -65,7 +63,7 @@ module Oga
|
|
65
63
|
# @return [Oga::XML::Node]
|
66
64
|
#
|
67
65
|
def last
|
68
|
-
|
66
|
+
@nodes[-1]
|
69
67
|
end
|
70
68
|
|
71
69
|
##
|
@@ -74,7 +72,7 @@ module Oga
|
|
74
72
|
# @return [TrueClass|FalseClass]
|
75
73
|
#
|
76
74
|
def empty?
|
77
|
-
|
75
|
+
@nodes.empty?
|
78
76
|
end
|
79
77
|
|
80
78
|
##
|
@@ -83,7 +81,7 @@ module Oga
|
|
83
81
|
# @return [Fixnum]
|
84
82
|
#
|
85
83
|
def length
|
86
|
-
|
84
|
+
@nodes.length
|
87
85
|
end
|
88
86
|
|
89
87
|
alias_method :count, :length
|
@@ -96,7 +94,7 @@ module Oga
|
|
96
94
|
# @return [Fixnum]
|
97
95
|
#
|
98
96
|
def index(node)
|
99
|
-
|
97
|
+
@nodes.index(node)
|
100
98
|
end
|
101
99
|
|
102
100
|
##
|
@@ -137,7 +135,7 @@ module Oga
|
|
137
135
|
|
138
136
|
remove_ownership(node)
|
139
137
|
|
140
|
-
|
138
|
+
node
|
141
139
|
end
|
142
140
|
|
143
141
|
##
|
@@ -150,7 +148,7 @@ module Oga
|
|
150
148
|
|
151
149
|
remove_ownership(node)
|
152
150
|
|
153
|
-
|
151
|
+
node
|
154
152
|
end
|
155
153
|
|
156
154
|
##
|
@@ -174,7 +172,7 @@ module Oga
|
|
174
172
|
# @return [Oga::XML::Node]
|
175
173
|
#
|
176
174
|
def [](index)
|
177
|
-
|
175
|
+
@nodes[index]
|
178
176
|
end
|
179
177
|
|
180
178
|
##
|
@@ -183,7 +181,7 @@ module Oga
|
|
183
181
|
# @return [Array]
|
184
182
|
#
|
185
183
|
def to_a
|
186
|
-
|
184
|
+
@nodes
|
187
185
|
end
|
188
186
|
|
189
187
|
##
|
@@ -194,7 +192,7 @@ module Oga
|
|
194
192
|
# @return [Oga::XML::NodeSet]
|
195
193
|
#
|
196
194
|
def +(other)
|
197
|
-
|
195
|
+
self.class.new(to_a | other.to_a)
|
198
196
|
end
|
199
197
|
|
200
198
|
##
|
@@ -204,7 +202,7 @@ module Oga
|
|
204
202
|
# @param [Oga::XML::NodeSet] other
|
205
203
|
#
|
206
204
|
def ==(other)
|
207
|
-
|
205
|
+
other.is_a?(NodeSet) && other.equal_nodes?(@nodes)
|
208
206
|
end
|
209
207
|
|
210
208
|
##
|
@@ -216,7 +214,7 @@ module Oga
|
|
216
214
|
# @param [Array<Oga::XML::Node>] nodes
|
217
215
|
#
|
218
216
|
def equal_nodes?(nodes)
|
219
|
-
|
217
|
+
@nodes == nodes
|
220
218
|
end
|
221
219
|
|
222
220
|
##
|
@@ -261,7 +259,7 @@ module Oga
|
|
261
259
|
|
262
260
|
remove_ownership(removed) if removed
|
263
261
|
|
264
|
-
|
262
|
+
removed
|
265
263
|
end
|
266
264
|
|
267
265
|
##
|
@@ -279,7 +277,7 @@ module Oga
|
|
279
277
|
end
|
280
278
|
end
|
281
279
|
|
282
|
-
|
280
|
+
values
|
283
281
|
end
|
284
282
|
|
285
283
|
alias_method :attr, :attribute
|
@@ -298,7 +296,7 @@ module Oga
|
|
298
296
|
end
|
299
297
|
end
|
300
298
|
|
301
|
-
|
299
|
+
text
|
302
300
|
end
|
303
301
|
|
304
302
|
##
|
@@ -307,7 +305,7 @@ module Oga
|
|
307
305
|
def inspect
|
308
306
|
values = @nodes.map(&:inspect).join(', ')
|
309
307
|
|
310
|
-
|
308
|
+
"NodeSet(#{values})"
|
311
309
|
end
|
312
310
|
|
313
311
|
private
|
data/lib/oga/xml/parser.rb
CHANGED
@@ -291,7 +291,7 @@ class Parser < LL::Driver
|
|
291
291
|
# @return [Oga::XML::Document]
|
292
292
|
#
|
293
293
|
def on_document(children = [])
|
294
|
-
document = Document.new(:type => @lexer.html ? :html : :xml)
|
294
|
+
document = Document.new(:type => @lexer.html? ? :html : :xml)
|
295
295
|
|
296
296
|
children.each do |child|
|
297
297
|
if child.is_a?(Doctype)
|
data/lib/oga/xml/processing_instruction.rb
CHANGED
@@ -3,10 +3,8 @@ module Oga
|
|
3
3
|
##
|
4
4
|
# Class used for storing information about a single processing instruction.
|
5
5
|
#
|
6
|
-
# @!attribute [rw] name
|
7
|
-
# @return [String]
|
8
|
-
#
|
9
6
|
class ProcessingInstruction < CharacterNode
|
7
|
+
# @return [String]
|
10
8
|
attr_accessor :name
|
11
9
|
|
12
10
|
##
|
@@ -25,14 +23,14 @@ module Oga
|
|
25
23
|
# @return [String]
|
26
24
|
#
|
27
25
|
def to_xml
|
28
|
-
|
26
|
+
"<?#{name}#{text}?>"
|
29
27
|
end
|
30
28
|
|
31
29
|
##
|
32
30
|
# @return [String]
|
33
31
|
#
|
34
32
|
def inspect
|
35
|
-
|
33
|
+
"ProcessingInstruction(name: #{name.inspect} text: #{text.inspect})"
|
36
34
|
end
|
37
35
|
end # ProcessingInstruction
|
38
36
|
end # XML
|
data/lib/oga/xml/pull_parser.rb
CHANGED
@@ -19,16 +19,13 @@ module Oga
|
|
19
19
|
# This parser yields proper XML instances such as {Oga::XML::Element}.
|
20
20
|
# Doctypes and XML declarations are ignored by this parser.
|
21
21
|
#
|
22
|
-
# @!attribute [r] node
|
23
|
-
# The current node.
|
24
|
-
# @return [Oga::XML::Node]
|
25
|
-
#
|
26
|
-
# @!attribute [r] nesting
|
27
|
-
# Array containing the names of the currently nested elements.
|
28
|
-
# @return [Array]
|
29
|
-
#
|
30
22
|
class PullParser < Parser
|
31
|
-
|
23
|
+
# @return [Oga::XML::Node]
|
24
|
+
attr_reader :node
|
25
|
+
|
26
|
+
# Array containing the names of the currently nested elements.
|
27
|
+
# @return [Array]
|
28
|
+
attr_reader :nesting
|
32
29
|
|
33
30
|
##
|
34
31
|
# @return [Array]
|
data/lib/oga/xml/querying.rb
CHANGED
@@ -13,7 +13,7 @@ module Oga
|
|
13
13
|
# @see [Oga::XPath::Evaluator#initialize]
|
14
14
|
#
|
15
15
|
def xpath(expression, variables = {})
|
16
|
-
|
16
|
+
XPath::Evaluator.new(self, variables).evaluate(expression)
|
17
17
|
end
|
18
18
|
|
19
19
|
##
|
@@ -25,7 +25,7 @@ module Oga
|
|
25
25
|
def at_xpath(*args)
|
26
26
|
result = xpath(*args)
|
27
27
|
|
28
|
-
|
28
|
+
result.is_a?(XML::NodeSet) ? result.first : result
|
29
29
|
end
|
30
30
|
|
31
31
|
##
|
@@ -37,7 +37,7 @@ module Oga
|
|
37
37
|
def css(expression)
|
38
38
|
ast = CSS::Parser.parse_with_cache(expression)
|
39
39
|
|
40
|
-
|
40
|
+
XPath::Evaluator.new(self).evaluate_ast(ast)
|
41
41
|
end
|
42
42
|
|
43
43
|
##
|
@@ -49,7 +49,7 @@ module Oga
|
|
49
49
|
def at_css(*args)
|
50
50
|
result = css(*args)
|
51
51
|
|
52
|
-
|
52
|
+
result.is_a?(XML::NodeSet) ? result.first : result
|
53
53
|
end
|
54
54
|
end # Querying
|
55
55
|
end # XML
|