oga 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -0
- data/README.md +38 -8
- data/ext/c/extconf.rb +3 -5
- data/ext/c/lexer.c +545 -537
- data/ext/c/lexer.rl +6 -5
- data/ext/java/org/liboga/xml/Lexer.java +159 -153
- data/ext/ragel/base_lexer.rl +1 -1
- data/lib/oga.rb +4 -2
- data/lib/oga/css/lexer.rb +2 -3
- data/lib/oga/css/parser.rb +619 -473
- data/lib/oga/html/entities.rb +2150 -0
- data/lib/oga/lru.rb +158 -0
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/attribute.rb +1 -1
- data/lib/oga/xml/default_namespace.rb +13 -0
- data/lib/oga/xml/element.rb +25 -1
- data/lib/oga/xml/entities.rb +40 -15
- data/lib/oga/xml/lexer.rb +2 -2
- data/lib/oga/xml/namespace.rb +8 -0
- data/lib/oga/xml/node.rb +16 -0
- data/lib/oga/xml/node_set.rb +2 -2
- data/lib/oga/xml/parser.rb +367 -408
- data/lib/oga/xml/pull_parser.rb +1 -3
- data/lib/oga/xml/querying.rb +1 -1
- data/lib/oga/xml/sax_parser.rb +50 -1
- data/lib/oga/xml/text.rb +33 -4
- data/lib/oga/xpath/evaluator.rb +6 -1
- data/lib/oga/xpath/lexer.rb +2 -3
- data/lib/oga/xpath/parser.rb +528 -470
- data/oga.gemspec +1 -1
- metadata +13 -17
- data/doc/changelog.md +0 -324
data/oga.gemspec
CHANGED
@@ -32,8 +32,8 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.has_rdoc = 'yard'
|
33
33
|
s.required_ruby_version = '>= 1.9.3'
|
34
34
|
|
35
|
-
s.add_dependency 'racc', ['~> 1.4', '>= 1.4.12']
|
36
35
|
s.add_dependency 'ast'
|
36
|
+
s.add_dependency 'ruby-ll', '~> 2.1'
|
37
37
|
|
38
38
|
s.add_development_dependency 'rake'
|
39
39
|
s.add_development_dependency 'rspec', ['~> 3.0']
|
metadata
CHANGED
@@ -1,49 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: oga
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yorick Peterse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-04 00:00:00.000000000 Z
|
11
|
+
date: 2015-04-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: racc
|
14
|
+
name: ast
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.4'
|
20
17
|
- - ">="
|
21
18
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.4.12
|
19
|
+
version: '0'
|
23
20
|
type: :runtime
|
24
21
|
prerelease: false
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
|
-
- - "~>"
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: '1.4'
|
30
24
|
- - ">="
|
31
25
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.4.12
|
26
|
+
version: '0'
|
33
27
|
- !ruby/object:Gem::Dependency
|
34
|
-
name: ast
|
28
|
+
name: ruby-ll
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
36
30
|
requirements:
|
37
|
-
- - ">="
|
31
|
+
- - "~>"
|
38
32
|
- !ruby/object:Gem::Version
|
39
|
-
version: '0'
|
33
|
+
version: '2.1'
|
40
34
|
type: :runtime
|
41
35
|
prerelease: false
|
42
36
|
version_requirements: !ruby/object:Gem::Requirement
|
43
37
|
requirements:
|
44
|
-
- - ">="
|
38
|
+
- - "~>"
|
45
39
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0'
|
40
|
+
version: '2.1'
|
47
41
|
- !ruby/object:Gem::Dependency
|
48
42
|
name: rake
|
49
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -153,7 +147,6 @@ files:
|
|
153
147
|
- LICENSE
|
154
148
|
- README.md
|
155
149
|
- doc/DCO.md
|
156
|
-
- doc/changelog.md
|
157
150
|
- doc/css/common.css
|
158
151
|
- doc/css_selectors.md
|
159
152
|
- doc/manually_creating_documents.md
|
@@ -172,14 +165,17 @@ files:
|
|
172
165
|
- lib/oga.rb
|
173
166
|
- lib/oga/css/lexer.rb
|
174
167
|
- lib/oga/css/parser.rb
|
168
|
+
- lib/oga/html/entities.rb
|
175
169
|
- lib/oga/html/parser.rb
|
176
170
|
- lib/oga/html/sax_parser.rb
|
171
|
+
- lib/oga/lru.rb
|
177
172
|
- lib/oga/oga.rb
|
178
173
|
- lib/oga/version.rb
|
179
174
|
- lib/oga/xml/attribute.rb
|
180
175
|
- lib/oga/xml/cdata.rb
|
181
176
|
- lib/oga/xml/character_node.rb
|
182
177
|
- lib/oga/xml/comment.rb
|
178
|
+
- lib/oga/xml/default_namespace.rb
|
183
179
|
- lib/oga/xml/doctype.rb
|
184
180
|
- lib/oga/xml/document.rb
|
185
181
|
- lib/oga/xml/element.rb
|
data/doc/changelog.md
DELETED
@@ -1,324 +0,0 @@
|
|
1
|
-
# Changelog
|
2
|
-
|
3
|
-
This document contains details of the various releases and their release dates.
|
4
|
-
Dates are in the format `yyyy-mm-dd`.
|
5
|
-
|
6
|
-
## 0.2.3 - 2015-03-04
|
7
|
-
|
8
|
-
This release adds support for lexing HTML `<style>` tags similar to how
|
9
|
-
`<script>` tags are handled. This ensures that the contents of these tags are
|
10
|
-
treated as-is without any HTML entity conversion being applied.
|
11
|
-
|
12
|
-
See commits 78e40b55c0e5941bee5791a5014260e9c2cf8aad and
|
13
|
-
3b2055a30b128aa679a83332dfdfa68314271b24 for more information.
|
14
|
-
|
15
|
-
## 0.2.2 - 2015-03-03
|
16
|
-
|
17
|
-
This release fixes a bug where setting the text of an element using
|
18
|
-
`Oga::XML::Element#inner_text=` would not set the parent element of the newly
|
19
|
-
created text node. This would result in the following:
|
20
|
-
|
21
|
-
some_element.inner_text = 'foo'
|
22
|
-
|
23
|
-
some_element.children[0].parent # => nil
|
24
|
-
|
25
|
-
Here `parent` is supposed to return `some_element` instead. See commit
|
26
|
-
142b467277dc9864df8279347ba737ddf60f4836 for more information.
|
27
|
-
|
28
|
-
## 0.2.1 - 2015-03-02
|
29
|
-
|
30
|
-
### Proper HTML serializing support for script tags
|
31
|
-
|
32
|
-
When serializing an HTML document back to HTML (as a String) the contents of
|
33
|
-
`<script>` tags are serialized correctly. Previously XML unsafe characters
|
34
|
-
(e.g. `<`) would be converted to XML entities, which results in invalid
|
35
|
-
Javascript syntax. This has been changed so that `<script>` tags in HTML
|
36
|
-
documents _don't_ have their contents converted, ensuring proper Javascript
|
37
|
-
syntax upon output.
|
38
|
-
|
39
|
-
See commit 874d7124af540f0bc78e6c586868bbffb4310c5d and issue
|
40
|
-
<https://github.com/YorickPeterse/oga/issues/79> for more information.
|
41
|
-
|
42
|
-
### Proper lexing support for script tags
|
43
|
-
|
44
|
-
When lexing HTML documents the XML lexer is now capable of lexing the contents
|
45
|
-
of `<script>` tags properly. Previously input such as `<script>x >y</script>`
|
46
|
-
would result in incorrect tokens being emitted. See commit
|
47
|
-
ba2177e2cfda958ea12c5b04dbf60907aaa8816d and issue
|
48
|
-
<https://github.com/YorickPeterse/oga/issues/70> for more information.
|
49
|
-
|
50
|
-
### Element Inner Text
|
51
|
-
|
52
|
-
When setting the inner text of an element using `Oga::XML::Element#inner_text=`
|
53
|
-
_all_ child nodes of the element are now removed first, instead of only text
|
54
|
-
nodes being removed.
|
55
|
-
|
56
|
-
See <https://github.com/YorickPeterse/oga/issues/64> for more information.
|
57
|
-
|
58
|
-
### Support for extra XML entities
|
59
|
-
|
60
|
-
Support for encoding/decoding extra XML entities was added by Dmitry
|
61
|
-
Krasnoukhov. This includes entities such as `&lt;`, `&quot;`, etc. See commit
|
62
|
-
26baf89440d97bd9dd5e50ec3d6d9b7ab3bdf737 for more information.
|
63
|
-
|
64
|
-
### Support for inline doctypes with newlines in IO input
|
65
|
-
|
66
|
-
The XML lexer (and thus the parser) can now handle inline doctypes containing
|
67
|
-
newlines when using an IO object as the input. For example:
|
68
|
-
|
69
|
-
<!DOCTYPE html[foo
|
70
|
-
bar]>
|
71
|
-
|
72
|
-
Previously this would result in incorrect tokens being emitted by the lexer. See
|
73
|
-
commit cbb2815146a79805b8da483d2ef48d17e2959e72 for more information.
|
74
|
-
|
75
|
-
## 0.2.0 - 2014-11-17
|
76
|
-
|
77
|
-
### CSS Selector Support
|
78
|
-
|
79
|
-
Probably the biggest feature of this release: support for querying documents
|
80
|
-
using CSS selectors. Oga supports a subset of the CSS3 selector specification,
|
81
|
-
in particular the following selectors are supported:
|
82
|
-
|
83
|
-
* Element, class and ID selectors
|
84
|
-
* Attribute selectors (e.g. `foo[x ~= "y"]`)
|
85
|
-
|
86
|
-
The following pseudo classes are supported:
|
87
|
-
|
88
|
-
* `:root`
|
89
|
-
* `:nth-child(n)`
|
90
|
-
* `:nth-last-child(n)`
|
91
|
-
* `:nth-of-type(n)`
|
92
|
-
* `:nth-last-of-type(n)`
|
93
|
-
* `:first-child`
|
94
|
-
* `:last-child`
|
95
|
-
* `:first-of-type`
|
96
|
-
* `:last-of-type`
|
97
|
-
* `:only-child`
|
98
|
-
* `:only-of-type`
|
99
|
-
* `:empty`
|
100
|
-
|
101
|
-
You can use CSS selectors using the methods `css` and `at_css` on an instance of
|
102
|
-
`Oga::XML::Document` or `Oga::XML::Element`. For example:
|
103
|
-
|
104
|
-
document = Oga.parse_xml('<people><person>Alice</person></people>')
|
105
|
-
|
106
|
-
document.css('people person') # => NodeSet(Element(name: "person" ...))
|
107
|
-
|
108
|
-
The architecture behind this is quite similar to parsing XPath. There's a lexer
|
109
|
-
(`Oga::CSS::Lexer`) and a parser (`Oga::CSS::Parser`). Unlike Nokogiri (and
|
110
|
-
perhaps other libraries) the parser _does not_ output XPath expressions as a
|
111
|
-
String or a CSS specific AST. Instead it directly emits an XPath AST. This
|
112
|
-
allows the resulting AST to be directly evaluated by `Oga::XPath::Evaluator`.
|
113
|
-
|
114
|
-
See <https://github.com/YorickPeterse/oga/issues/11> for more information.
|
115
|
-
|
116
|
-
### Multi-line Attribute Support
|
117
|
-
|
118
|
-
Oga can now lex/parse elements that have attributes with newlines in them.
|
119
|
-
Previously this would trigger memory allocation errors.
|
120
|
-
|
121
|
-
See <https://github.com/YorickPeterse/oga/issues/58> for more information.
|
122
|
-
|
123
|
-
### SAX after_element
|
124
|
-
|
125
|
-
The `after_element` method in the SAX parsing API now always takes two
|
126
|
-
arguments: the namespace name and element name. Previously this method would
|
127
|
-
always receive a single nil value as its argument, which is rather pointless.
|
128
|
-
|
129
|
-
See <https://github.com/YorickPeterse/oga/issues/54> for more information.
|
130
|
-
|
131
|
-
### XPath Grouping
|
132
|
-
|
133
|
-
XPath expressions can now be grouped together using parentheses. This allows one
|
134
|
-
to specify a custom operator precedence.
|
135
|
-
|
136
|
-
### Enumerator Parsing Input
|
137
|
-
|
138
|
-
Enumerator instances can now be used as input for `Oga.parse_xml` and friends.
|
139
|
-
This can be used to download and parse XML files on the fly. For example:
|
140
|
-
|
141
|
-
enum = Enumerator.new do |yielder|
|
142
|
-
HTTPClient.get('http://some-website.com/some-big-file.xml') do |chunk|
|
143
|
-
yielder << chunk
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
document = Oga.parse_xml(enum)
|
148
|
-
|
149
|
-
See <https://github.com/YorickPeterse/oga/issues/48> for more information.
|
150
|
-
|
151
|
-
### Removing Attributes
|
152
|
-
|
153
|
-
Element attributes can now be removed using `Oga::XML::Element#unset`:
|
154
|
-
|
155
|
-
element = Oga::XML::Element.new(:name => 'foo')
|
156
|
-
|
157
|
-
element.set('class', 'foo')
|
158
|
-
element.unset('class')
|
159
|
-
|
160
|
-
### XPath Attributes
|
161
|
-
|
162
|
-
XPath predicates are now evaluated for every context node as opposed to being
|
163
|
-
evaluated once for the entire context. This ensures that expressions such as
|
164
|
-
`descendant-or-self::node()/foo[1]` are evaluated correctly.
|
165
|
-
|
166
|
-
### Available Namespaces
|
167
|
-
|
168
|
-
When calling `Oga::XML::Element#available_namespaces` the Hash returned by
|
169
|
-
`Oga::XML::Element#namespaces` would be modified in place. This was a bug that
|
170
|
-
has been fixed in this release.
|
171
|
-
|
172
|
-
### NodeSets
|
173
|
-
|
174
|
-
NodeSet instances can now be compared with each other using `==`. Previously
|
175
|
-
this would always consider two instances to be different from each other due to
|
176
|
-
the usage of the default `Object#==` method.
|
177
|
-
|
178
|
-
### XML Entities
|
179
|
-
|
180
|
-
XML entities such as `&amp;` and `&lt;` are now encoded/decoded by the lexer,
|
181
|
-
string and text nodes.
|
182
|
-
|
183
|
-
See <https://github.com/YorickPeterse/oga/issues/49> for more information.
|
184
|
-
|
185
|
-
### General
|
186
|
-
|
187
|
-
Source lines are no longer included in error messages generated by the XML
|
188
|
-
parser. This simplifies the code and removes the need of re-reading the input
|
189
|
-
(in case of IO/Enumerable inputs).
|
190
|
-
|
191
|
-
### XML Lexer Newlines
|
192
|
-
|
193
|
-
Newlines in the XML lexer are now counted in native code (C/Java). On MRI and
|
194
|
-
JRuby the improvement is quite small, but on Rubinius it's a massive
|
195
|
-
improvement. See commit `8db77c0a09bf6c996dd2856a6dbe1ad076b1d30a` for more
|
196
|
-
information.
|
197
|
-
|
198
|
-
### HTML Void Element Performance
|
199
|
-
|
200
|
-
Performance for detecting HTML void elements (e.g. `<br>` and `<link>`) has been
|
201
|
-
improved by removing String allocations that were not needed.
|
202
|
-
|
203
|
-
## 0.1.3 - 2014-09-24
|
204
|
-
|
205
|
-
This release fixes a problem with serializing attributes using the namespace
|
206
|
-
prefix "xmlns". See <https://github.com/YorickPeterse/oga/issues/47> for more
|
207
|
-
information.
|
208
|
-
|
209
|
-
## 0.1.2 - 2014-09-23
|
210
|
-
|
211
|
-
### SAX API
|
212
|
-
|
213
|
-
A SAX parser/API has been added. This API is useful when even the overhead of
|
214
|
-
the pull-parser is too much memory-wise. Example:
|
215
|
-
|
216
|
-
class ElementNames
|
217
|
-
attr_reader :names
|
218
|
-
|
219
|
-
def initialize
|
220
|
-
@names = []
|
221
|
-
end
|
222
|
-
|
223
|
-
def on_element(namespace, name, attrs = {})
|
224
|
-
@names << name
|
225
|
-
end
|
226
|
-
end
|
227
|
-
|
228
|
-
handler = ElementNames.new
|
229
|
-
|
230
|
-
Oga.sax_parse_xml(handler, '<foo><bar></bar></foo>')
|
231
|
-
|
232
|
-
handler.names # => ["foo", "bar"]
|
233
|
-
|
234
|
-
### Racc Gem
|
235
|
-
|
236
|
-
Oga will now always use the Racc gem instead of the version shipped with the
|
237
|
-
Ruby standard library.
|
238
|
-
|
239
|
-
### Error Reporting
|
240
|
-
|
241
|
-
XML parser errors have been made a little bit more user friendly, though they
|
242
|
-
can still be quite cryptic.
|
243
|
-
|
244
|
-
### Serializing Elements
|
245
|
-
|
246
|
-
Elements serialized to XML/HTML will use self-closing tags whenever possible.
|
247
|
-
When parsing HTML documents only HTML void elements will use self-closing tags
|
248
|
-
(e.g. `<link>` tags). Example:
|
249
|
-
|
250
|
-
Oga.parse_xml('<foo></foo>').to_xml # => "<foo />"
|
251
|
-
Oga.parse_html('<script></script>').to_xml # => "<script></script>"
|
252
|
-
|
253
|
-
### Default Namespaces
|
254
|
-
|
255
|
-
Namespaces are no longer removed from the attributes list when an element is
|
256
|
-
created.
|
257
|
-
|
258
|
-
Default XML namespaces can now be registered using `xmlns="..."`. Previously
|
259
|
-
this would be ignored. Example:
|
260
|
-
|
261
|
-
document = Oga.parse_xml('<root xmlns="baz"></root>')
|
262
|
-
root = document.children[0]
|
263
|
-
|
264
|
-
root.namespace # => Namespace(name: "xmlns" uri: "baz")
|
265
|
-
|
266
|
-
### Lexing Incomplete Input
|
267
|
-
|
268
|
-
Oga can now lex input such as `</` without entering an infinite loop. Example:
|
269
|
-
|
270
|
-
Oga.parse_xml('</') # => Document(children: NodeSet(Text("</")))
|
271
|
-
|
272
|
-
### Absolute XPath Paths
|
273
|
-
|
274
|
-
Oga can now parse and evaluate the XPath expression "/" (that is, just "/").
|
275
|
-
This will return the root node (usually a Document instance). Example:
|
276
|
-
|
277
|
-
document = Oga.parse_xml('<root></root>')
|
278
|
-
|
279
|
-
document.xpath('/') # => NodeSet(Document(children: NodeSet(Element(name: "root"))))
|
280
|
-
|
281
|
-
### Namespace Ordering
|
282
|
-
|
283
|
-
Namespaces available to an element are now returned in the correct order.
|
284
|
-
Previously outer namespaces would take precedence over inner namespaces, instead
|
285
|
-
of it being the other way around. Example:
|
286
|
-
|
287
|
-
document = Oga.parse_xml <<-EOF
|
288
|
-
<root xmlns:foo="bar">
|
289
|
-
<container xmlns:foo="baz">
|
290
|
-
<foo:text>Text!</foo:text>
|
291
|
-
</container>
|
292
|
-
</root>
|
293
|
-
EOF
|
294
|
-
|
295
|
-
foo = document.at_xpath('root/container/foo:text')
|
296
|
-
|
297
|
-
foo.namespace # => Namespace(name: "foo" uri: "baz")
|
298
|
-
|
299
|
-
### Parsing Capitalized HTML Void Elements
|
300
|
-
|
301
|
-
Oga is now capable of parsing capitalized HTML void elements (e.g. `<BR>`).
|
302
|
-
Previously it could only parse lower-cased void elements. Thanks to Tero Tasanen
|
303
|
-
for fixing this. Example:
|
304
|
-
|
305
|
-
Oga.parse_html('<BR>') # => Document(children: NodeSet(Element(name: "BR")))
|
306
|
-
|
307
|
-
### Node Type Method Removed
|
308
|
-
|
309
|
-
The `node_type` method has been removed and its purpose has been moved into
|
310
|
-
the `XML::PullParser` class itself. This method was solely used by the pull
|
311
|
-
parser to provide shorthands for node classes. As such it doesn't make sense to
|
312
|
-
expose this to the outside world as a public method.
|
313
|
-
|
314
|
-
## 0.1.1 - 2014-09-13
|
315
|
-
|
316
|
-
This release fixes a problem where element attributes were not separated by
|
317
|
-
spaces. Thanks to Jonathan Rochkind for reporting it and Bill Dueber for providing
|
318
|
-
an initial patch for this problem.
|
319
|
-
|
320
|
-
## 0.1.0 - 2014-09-12
|
321
|
-
|
322
|
-
The first public release of Oga. This release contains support for parsing XML,
|
323
|
-
basic support for parsing HTML, support for querying documents using XPath and
|
324
|
-
more.
|