saxerator 0.3.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +1 -0
- data/README.md +13 -20
- data/lib/saxerator.rb +3 -3
- data/lib/saxerator/dsl.rb +6 -1
- data/lib/saxerator/hash_element.rb +11 -0
- data/lib/saxerator/parser/accumulator.rb +1 -1
- data/lib/saxerator/parser/at_depth_latch.rb +1 -3
- data/lib/saxerator/parser/child_of_latch.rb +5 -14
- data/lib/saxerator/parser/{for_tag_latch.rb → for_tags_latch.rb} +5 -5
- data/lib/saxerator/parser/within_latch.rb +5 -5
- data/lib/saxerator/string_element.rb +12 -0
- data/lib/saxerator/version.rb +1 -1
- data/lib/saxerator/xml_node.rb +19 -22
- data/saxerator.gemspec +3 -6
- data/spec/fixtures/nested_elements.xml +1 -1
- data/spec/lib/dsl/all_spec.rb +20 -0
- data/spec/lib/dsl/at_depth_spec.rb +34 -0
- data/spec/lib/dsl/child_of_spec.rb +36 -0
- data/spec/lib/dsl/for_tag_spec.rb +20 -0
- data/spec/lib/dsl/for_tags_spec.rb +19 -0
- data/spec/lib/dsl/with_attribute_spec.rb +28 -0
- data/spec/lib/dsl/within_spec.rb +28 -0
- data/spec/lib/saxerator_spec.rb +40 -135
- metadata +24 -28
- data/lib/saxerator/hash_with_attributes.rb +0 -5
- data/lib/saxerator/string_with_attributes.rb +0 -5
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,10 +1,8 @@
|
|
1
1
|
Saxerator [![Build Status](https://secure.travis-ci.org/soulcutter/saxerator.png?branch=master)](http://travis-ci.org/soulcutter/saxerator)
|
2
2
|
=========
|
3
3
|
|
4
|
-
Saxerator is a
|
5
|
-
|
6
|
-
This approach is ideal for large xml files containing a collection of elements that you can process
|
7
|
-
independently.
|
4
|
+
Saxerator is a streaming xml-to-hash parser designed for working with very large xml files by
|
5
|
+
giving you Enumerable access to manageable chunks of the document.
|
8
6
|
|
9
7
|
Each xml chunk is parsed into a JSON-like Ruby Hash structure for consumption.
|
10
8
|
|
@@ -21,14 +19,15 @@ The DSL consists of predicates that may be combined to describe which elements t
|
|
21
19
|
Saxerator will only enumerate over chunks of xml that match all of the combined predicates (see Examples section
|
22
20
|
for added clarity).
|
23
21
|
|
24
|
-
| Predicate
|
25
|
-
|
26
|
-
| `all`
|
27
|
-
| `for_tag(name)`
|
28
|
-
| `
|
29
|
-
| `
|
30
|
-
| `
|
31
|
-
| `
|
22
|
+
| Predicate | Explanation |
|
23
|
+
|:-----------------|:------------|
|
24
|
+
| `all` | Returns the entire document parsed into a hash. Cannot combine with other predicates
|
25
|
+
| `for_tag(name)` | Elements whose name matches the given `name`
|
26
|
+
| `for_tags(names)`| Elements whose name is in the `names` Array
|
27
|
+
| `at_depth(n)` | Elements `n` levels deep inside the root of an xml document. The root element itself is `n = 0`
|
28
|
+
| `within(name)` | Elements nested anywhere within an element with the given `name`
|
29
|
+
| `child_of(name)` | Elements that are direct children of an element with the given `name`
|
30
|
+
| `with_attribute(name, value)` | Elements that have an attribute with a given `name` and `value`. If no `value` is given, matches any element with the specified attribute name present
|
32
31
|
|
33
32
|
|
34
33
|
Examples
|
@@ -70,20 +69,14 @@ books = bookshelf_contents.for_tag(:book)
|
|
70
69
|
magazines = bookshelf_contents.for_tag(:magazine)
|
71
70
|
|
72
71
|
books.each do |book|
|
73
|
-
#
|
72
|
+
# ...
|
74
73
|
end
|
75
74
|
|
76
75
|
magazines.each do |magazine|
|
77
|
-
#
|
76
|
+
# ...
|
78
77
|
end
|
79
78
|
```
|
80
79
|
|
81
|
-
Don't care about memory/streaming, you just want your xml in one big hash? Saxerator can do that too.
|
82
|
-
|
83
|
-
```ruby
|
84
|
-
parser.all # big, giant hash
|
85
|
-
```
|
86
|
-
|
87
80
|
Known Issues
|
88
81
|
------------
|
89
82
|
* JRuby closes the file stream at the end of parsing, therefore to perform multiple operations
|
data/lib/saxerator.rb
CHANGED
@@ -2,12 +2,12 @@ require 'saxerator/version'
|
|
2
2
|
|
3
3
|
require 'saxerator/full_document'
|
4
4
|
require 'saxerator/document_fragment'
|
5
|
-
require 'saxerator/
|
6
|
-
require 'saxerator/
|
5
|
+
require 'saxerator/string_element'
|
6
|
+
require 'saxerator/hash_element'
|
7
7
|
require 'saxerator/xml_node'
|
8
8
|
|
9
9
|
require 'saxerator/parser/accumulator'
|
10
|
-
require 'saxerator/parser/
|
10
|
+
require 'saxerator/parser/for_tags_latch'
|
11
11
|
require 'saxerator/parser/at_depth_latch'
|
12
12
|
require 'saxerator/parser/within_latch'
|
13
13
|
require 'saxerator/parser/latched_accumulator'
|
data/lib/saxerator/dsl.rb
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
module Saxerator
|
2
2
|
module DSL
|
3
3
|
def for_tag(tag)
|
4
|
-
|
4
|
+
for_tags([tag])
|
5
|
+
end
|
6
|
+
|
7
|
+
def for_tags(tags)
|
8
|
+
raise ArgumentError('#for_tags requires an Array argument') unless tags.is_a? Array
|
9
|
+
specify Parser::ForTagsLatch.new(tags.map(&:to_s))
|
5
10
|
end
|
6
11
|
|
7
12
|
def at_depth(depth)
|
@@ -1,18 +1,14 @@
|
|
1
|
-
require 'saxerator/parser/document_latch'
|
2
|
-
|
3
1
|
module Saxerator
|
4
2
|
module Parser
|
5
|
-
class ChildOfLatch <
|
3
|
+
class ChildOfLatch < Nokogiri::XML::SAX::Document
|
6
4
|
def initialize(name)
|
7
5
|
@name = name
|
8
6
|
@depths = []
|
9
|
-
@depth_within_element = 0
|
10
7
|
end
|
11
8
|
|
12
9
|
def start_element name, _
|
13
10
|
if depth_within_element > 0
|
14
11
|
increment_depth(1)
|
15
|
-
resolve_open_status
|
16
12
|
end
|
17
13
|
if @name == name
|
18
14
|
@depths.push 1
|
@@ -23,10 +19,13 @@ module Saxerator
|
|
23
19
|
if depth_within_element > 0
|
24
20
|
increment_depth(-1)
|
25
21
|
@depths.pop if @depths.last == 0
|
26
|
-
resolve_open_status
|
27
22
|
end
|
28
23
|
end
|
29
24
|
|
25
|
+
def open?
|
26
|
+
depth_within_element == 2
|
27
|
+
end
|
28
|
+
|
30
29
|
def increment_depth(amount)
|
31
30
|
@depths.map! { |depth| depth + amount }
|
32
31
|
end
|
@@ -34,14 +33,6 @@ module Saxerator
|
|
34
33
|
def depth_within_element
|
35
34
|
@depths.size > 0 ? @depths.last : 0
|
36
35
|
end
|
37
|
-
|
38
|
-
def resolve_open_status
|
39
|
-
if depth_within_element == 2
|
40
|
-
open
|
41
|
-
else
|
42
|
-
close
|
43
|
-
end
|
44
|
-
end
|
45
36
|
end
|
46
37
|
end
|
47
38
|
end
|
@@ -2,17 +2,17 @@ require 'saxerator/parser/document_latch'
|
|
2
2
|
|
3
3
|
module Saxerator
|
4
4
|
module Parser
|
5
|
-
class
|
6
|
-
def initialize(
|
7
|
-
@
|
5
|
+
class ForTagsLatch < DocumentLatch
|
6
|
+
def initialize(names)
|
7
|
+
@names = names
|
8
8
|
end
|
9
9
|
|
10
10
|
def start_element name, _
|
11
|
-
|
11
|
+
@names.include?(name) ? open : close
|
12
12
|
end
|
13
13
|
|
14
14
|
def end_element name
|
15
|
-
close if
|
15
|
+
close if @names.include?(name)
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
@@ -1,8 +1,6 @@
|
|
1
|
-
require 'saxerator/parser/document_latch'
|
2
|
-
|
3
1
|
module Saxerator
|
4
2
|
module Parser
|
5
|
-
class WithinLatch <
|
3
|
+
class WithinLatch < Nokogiri::XML::SAX::Document
|
6
4
|
def initialize(name)
|
7
5
|
@name = name
|
8
6
|
@depth_within_element = 0
|
@@ -11,16 +9,18 @@ module Saxerator
|
|
11
9
|
def start_element name, _
|
12
10
|
if name == @name || @depth_within_element > 0
|
13
11
|
@depth_within_element += 1
|
14
|
-
open if @depth_within_element == 2
|
15
12
|
end
|
16
13
|
end
|
17
14
|
|
18
15
|
def end_element _
|
19
16
|
if @depth_within_element > 0
|
20
17
|
@depth_within_element -= 1
|
21
|
-
close if @depth_within_element == 1
|
22
18
|
end
|
23
19
|
end
|
20
|
+
|
21
|
+
def open?
|
22
|
+
@depth_within_element > 1
|
23
|
+
end
|
24
24
|
end
|
25
25
|
end
|
26
26
|
end
|
data/lib/saxerator/version.rb
CHANGED
data/lib/saxerator/xml_node.rb
CHANGED
@@ -16,34 +16,31 @@ module Saxerator
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def to_s
|
19
|
-
|
20
|
-
string.attributes = @attributes
|
21
|
-
string
|
19
|
+
StringElement.new(@children.join, @name, @attributes)
|
22
20
|
end
|
23
21
|
|
24
22
|
def to_hash
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
name
|
33
|
-
|
34
|
-
|
35
|
-
if out[name]
|
36
|
-
if !out[name].is_a?(Array)
|
37
|
-
out[name] = [out[name]]
|
38
|
-
end
|
39
|
-
out[name] << element
|
40
|
-
else
|
41
|
-
out[name] = element
|
23
|
+
hash = HashElement.new(@name, @attributes)
|
24
|
+
|
25
|
+
@children.each do |child|
|
26
|
+
name = child.name
|
27
|
+
element = child.block_variable
|
28
|
+
|
29
|
+
if hash[name]
|
30
|
+
if !hash[name].is_a?(Array)
|
31
|
+
hash[name] = [hash[name]]
|
42
32
|
end
|
33
|
+
hash[name] << element
|
34
|
+
else
|
35
|
+
hash[name] = element
|
43
36
|
end
|
44
|
-
|
45
|
-
out
|
46
37
|
end
|
38
|
+
|
39
|
+
hash
|
40
|
+
end
|
41
|
+
|
42
|
+
def block_variable
|
43
|
+
@text ? to_s : to_hash
|
47
44
|
end
|
48
45
|
end
|
49
46
|
end
|
data/saxerator.gemspec
CHANGED
@@ -10,10 +10,8 @@ Gem::Specification.new do |s|
|
|
10
10
|
s.homepage = 'https://github.com/soulcutter/saxerator'
|
11
11
|
s.summary = 'A SAX-based XML-to-hash parser for parsing large files into manageable chunks'
|
12
12
|
s.description = <<-eos
|
13
|
-
Saxerator is a
|
14
|
-
|
15
|
-
This approach is ideal for large xml files containing a collection of elements that you can process
|
16
|
-
independently.
|
13
|
+
Saxerator is a streaming xml-to-hash parser designed for working with very large xml files by
|
14
|
+
giving you Enumerable access to manageable chunks of the document.
|
17
15
|
eos
|
18
16
|
s.license = 'MIT'
|
19
17
|
|
@@ -37,6 +35,5 @@ Gem::Specification.new do |s|
|
|
37
35
|
|
38
36
|
s.add_runtime_dependency 'nokogiri', '>= 1.4.0'
|
39
37
|
|
40
|
-
s.add_development_dependency '
|
41
|
-
s.add_development_dependency 'rspec'
|
38
|
+
s.add_development_dependency 'rspec', '>= 2.11.0'
|
42
39
|
end
|
@@ -14,7 +14,7 @@
|
|
14
14
|
<content type="html"><p>Airplanes are very large — this can present difficulty in digestion.</p></content>
|
15
15
|
<media:thumbnail url="http://www.gravatar.com/avatar/a9eb6ba22e482b71b266daadf9c9a080?s=80"/>
|
16
16
|
<author>
|
17
|
-
<name
|
17
|
+
<name><![CDATA[Soul<utter]]></name>
|
18
18
|
</author>
|
19
19
|
<contributor type="primary">
|
20
20
|
<name>Jane Doe</name>
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::FullDocument#all" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<blurbs>
|
9
|
+
<blurb>one</blurb>
|
10
|
+
<blurb>two</blurb>
|
11
|
+
<blurb>three</blurb>
|
12
|
+
<notablurb>four</notablurb>
|
13
|
+
</blurbs>
|
14
|
+
eos
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should allow you to parse an entire document" do
|
18
|
+
parser.all.should == {'blurb' => ['one', 'two', 'three'], 'notablurb' => 'four'}
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#at_depth" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<publications>
|
9
|
+
<book>
|
10
|
+
<name>How to eat an airplane</name>
|
11
|
+
<author>Leviticus Alabaster</author>
|
12
|
+
</book>
|
13
|
+
<book>
|
14
|
+
<name>To wallop a horse in the face</name>
|
15
|
+
<author>Jeanne Clarewood</author>
|
16
|
+
</book>
|
17
|
+
</publications>
|
18
|
+
eos
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should parse elements at the requested tag depth" do
|
22
|
+
parser.at_depth(2).inject([], :<<).should == [
|
23
|
+
'How to eat an airplane', 'Leviticus Alabaster',
|
24
|
+
'To wallop a horse in the face', 'Jeanne Clarewood'
|
25
|
+
]
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should work in combination with #for_tag" do
|
29
|
+
parser.at_depth(2).for_tag(:name).inject([], :<<).should == [
|
30
|
+
'How to eat an airplane',
|
31
|
+
'To wallop a horse in the face'
|
32
|
+
]
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#child_of" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<root>
|
9
|
+
<children>
|
10
|
+
<name>Rudy McMannis</name>
|
11
|
+
<children>
|
12
|
+
<name>Tom McMannis</name>
|
13
|
+
</children>
|
14
|
+
<grandchildren>
|
15
|
+
<name>Mildred Marston</name>
|
16
|
+
</grandchildren>
|
17
|
+
<name>Anne Welsh</name>
|
18
|
+
</children>
|
19
|
+
</root>
|
20
|
+
eos
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should only parse children of the specified tag" do
|
24
|
+
parser.child_of(:grandchildren).inject([], :<<).should == [
|
25
|
+
'Mildred Marston'
|
26
|
+
]
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should work in combination with #for_tag" do
|
30
|
+
parser.for_tag(:name).child_of(:children).inject([], :<<).should == [
|
31
|
+
'Rudy McMannis',
|
32
|
+
'Tom McMannis',
|
33
|
+
'Anne Welsh'
|
34
|
+
]
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#for_tag" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<blurbs>
|
9
|
+
<blurb>one</blurb>
|
10
|
+
<blurb>two</blurb>
|
11
|
+
<blurb>three</blurb>
|
12
|
+
<notablurb>four</notablurb>
|
13
|
+
</blurbs>
|
14
|
+
eos
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should only select the specified tag" do
|
18
|
+
parser.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#for_tags" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<blurbs>
|
9
|
+
<blurb1>one</blurb1>
|
10
|
+
<blurb2>two</blurb2>
|
11
|
+
<blurb3>three</blurb3>
|
12
|
+
</blurbs>
|
13
|
+
eos
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should only select the specified tags" do
|
17
|
+
parser.for_tags(%w(blurb1 blurb3)).inject([], :<<).should == ['one', 'three']
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#with_attribute" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<book>
|
9
|
+
<name>How to eat an airplane</name>
|
10
|
+
<author>
|
11
|
+
<name type="primary">Leviticus Alabaster</name>
|
12
|
+
<name type="foreword">Eunice Diesel</name>
|
13
|
+
</author>
|
14
|
+
</book>
|
15
|
+
eos
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should match tags with the specified attributes" do
|
19
|
+
subject.with_attribute(:type).inject([], :<<).should == [
|
20
|
+
'Leviticus Alabaster',
|
21
|
+
'Eunice Diesel'
|
22
|
+
]
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should match tags with the specified attributes" do
|
26
|
+
subject.with_attribute(:type, :primary).inject([], :<<).should == ['Leviticus Alabaster']
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#within" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<magazine>
|
9
|
+
<name>The Smarterest</name>
|
10
|
+
<article>
|
11
|
+
<name>Is our children learning?</name>
|
12
|
+
<author>Hazel Nutt</author>
|
13
|
+
</article>
|
14
|
+
</magazine>
|
15
|
+
eos
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should only parse elements nested within the specified tag" do
|
19
|
+
parser.within(:article).inject([], :<<).should == [
|
20
|
+
'Is our children learning?',
|
21
|
+
'Hazel Nutt'
|
22
|
+
]
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should work in combination with #for_tag" do
|
26
|
+
parser.for_tag(:name).within(:article).inject([], :<<).should == ['Is our children learning?']
|
27
|
+
end
|
28
|
+
end
|
data/spec/lib/saxerator_spec.rb
CHANGED
@@ -7,164 +7,69 @@ describe Saxerator do
|
|
7
7
|
File.new(File.join(File.dirname(__FILE__), '..', 'fixtures', name))
|
8
8
|
end
|
9
9
|
|
10
|
-
context "
|
11
|
-
subject { parser }
|
12
|
-
let(:parser) { Saxerator.parser(xml) }
|
10
|
+
context "#parser" do
|
11
|
+
subject(:parser) { Saxerator.parser(xml) }
|
13
12
|
|
14
|
-
context "with a
|
15
|
-
let(:xml)
|
16
|
-
<<-eos
|
17
|
-
<blurbs>
|
18
|
-
<blurb>one</blurb>
|
19
|
-
<blurb>two</blurb>
|
20
|
-
<blurb>three</blurb>
|
21
|
-
<notablurb>four</notablurb>
|
22
|
-
</blurbs>
|
23
|
-
eos
|
24
|
-
end
|
25
|
-
|
26
|
-
it "should parse simple strings" do
|
27
|
-
subject.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
|
28
|
-
end
|
13
|
+
context "with a File argument" do
|
14
|
+
let(:xml) { fixture_file('flat_blurbs.xml') }
|
29
15
|
|
30
|
-
it "should
|
31
|
-
|
16
|
+
it "should be able to parse it" do
|
17
|
+
parser.all.should == {'blurb' => ['one', 'two', 'three']}
|
32
18
|
end
|
33
19
|
|
34
|
-
it "should allow
|
35
|
-
|
20
|
+
it "should allow multiple operations on the same parser" do
|
21
|
+
# This exposes a bug where if a File is not reset only the first
|
22
|
+
# Enumerable method works as expected
|
23
|
+
parser.for_tag(:blurb).first.should == 'one'
|
24
|
+
parser.for_tag(:blurb).first.should == 'one'
|
36
25
|
end
|
37
26
|
end
|
38
27
|
|
39
|
-
context "with a
|
28
|
+
context "with a String argument" do
|
40
29
|
let(:xml) do
|
41
30
|
<<-eos
|
42
|
-
<
|
43
|
-
<
|
44
|
-
|
45
|
-
|
46
|
-
<name type="primary">Leviticus Alabaster</name>
|
47
|
-
<name type="foreword">Eunice Diesel</name>
|
48
|
-
</author>
|
49
|
-
</book>
|
50
|
-
<book>
|
51
|
-
<name>To wallop a horse in the face</name>
|
52
|
-
<author>
|
53
|
-
<name>Jeanne Clarewood</name>
|
54
|
-
</author>
|
55
|
-
</book>
|
56
|
-
<article>
|
57
|
-
<name>Is our children learning?</name>
|
58
|
-
<author>
|
59
|
-
<name>Hazel Nutt</name>
|
60
|
-
</author>
|
61
|
-
</article>
|
62
|
-
</publication>
|
31
|
+
<book>
|
32
|
+
<name>Illiterates that can read</name>
|
33
|
+
<author>Eunice Diesel</author>
|
34
|
+
</book>
|
63
35
|
eos
|
64
36
|
end
|
65
37
|
|
66
|
-
it "should
|
67
|
-
|
68
|
-
'How to eat an airplane', { 'name' => ['Leviticus Alabaster', 'Eunice Diesel'] },
|
69
|
-
'To wallop a horse in the face', { 'name' => 'Jeanne Clarewood' },
|
70
|
-
'Is our children learning?', { 'name' => 'Hazel Nutt' }
|
71
|
-
]
|
72
|
-
end
|
73
|
-
|
74
|
-
it "should only parse the requested tag depth and tag" do
|
75
|
-
subject.at_depth(2).for_tag(:name).inject([], :<<).should == [
|
76
|
-
'How to eat an airplane',
|
77
|
-
'To wallop a horse in the face',
|
78
|
-
'Is our children learning?'
|
79
|
-
]
|
80
|
-
end
|
81
|
-
|
82
|
-
it "should only parse tags nested inside the specified tag" do
|
83
|
-
subject.within(:article).inject([], :<<).should == [
|
84
|
-
'Is our children learning?',
|
85
|
-
{ 'name' => 'Hazel Nutt' }
|
86
|
-
]
|
87
|
-
end
|
88
|
-
|
89
|
-
it "should combine #for_tag and #within to parse the specified elements" do
|
90
|
-
subject.for_tag(:name).within(:article).inject([], :<<).should == [
|
91
|
-
'Is our children learning?',
|
92
|
-
'Hazel Nutt'
|
93
|
-
]
|
94
|
-
end
|
95
|
-
|
96
|
-
it "should match tags with the specified attributes" do
|
97
|
-
subject.with_attribute(:type).inject([], :<<).should == [
|
98
|
-
'Leviticus Alabaster',
|
99
|
-
'Eunice Diesel'
|
100
|
-
]
|
101
|
-
end
|
102
|
-
|
103
|
-
it "should match tags with the specified attributes" do
|
104
|
-
subject.with_attribute(:type, :primary).inject([], :<<).should == ['Leviticus Alabaster']
|
38
|
+
it "should be able to parse it" do
|
39
|
+
parser.all.should == { 'name' => 'Illiterates that can read', 'author' => 'Eunice Diesel' }
|
105
40
|
end
|
106
41
|
end
|
42
|
+
end
|
107
43
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
<root>
|
112
|
-
<children>
|
113
|
-
<name>Rudy McMannis</name>
|
114
|
-
<children>
|
115
|
-
<name>Tom McMannis</name>
|
116
|
-
</children>
|
117
|
-
<grandchildren>
|
118
|
-
<name>Mildred Marston</name>
|
119
|
-
</grandchildren>
|
120
|
-
<name>Anne Welsh</name>
|
121
|
-
</children>
|
122
|
-
</root>
|
123
|
-
eos
|
124
|
-
end
|
44
|
+
context "block_variable format" do
|
45
|
+
let(:xml) { fixture_file('nested_elements.xml') }
|
46
|
+
subject(:entry) { Saxerator.parser(xml).for_tag(:entry).first }
|
125
47
|
|
126
|
-
|
127
|
-
|
128
|
-
'Mildred Marston'
|
129
|
-
]
|
130
|
-
end
|
48
|
+
# string
|
49
|
+
specify { entry['title'].should == 'How to eat an airplane' }
|
131
50
|
|
132
|
-
|
133
|
-
|
134
|
-
'Rudy McMannis',
|
135
|
-
'Tom McMannis',
|
136
|
-
'Anne Welsh'
|
137
|
-
]
|
138
|
-
end
|
139
|
-
end
|
51
|
+
# hash and cdata inside name
|
52
|
+
specify { entry['author'].should == {'name' => 'Soul<utter'} }
|
140
53
|
|
141
|
-
|
142
|
-
|
54
|
+
# array of hashes
|
55
|
+
specify { entry['contributor'].should == [{'name' => 'Jane Doe'}, {'name' => 'Leviticus Alabaster'}] }
|
143
56
|
|
144
|
-
|
145
|
-
|
146
|
-
end
|
57
|
+
# attributes on a hash
|
58
|
+
specify { entry['contributor'][0].attributes['type'].should == 'primary' }
|
147
59
|
|
148
|
-
|
149
|
-
|
150
|
-
# Enumerable method works as expected
|
151
|
-
subject.for_tag(:blurb).first.should == 'one'
|
152
|
-
subject.for_tag(:blurb).first.should == 'one'
|
153
|
-
end
|
154
|
-
end
|
60
|
+
# attributes on a string
|
61
|
+
specify { entry['content'].attributes['type'].should == 'html' }
|
155
62
|
|
156
|
-
#
|
157
|
-
|
158
|
-
let(:xml) { fixture_file('nested_elements.xml') }
|
159
|
-
subject { parser.for_tag(:entry).first }
|
63
|
+
# name on a hash
|
64
|
+
specify { entry.name.should == 'entry' }
|
160
65
|
|
161
|
-
|
162
|
-
|
66
|
+
# name on a string
|
67
|
+
specify { entry['title'].name.should == 'title' }
|
163
68
|
|
164
|
-
|
165
|
-
|
69
|
+
# character entity decoding
|
70
|
+
specify { entry['content'].should == "<p>Airplanes are very large — this can present difficulty in digestion.</p>"}
|
166
71
|
|
167
|
-
|
168
|
-
|
72
|
+
# empty element
|
73
|
+
specify { entry['media:thumbnail'].should == {} }
|
169
74
|
end
|
170
75
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: saxerator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -27,22 +27,6 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: 1.4.0
|
30
|
-
- !ruby/object:Gem::Dependency
|
31
|
-
name: rake
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
|
-
requirements:
|
35
|
-
- - ! '>='
|
36
|
-
- !ruby/object:Gem::Version
|
37
|
-
version: '0'
|
38
|
-
type: :development
|
39
|
-
prerelease: false
|
40
|
-
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
|
-
requirements:
|
43
|
-
- - ! '>='
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
46
30
|
- !ruby/object:Gem::Dependency
|
47
31
|
name: rspec
|
48
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -50,7 +34,7 @@ dependencies:
|
|
50
34
|
requirements:
|
51
35
|
- - ! '>='
|
52
36
|
- !ruby/object:Gem::Version
|
53
|
-
version:
|
37
|
+
version: 2.11.0
|
54
38
|
type: :development
|
55
39
|
prerelease: false
|
56
40
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -58,12 +42,10 @@ dependencies:
|
|
58
42
|
requirements:
|
59
43
|
- - ! '>='
|
60
44
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
62
|
-
description: ! " Saxerator is a
|
63
|
-
very large files
|
64
|
-
|
65
|
-
document.\n This approach is ideal for large xml files containing a collection
|
66
|
-
of elements that you can process\n independently.\n"
|
45
|
+
version: 2.11.0
|
46
|
+
description: ! " Saxerator is a streaming xml-to-hash parser designed for working
|
47
|
+
with very large xml files by\n giving you Enumerable access to manageable chunks
|
48
|
+
of the document.\n"
|
67
49
|
email:
|
68
50
|
- bradley.schaefer@gmail.com
|
69
51
|
executables: []
|
@@ -80,21 +62,28 @@ files:
|
|
80
62
|
- lib/saxerator/document_fragment.rb
|
81
63
|
- lib/saxerator/dsl.rb
|
82
64
|
- lib/saxerator/full_document.rb
|
83
|
-
- lib/saxerator/
|
65
|
+
- lib/saxerator/hash_element.rb
|
84
66
|
- lib/saxerator/parser/accumulator.rb
|
85
67
|
- lib/saxerator/parser/at_depth_latch.rb
|
86
68
|
- lib/saxerator/parser/child_of_latch.rb
|
87
69
|
- lib/saxerator/parser/document_latch.rb
|
88
|
-
- lib/saxerator/parser/
|
70
|
+
- lib/saxerator/parser/for_tags_latch.rb
|
89
71
|
- lib/saxerator/parser/latched_accumulator.rb
|
90
72
|
- lib/saxerator/parser/with_attribute_latch.rb
|
91
73
|
- lib/saxerator/parser/within_latch.rb
|
92
|
-
- lib/saxerator/
|
74
|
+
- lib/saxerator/string_element.rb
|
93
75
|
- lib/saxerator/version.rb
|
94
76
|
- lib/saxerator/xml_node.rb
|
95
77
|
- lib/saxerator.rb
|
96
78
|
- spec/fixtures/flat_blurbs.xml
|
97
79
|
- spec/fixtures/nested_elements.xml
|
80
|
+
- spec/lib/dsl/all_spec.rb
|
81
|
+
- spec/lib/dsl/at_depth_spec.rb
|
82
|
+
- spec/lib/dsl/child_of_spec.rb
|
83
|
+
- spec/lib/dsl/for_tag_spec.rb
|
84
|
+
- spec/lib/dsl/for_tags_spec.rb
|
85
|
+
- spec/lib/dsl/with_attribute_spec.rb
|
86
|
+
- spec/lib/dsl/within_spec.rb
|
98
87
|
- spec/lib/saxerator_spec.rb
|
99
88
|
- spec/spec_helper.rb
|
100
89
|
- benchmark/benchmark.rb
|
@@ -127,5 +116,12 @@ summary: A SAX-based XML-to-hash parser for parsing large files into manageable
|
|
127
116
|
test_files:
|
128
117
|
- spec/fixtures/flat_blurbs.xml
|
129
118
|
- spec/fixtures/nested_elements.xml
|
119
|
+
- spec/lib/dsl/all_spec.rb
|
120
|
+
- spec/lib/dsl/at_depth_spec.rb
|
121
|
+
- spec/lib/dsl/child_of_spec.rb
|
122
|
+
- spec/lib/dsl/for_tag_spec.rb
|
123
|
+
- spec/lib/dsl/for_tags_spec.rb
|
124
|
+
- spec/lib/dsl/with_attribute_spec.rb
|
125
|
+
- spec/lib/dsl/within_spec.rb
|
130
126
|
- spec/lib/saxerator_spec.rb
|
131
127
|
- spec/spec_helper.rb
|