saxerator 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +1 -0
- data/README.md +13 -20
- data/lib/saxerator.rb +3 -3
- data/lib/saxerator/dsl.rb +6 -1
- data/lib/saxerator/hash_element.rb +11 -0
- data/lib/saxerator/parser/accumulator.rb +1 -1
- data/lib/saxerator/parser/at_depth_latch.rb +1 -3
- data/lib/saxerator/parser/child_of_latch.rb +5 -14
- data/lib/saxerator/parser/{for_tag_latch.rb → for_tags_latch.rb} +5 -5
- data/lib/saxerator/parser/within_latch.rb +5 -5
- data/lib/saxerator/string_element.rb +12 -0
- data/lib/saxerator/version.rb +1 -1
- data/lib/saxerator/xml_node.rb +19 -22
- data/saxerator.gemspec +3 -6
- data/spec/fixtures/nested_elements.xml +1 -1
- data/spec/lib/dsl/all_spec.rb +20 -0
- data/spec/lib/dsl/at_depth_spec.rb +34 -0
- data/spec/lib/dsl/child_of_spec.rb +36 -0
- data/spec/lib/dsl/for_tag_spec.rb +20 -0
- data/spec/lib/dsl/for_tags_spec.rb +19 -0
- data/spec/lib/dsl/with_attribute_spec.rb +28 -0
- data/spec/lib/dsl/within_spec.rb +28 -0
- data/spec/lib/saxerator_spec.rb +40 -135
- metadata +24 -28
- data/lib/saxerator/hash_with_attributes.rb +0 -5
- data/lib/saxerator/string_with_attributes.rb +0 -5
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,10 +1,8 @@
|
|
1
1
|
Saxerator [Build Status](http://travis-ci.org/soulcutter/saxerator)
|
2
2
|
=========
|
3
3
|
|
4
|
-
Saxerator is a
|
5
|
-
|
6
|
-
This approach is ideal for large xml files containing a collection of elements that you can process
|
7
|
-
independently.
|
4
|
+
Saxerator is a streaming xml-to-hash parser designed for working with very large xml files by
|
5
|
+
giving you Enumerable access to manageable chunks of the document.
|
8
6
|
|
9
7
|
Each xml chunk is parsed into a JSON-like Ruby Hash structure for consumption.
|
10
8
|
|
@@ -21,14 +19,15 @@ The DSL consists of predicates that may be combined to describe which elements t
|
|
21
19
|
Saxerator will only enumerate over chunks of xml that match all of the combined predicates (see Examples section
|
22
20
|
for added clarity).
|
23
21
|
|
24
|
-
| Predicate
|
25
|
-
|
26
|
-
| `all`
|
27
|
-
| `for_tag(name)`
|
28
|
-
| `
|
29
|
-
| `
|
30
|
-
| `
|
31
|
-
| `
|
22
|
+
| Predicate | Explanation |
|
23
|
+
|:-----------------|:------------|
|
24
|
+
| `all` | Returns the entire document parsed into a hash. Cannot combine with other predicates
|
25
|
+
| `for_tag(name)` | Elements whose name matches the given `name`
|
26
|
+
| `for_tags(names)`| Elements whose name is in the `names` Array
|
27
|
+
| `at_depth(n)` | Elements `n` levels deep inside the root of an xml document. The root element itself is `n = 0`
|
28
|
+
| `within(name)` | Elements nested anywhere within an element with the given `name`
|
29
|
+
| `child_of(name)` | Elements that are direct children of an element with the given `name`
|
30
|
+
| `with_attribute(name, value)` | Elements that have an attribute with a given `name` and `value`. If no `value` is given, matches any element with the specified attribute name present
|
32
31
|
|
33
32
|
|
34
33
|
Examples
|
@@ -70,20 +69,14 @@ books = bookshelf_contents.for_tag(:book)
|
|
70
69
|
magazines = bookshelf_contents.for_tag(:magazine)
|
71
70
|
|
72
71
|
books.each do |book|
|
73
|
-
#
|
72
|
+
# ...
|
74
73
|
end
|
75
74
|
|
76
75
|
magazines.each do |magazine|
|
77
|
-
#
|
76
|
+
# ...
|
78
77
|
end
|
79
78
|
```
|
80
79
|
|
81
|
-
Don't care about memory/streaming, you just want your xml in one big hash? Saxerator can do that too.
|
82
|
-
|
83
|
-
```ruby
|
84
|
-
parser.all # big, giant hash
|
85
|
-
```
|
86
|
-
|
87
80
|
Known Issues
|
88
81
|
------------
|
89
82
|
* JRuby closes the file stream at the end of parsing, therefore to perform multiple operations
|
data/lib/saxerator.rb
CHANGED
@@ -2,12 +2,12 @@ require 'saxerator/version'
|
|
2
2
|
|
3
3
|
require 'saxerator/full_document'
|
4
4
|
require 'saxerator/document_fragment'
|
5
|
-
require 'saxerator/
|
6
|
-
require 'saxerator/
|
5
|
+
require 'saxerator/string_element'
|
6
|
+
require 'saxerator/hash_element'
|
7
7
|
require 'saxerator/xml_node'
|
8
8
|
|
9
9
|
require 'saxerator/parser/accumulator'
|
10
|
-
require 'saxerator/parser/
|
10
|
+
require 'saxerator/parser/for_tags_latch'
|
11
11
|
require 'saxerator/parser/at_depth_latch'
|
12
12
|
require 'saxerator/parser/within_latch'
|
13
13
|
require 'saxerator/parser/latched_accumulator'
|
data/lib/saxerator/dsl.rb
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
module Saxerator
|
2
2
|
module DSL
|
3
3
|
def for_tag(tag)
|
4
|
-
|
4
|
+
for_tags([tag])
|
5
|
+
end
|
6
|
+
|
7
|
+
def for_tags(tags)
|
8
|
+
raise ArgumentError('#for_tags requires an Array argument') unless tags.is_a? Array
|
9
|
+
specify Parser::ForTagsLatch.new(tags.map(&:to_s))
|
5
10
|
end
|
6
11
|
|
7
12
|
def at_depth(depth)
|
@@ -1,18 +1,14 @@
|
|
1
|
-
require 'saxerator/parser/document_latch'
|
2
|
-
|
3
1
|
module Saxerator
|
4
2
|
module Parser
|
5
|
-
class ChildOfLatch <
|
3
|
+
class ChildOfLatch < Nokogiri::XML::SAX::Document
|
6
4
|
def initialize(name)
|
7
5
|
@name = name
|
8
6
|
@depths = []
|
9
|
-
@depth_within_element = 0
|
10
7
|
end
|
11
8
|
|
12
9
|
def start_element name, _
|
13
10
|
if depth_within_element > 0
|
14
11
|
increment_depth(1)
|
15
|
-
resolve_open_status
|
16
12
|
end
|
17
13
|
if @name == name
|
18
14
|
@depths.push 1
|
@@ -23,10 +19,13 @@ module Saxerator
|
|
23
19
|
if depth_within_element > 0
|
24
20
|
increment_depth(-1)
|
25
21
|
@depths.pop if @depths.last == 0
|
26
|
-
resolve_open_status
|
27
22
|
end
|
28
23
|
end
|
29
24
|
|
25
|
+
def open?
|
26
|
+
depth_within_element == 2
|
27
|
+
end
|
28
|
+
|
30
29
|
def increment_depth(amount)
|
31
30
|
@depths.map! { |depth| depth + amount }
|
32
31
|
end
|
@@ -34,14 +33,6 @@ module Saxerator
|
|
34
33
|
def depth_within_element
|
35
34
|
@depths.size > 0 ? @depths.last : 0
|
36
35
|
end
|
37
|
-
|
38
|
-
def resolve_open_status
|
39
|
-
if depth_within_element == 2
|
40
|
-
open
|
41
|
-
else
|
42
|
-
close
|
43
|
-
end
|
44
|
-
end
|
45
36
|
end
|
46
37
|
end
|
47
38
|
end
|
@@ -2,17 +2,17 @@ require 'saxerator/parser/document_latch'
|
|
2
2
|
|
3
3
|
module Saxerator
|
4
4
|
module Parser
|
5
|
-
class
|
6
|
-
def initialize(
|
7
|
-
@
|
5
|
+
class ForTagsLatch < DocumentLatch
|
6
|
+
def initialize(names)
|
7
|
+
@names = names
|
8
8
|
end
|
9
9
|
|
10
10
|
def start_element name, _
|
11
|
-
|
11
|
+
@names.include?(name) ? open : close
|
12
12
|
end
|
13
13
|
|
14
14
|
def end_element name
|
15
|
-
close if
|
15
|
+
close if @names.include?(name)
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
@@ -1,8 +1,6 @@
|
|
1
|
-
require 'saxerator/parser/document_latch'
|
2
|
-
|
3
1
|
module Saxerator
|
4
2
|
module Parser
|
5
|
-
class WithinLatch <
|
3
|
+
class WithinLatch < Nokogiri::XML::SAX::Document
|
6
4
|
def initialize(name)
|
7
5
|
@name = name
|
8
6
|
@depth_within_element = 0
|
@@ -11,16 +9,18 @@ module Saxerator
|
|
11
9
|
def start_element name, _
|
12
10
|
if name == @name || @depth_within_element > 0
|
13
11
|
@depth_within_element += 1
|
14
|
-
open if @depth_within_element == 2
|
15
12
|
end
|
16
13
|
end
|
17
14
|
|
18
15
|
def end_element _
|
19
16
|
if @depth_within_element > 0
|
20
17
|
@depth_within_element -= 1
|
21
|
-
close if @depth_within_element == 1
|
22
18
|
end
|
23
19
|
end
|
20
|
+
|
21
|
+
def open?
|
22
|
+
@depth_within_element > 1
|
23
|
+
end
|
24
24
|
end
|
25
25
|
end
|
26
26
|
end
|
data/lib/saxerator/version.rb
CHANGED
data/lib/saxerator/xml_node.rb
CHANGED
@@ -16,34 +16,31 @@ module Saxerator
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def to_s
|
19
|
-
|
20
|
-
string.attributes = @attributes
|
21
|
-
string
|
19
|
+
StringElement.new(@children.join, @name, @attributes)
|
22
20
|
end
|
23
21
|
|
24
22
|
def to_hash
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
name
|
33
|
-
|
34
|
-
|
35
|
-
if out[name]
|
36
|
-
if !out[name].is_a?(Array)
|
37
|
-
out[name] = [out[name]]
|
38
|
-
end
|
39
|
-
out[name] << element
|
40
|
-
else
|
41
|
-
out[name] = element
|
23
|
+
hash = HashElement.new(@name, @attributes)
|
24
|
+
|
25
|
+
@children.each do |child|
|
26
|
+
name = child.name
|
27
|
+
element = child.block_variable
|
28
|
+
|
29
|
+
if hash[name]
|
30
|
+
if !hash[name].is_a?(Array)
|
31
|
+
hash[name] = [hash[name]]
|
42
32
|
end
|
33
|
+
hash[name] << element
|
34
|
+
else
|
35
|
+
hash[name] = element
|
43
36
|
end
|
44
|
-
|
45
|
-
out
|
46
37
|
end
|
38
|
+
|
39
|
+
hash
|
40
|
+
end
|
41
|
+
|
42
|
+
def block_variable
|
43
|
+
@text ? to_s : to_hash
|
47
44
|
end
|
48
45
|
end
|
49
46
|
end
|
data/saxerator.gemspec
CHANGED
@@ -10,10 +10,8 @@ Gem::Specification.new do |s|
|
|
10
10
|
s.homepage = 'https://github.com/soulcutter/saxerator'
|
11
11
|
s.summary = 'A SAX-based XML-to-hash parser for parsing large files into manageable chunks'
|
12
12
|
s.description = <<-eos
|
13
|
-
Saxerator is a
|
14
|
-
|
15
|
-
This approach is ideal for large xml files containing a collection of elements that you can process
|
16
|
-
independently.
|
13
|
+
Saxerator is a streaming xml-to-hash parser designed for working with very large xml files by
|
14
|
+
giving you Enumerable access to manageable chunks of the document.
|
17
15
|
eos
|
18
16
|
s.license = 'MIT'
|
19
17
|
|
@@ -37,6 +35,5 @@ Gem::Specification.new do |s|
|
|
37
35
|
|
38
36
|
s.add_runtime_dependency 'nokogiri', '>= 1.4.0'
|
39
37
|
|
40
|
-
s.add_development_dependency '
|
41
|
-
s.add_development_dependency 'rspec'
|
38
|
+
s.add_development_dependency 'rspec', '>= 2.11.0'
|
42
39
|
end
|
@@ -14,7 +14,7 @@
|
|
14
14
|
<content type="html"><p>Airplanes are very large — this can present difficulty in digestion.</p></content>
|
15
15
|
<media:thumbnail url="http://www.gravatar.com/avatar/a9eb6ba22e482b71b266daadf9c9a080?s=80"/>
|
16
16
|
<author>
|
17
|
-
<name
|
17
|
+
<name><![CDATA[Soul<utter]]></name>
|
18
18
|
</author>
|
19
19
|
<contributor type="primary">
|
20
20
|
<name>Jane Doe</name>
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::FullDocument#all" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<blurbs>
|
9
|
+
<blurb>one</blurb>
|
10
|
+
<blurb>two</blurb>
|
11
|
+
<blurb>three</blurb>
|
12
|
+
<notablurb>four</notablurb>
|
13
|
+
</blurbs>
|
14
|
+
eos
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should allow you to parse an entire document" do
|
18
|
+
parser.all.should == {'blurb' => ['one', 'two', 'three'], 'notablurb' => 'four'}
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#at_depth" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<publications>
|
9
|
+
<book>
|
10
|
+
<name>How to eat an airplane</name>
|
11
|
+
<author>Leviticus Alabaster</author>
|
12
|
+
</book>
|
13
|
+
<book>
|
14
|
+
<name>To wallop a horse in the face</name>
|
15
|
+
<author>Jeanne Clarewood</author>
|
16
|
+
</book>
|
17
|
+
</publications>
|
18
|
+
eos
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should parse elements at the requested tag depth" do
|
22
|
+
parser.at_depth(2).inject([], :<<).should == [
|
23
|
+
'How to eat an airplane', 'Leviticus Alabaster',
|
24
|
+
'To wallop a horse in the face', 'Jeanne Clarewood'
|
25
|
+
]
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should work in combination with #for_tag" do
|
29
|
+
parser.at_depth(2).for_tag(:name).inject([], :<<).should == [
|
30
|
+
'How to eat an airplane',
|
31
|
+
'To wallop a horse in the face'
|
32
|
+
]
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#child_of" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<root>
|
9
|
+
<children>
|
10
|
+
<name>Rudy McMannis</name>
|
11
|
+
<children>
|
12
|
+
<name>Tom McMannis</name>
|
13
|
+
</children>
|
14
|
+
<grandchildren>
|
15
|
+
<name>Mildred Marston</name>
|
16
|
+
</grandchildren>
|
17
|
+
<name>Anne Welsh</name>
|
18
|
+
</children>
|
19
|
+
</root>
|
20
|
+
eos
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should only parse children of the specified tag" do
|
24
|
+
parser.child_of(:grandchildren).inject([], :<<).should == [
|
25
|
+
'Mildred Marston'
|
26
|
+
]
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should work in combination with #for_tag" do
|
30
|
+
parser.for_tag(:name).child_of(:children).inject([], :<<).should == [
|
31
|
+
'Rudy McMannis',
|
32
|
+
'Tom McMannis',
|
33
|
+
'Anne Welsh'
|
34
|
+
]
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#for_tag" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<blurbs>
|
9
|
+
<blurb>one</blurb>
|
10
|
+
<blurb>two</blurb>
|
11
|
+
<blurb>three</blurb>
|
12
|
+
<notablurb>four</notablurb>
|
13
|
+
</blurbs>
|
14
|
+
eos
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should only select the specified tag" do
|
18
|
+
parser.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#for_tags" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<blurbs>
|
9
|
+
<blurb1>one</blurb1>
|
10
|
+
<blurb2>two</blurb2>
|
11
|
+
<blurb3>three</blurb3>
|
12
|
+
</blurbs>
|
13
|
+
eos
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should only select the specified tags" do
|
17
|
+
parser.for_tags(%w(blurb1 blurb3)).inject([], :<<).should == ['one', 'three']
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#with_attribute" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<book>
|
9
|
+
<name>How to eat an airplane</name>
|
10
|
+
<author>
|
11
|
+
<name type="primary">Leviticus Alabaster</name>
|
12
|
+
<name type="foreword">Eunice Diesel</name>
|
13
|
+
</author>
|
14
|
+
</book>
|
15
|
+
eos
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should match tags with the specified attributes" do
|
19
|
+
subject.with_attribute(:type).inject([], :<<).should == [
|
20
|
+
'Leviticus Alabaster',
|
21
|
+
'Eunice Diesel'
|
22
|
+
]
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should match tags with the specified attributes" do
|
26
|
+
subject.with_attribute(:type, :primary).inject([], :<<).should == ['Leviticus Alabaster']
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe "Saxerator::DSL#within" do
|
4
|
+
subject(:parser) { Saxerator.parser(xml) }
|
5
|
+
|
6
|
+
let(:xml) do
|
7
|
+
<<-eos
|
8
|
+
<magazine>
|
9
|
+
<name>The Smarterest</name>
|
10
|
+
<article>
|
11
|
+
<name>Is our children learning?</name>
|
12
|
+
<author>Hazel Nutt</author>
|
13
|
+
</article>
|
14
|
+
</magazine>
|
15
|
+
eos
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should only parse elements nested within the specified tag" do
|
19
|
+
parser.within(:article).inject([], :<<).should == [
|
20
|
+
'Is our children learning?',
|
21
|
+
'Hazel Nutt'
|
22
|
+
]
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should work in combination with #for_tag" do
|
26
|
+
parser.for_tag(:name).within(:article).inject([], :<<).should == ['Is our children learning?']
|
27
|
+
end
|
28
|
+
end
|
data/spec/lib/saxerator_spec.rb
CHANGED
@@ -7,164 +7,69 @@ describe Saxerator do
|
|
7
7
|
File.new(File.join(File.dirname(__FILE__), '..', 'fixtures', name))
|
8
8
|
end
|
9
9
|
|
10
|
-
context "
|
11
|
-
subject { parser }
|
12
|
-
let(:parser) { Saxerator.parser(xml) }
|
10
|
+
context "#parser" do
|
11
|
+
subject(:parser) { Saxerator.parser(xml) }
|
13
12
|
|
14
|
-
context "with a
|
15
|
-
let(:xml)
|
16
|
-
<<-eos
|
17
|
-
<blurbs>
|
18
|
-
<blurb>one</blurb>
|
19
|
-
<blurb>two</blurb>
|
20
|
-
<blurb>three</blurb>
|
21
|
-
<notablurb>four</notablurb>
|
22
|
-
</blurbs>
|
23
|
-
eos
|
24
|
-
end
|
25
|
-
|
26
|
-
it "should parse simple strings" do
|
27
|
-
subject.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
|
28
|
-
end
|
13
|
+
context "with a File argument" do
|
14
|
+
let(:xml) { fixture_file('flat_blurbs.xml') }
|
29
15
|
|
30
|
-
it "should
|
31
|
-
|
16
|
+
it "should be able to parse it" do
|
17
|
+
parser.all.should == {'blurb' => ['one', 'two', 'three']}
|
32
18
|
end
|
33
19
|
|
34
|
-
it "should allow
|
35
|
-
|
20
|
+
it "should allow multiple operations on the same parser" do
|
21
|
+
# This exposes a bug where if a File is not reset only the first
|
22
|
+
# Enumerable method works as expected
|
23
|
+
parser.for_tag(:blurb).first.should == 'one'
|
24
|
+
parser.for_tag(:blurb).first.should == 'one'
|
36
25
|
end
|
37
26
|
end
|
38
27
|
|
39
|
-
context "with a
|
28
|
+
context "with a String argument" do
|
40
29
|
let(:xml) do
|
41
30
|
<<-eos
|
42
|
-
<
|
43
|
-
<
|
44
|
-
|
45
|
-
|
46
|
-
<name type="primary">Leviticus Alabaster</name>
|
47
|
-
<name type="foreword">Eunice Diesel</name>
|
48
|
-
</author>
|
49
|
-
</book>
|
50
|
-
<book>
|
51
|
-
<name>To wallop a horse in the face</name>
|
52
|
-
<author>
|
53
|
-
<name>Jeanne Clarewood</name>
|
54
|
-
</author>
|
55
|
-
</book>
|
56
|
-
<article>
|
57
|
-
<name>Is our children learning?</name>
|
58
|
-
<author>
|
59
|
-
<name>Hazel Nutt</name>
|
60
|
-
</author>
|
61
|
-
</article>
|
62
|
-
</publication>
|
31
|
+
<book>
|
32
|
+
<name>Illiterates that can read</name>
|
33
|
+
<author>Eunice Diesel</author>
|
34
|
+
</book>
|
63
35
|
eos
|
64
36
|
end
|
65
37
|
|
66
|
-
it "should
|
67
|
-
|
68
|
-
'How to eat an airplane', { 'name' => ['Leviticus Alabaster', 'Eunice Diesel'] },
|
69
|
-
'To wallop a horse in the face', { 'name' => 'Jeanne Clarewood' },
|
70
|
-
'Is our children learning?', { 'name' => 'Hazel Nutt' }
|
71
|
-
]
|
72
|
-
end
|
73
|
-
|
74
|
-
it "should only parse the requested tag depth and tag" do
|
75
|
-
subject.at_depth(2).for_tag(:name).inject([], :<<).should == [
|
76
|
-
'How to eat an airplane',
|
77
|
-
'To wallop a horse in the face',
|
78
|
-
'Is our children learning?'
|
79
|
-
]
|
80
|
-
end
|
81
|
-
|
82
|
-
it "should only parse tags nested inside the specified tag" do
|
83
|
-
subject.within(:article).inject([], :<<).should == [
|
84
|
-
'Is our children learning?',
|
85
|
-
{ 'name' => 'Hazel Nutt' }
|
86
|
-
]
|
87
|
-
end
|
88
|
-
|
89
|
-
it "should combine #for_tag and #within to parse the specified elements" do
|
90
|
-
subject.for_tag(:name).within(:article).inject([], :<<).should == [
|
91
|
-
'Is our children learning?',
|
92
|
-
'Hazel Nutt'
|
93
|
-
]
|
94
|
-
end
|
95
|
-
|
96
|
-
it "should match tags with the specified attributes" do
|
97
|
-
subject.with_attribute(:type).inject([], :<<).should == [
|
98
|
-
'Leviticus Alabaster',
|
99
|
-
'Eunice Diesel'
|
100
|
-
]
|
101
|
-
end
|
102
|
-
|
103
|
-
it "should match tags with the specified attributes" do
|
104
|
-
subject.with_attribute(:type, :primary).inject([], :<<).should == ['Leviticus Alabaster']
|
38
|
+
it "should be able to parse it" do
|
39
|
+
parser.all.should == { 'name' => 'Illiterates that can read', 'author' => 'Eunice Diesel' }
|
105
40
|
end
|
106
41
|
end
|
42
|
+
end
|
107
43
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
<root>
|
112
|
-
<children>
|
113
|
-
<name>Rudy McMannis</name>
|
114
|
-
<children>
|
115
|
-
<name>Tom McMannis</name>
|
116
|
-
</children>
|
117
|
-
<grandchildren>
|
118
|
-
<name>Mildred Marston</name>
|
119
|
-
</grandchildren>
|
120
|
-
<name>Anne Welsh</name>
|
121
|
-
</children>
|
122
|
-
</root>
|
123
|
-
eos
|
124
|
-
end
|
44
|
+
context "block_variable format" do
|
45
|
+
let(:xml) { fixture_file('nested_elements.xml') }
|
46
|
+
subject(:entry) { Saxerator.parser(xml).for_tag(:entry).first }
|
125
47
|
|
126
|
-
|
127
|
-
|
128
|
-
'Mildred Marston'
|
129
|
-
]
|
130
|
-
end
|
48
|
+
# string
|
49
|
+
specify { entry['title'].should == 'How to eat an airplane' }
|
131
50
|
|
132
|
-
|
133
|
-
|
134
|
-
'Rudy McMannis',
|
135
|
-
'Tom McMannis',
|
136
|
-
'Anne Welsh'
|
137
|
-
]
|
138
|
-
end
|
139
|
-
end
|
51
|
+
# hash and cdata inside name
|
52
|
+
specify { entry['author'].should == {'name' => 'Soul<utter'} }
|
140
53
|
|
141
|
-
|
142
|
-
|
54
|
+
# array of hashes
|
55
|
+
specify { entry['contributor'].should == [{'name' => 'Jane Doe'}, {'name' => 'Leviticus Alabaster'}] }
|
143
56
|
|
144
|
-
|
145
|
-
|
146
|
-
end
|
57
|
+
# attributes on a hash
|
58
|
+
specify { entry['contributor'][0].attributes['type'].should == 'primary' }
|
147
59
|
|
148
|
-
|
149
|
-
|
150
|
-
# Enumerable method works as expected
|
151
|
-
subject.for_tag(:blurb).first.should == 'one'
|
152
|
-
subject.for_tag(:blurb).first.should == 'one'
|
153
|
-
end
|
154
|
-
end
|
60
|
+
# attributes on a string
|
61
|
+
specify { entry['content'].attributes['type'].should == 'html' }
|
155
62
|
|
156
|
-
#
|
157
|
-
|
158
|
-
let(:xml) { fixture_file('nested_elements.xml') }
|
159
|
-
subject { parser.for_tag(:entry).first }
|
63
|
+
# name on a hash
|
64
|
+
specify { entry.name.should == 'entry' }
|
160
65
|
|
161
|
-
|
162
|
-
|
66
|
+
# name on a string
|
67
|
+
specify { entry['title'].name.should == 'title' }
|
163
68
|
|
164
|
-
|
165
|
-
|
69
|
+
# character entity decoding
|
70
|
+
specify { entry['content'].should == "<p>Airplanes are very large — this can present difficulty in digestion.</p>"}
|
166
71
|
|
167
|
-
|
168
|
-
|
72
|
+
# empty element
|
73
|
+
specify { entry['media:thumbnail'].should == {} }
|
169
74
|
end
|
170
75
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: saxerator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -27,22 +27,6 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: 1.4.0
|
30
|
-
- !ruby/object:Gem::Dependency
|
31
|
-
name: rake
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
|
-
requirements:
|
35
|
-
- - ! '>='
|
36
|
-
- !ruby/object:Gem::Version
|
37
|
-
version: '0'
|
38
|
-
type: :development
|
39
|
-
prerelease: false
|
40
|
-
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
|
-
requirements:
|
43
|
-
- - ! '>='
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
46
30
|
- !ruby/object:Gem::Dependency
|
47
31
|
name: rspec
|
48
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -50,7 +34,7 @@ dependencies:
|
|
50
34
|
requirements:
|
51
35
|
- - ! '>='
|
52
36
|
- !ruby/object:Gem::Version
|
53
|
-
version:
|
37
|
+
version: 2.11.0
|
54
38
|
type: :development
|
55
39
|
prerelease: false
|
56
40
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -58,12 +42,10 @@ dependencies:
|
|
58
42
|
requirements:
|
59
43
|
- - ! '>='
|
60
44
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
62
|
-
description: ! " Saxerator is a
|
63
|
-
very large files
|
64
|
-
|
65
|
-
document.\n This approach is ideal for large xml files containing a collection
|
66
|
-
of elements that you can process\n independently.\n"
|
45
|
+
version: 2.11.0
|
46
|
+
description: ! " Saxerator is a streaming xml-to-hash parser designed for working
|
47
|
+
with very large xml files by\n giving you Enumerable access to manageable chunks
|
48
|
+
of the document.\n"
|
67
49
|
email:
|
68
50
|
- bradley.schaefer@gmail.com
|
69
51
|
executables: []
|
@@ -80,21 +62,28 @@ files:
|
|
80
62
|
- lib/saxerator/document_fragment.rb
|
81
63
|
- lib/saxerator/dsl.rb
|
82
64
|
- lib/saxerator/full_document.rb
|
83
|
-
- lib/saxerator/
|
65
|
+
- lib/saxerator/hash_element.rb
|
84
66
|
- lib/saxerator/parser/accumulator.rb
|
85
67
|
- lib/saxerator/parser/at_depth_latch.rb
|
86
68
|
- lib/saxerator/parser/child_of_latch.rb
|
87
69
|
- lib/saxerator/parser/document_latch.rb
|
88
|
-
- lib/saxerator/parser/
|
70
|
+
- lib/saxerator/parser/for_tags_latch.rb
|
89
71
|
- lib/saxerator/parser/latched_accumulator.rb
|
90
72
|
- lib/saxerator/parser/with_attribute_latch.rb
|
91
73
|
- lib/saxerator/parser/within_latch.rb
|
92
|
-
- lib/saxerator/
|
74
|
+
- lib/saxerator/string_element.rb
|
93
75
|
- lib/saxerator/version.rb
|
94
76
|
- lib/saxerator/xml_node.rb
|
95
77
|
- lib/saxerator.rb
|
96
78
|
- spec/fixtures/flat_blurbs.xml
|
97
79
|
- spec/fixtures/nested_elements.xml
|
80
|
+
- spec/lib/dsl/all_spec.rb
|
81
|
+
- spec/lib/dsl/at_depth_spec.rb
|
82
|
+
- spec/lib/dsl/child_of_spec.rb
|
83
|
+
- spec/lib/dsl/for_tag_spec.rb
|
84
|
+
- spec/lib/dsl/for_tags_spec.rb
|
85
|
+
- spec/lib/dsl/with_attribute_spec.rb
|
86
|
+
- spec/lib/dsl/within_spec.rb
|
98
87
|
- spec/lib/saxerator_spec.rb
|
99
88
|
- spec/spec_helper.rb
|
100
89
|
- benchmark/benchmark.rb
|
@@ -127,5 +116,12 @@ summary: A SAX-based XML-to-hash parser for parsing large files into manageable
|
|
127
116
|
test_files:
|
128
117
|
- spec/fixtures/flat_blurbs.xml
|
129
118
|
- spec/fixtures/nested_elements.xml
|
119
|
+
- spec/lib/dsl/all_spec.rb
|
120
|
+
- spec/lib/dsl/at_depth_spec.rb
|
121
|
+
- spec/lib/dsl/child_of_spec.rb
|
122
|
+
- spec/lib/dsl/for_tag_spec.rb
|
123
|
+
- spec/lib/dsl/for_tags_spec.rb
|
124
|
+
- spec/lib/dsl/with_attribute_spec.rb
|
125
|
+
- spec/lib/dsl/within_spec.rb
|
130
126
|
- spec/lib/saxerator_spec.rb
|
131
127
|
- spec/spec_helper.rb
|