saxerator 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -5,4 +5,4 @@ pkg/*
5
5
  .DS_Store
6
6
  .idea/
7
7
  coverage/
8
-
8
+ .rvmrc
@@ -1,5 +1,5 @@
1
1
  language: ruby
2
- bundler_args: --without coverage benchmark
2
+ bundler_args: --without coverage
3
3
  rvm:
4
4
  - 1.8.7
5
5
  - 1.9.2
data/Gemfile CHANGED
@@ -3,10 +3,6 @@ source "http://rubygems.org"
3
3
  # Specify your gem's dependencies in saxerator.gemspec
4
4
  gemspec
5
5
 
6
- group :benchmark do
7
- gem 'ipsum'
8
- end
9
-
10
6
  group :coverage do
11
7
  gem 'simplecov'
12
8
  end
data/README.md CHANGED
@@ -40,9 +40,26 @@ primary_authors = parser.for_tag(:author).select { |author| author.attributes['t
40
40
  You can combine predicates to isolate just the tags you want.
41
41
 
42
42
  ```ruby
43
- parser.for_tag(:name).each { |x| all_the_names_in_a_document << x }
44
- parser.for_tag(:name).at_depth(2).each { |x| names_nested_under_document_root << x }
45
- parser.for_tag(:name).within(:author).each { |x| author_names << x }
43
+ require 'saxerator'
44
+
45
+ parser = Saxerator.parser(bookshelf_xml)
46
+
47
+ # You can chain predicates
48
+ parser.for_tag(:name).within(:book).each { |book_name| puts book_name }
49
+
50
+ # You can re-use intermediary predicates
51
+ bookshelf_contents = parser.within(:bookshelf)
52
+
53
+ books = bookshelf_contents.for_tag(:book)
54
+ magazines = bookshelf_contents.for_tag(:magazine)
55
+
56
+ books.each do |book|
57
+ # Some processing on a book
58
+ end
59
+
60
+ magazines.each do |magazine|
61
+ # Do some work with a magazine
62
+ end
46
63
  ```
47
64
 
48
65
  Don't care about memory/streaming, you just want your xml in one big hash? Saxerator can do that too.
@@ -75,7 +92,7 @@ Why not DOM parsing?
75
92
  > document is very large, this can be an important consideration.
76
93
 
77
94
  ### Acknowledgements ###
78
- Saxerator was inspired by [Nori](https://github.com/rubiii/nori) and [Gregory Brown](http://majesticseacreature.com/)'s
95
+ Saxerator was inspired by - but not affiliated with - [nori](https://github.com/rubiii/nori) and [Gregory Brown](http://majesticseacreature.com/)'s
79
96
  [Practicing Ruby](http://practicingruby.com/)
80
97
 
81
98
  #### Legal Stuff ####
@@ -1,5 +1,4 @@
1
- require 'ipsum'
2
-
1
+ #!/usr/bin/env ruby
3
2
  filename = ARGV.shift
4
3
 
5
4
  unless filename
@@ -14,7 +13,7 @@ element = <<-eos
14
13
  <artist id="%i">
15
14
  <name>Rock Star %i</name>
16
15
  <active>true</active>
17
- <description>%s</description>
16
+ <description>...</description>
18
17
  <genre>Rock,Metal</genre>
19
18
  </artist>
20
19
  eos
@@ -25,7 +24,7 @@ File.open(filename, 'w') do |f|
25
24
  f.puts '<?xml version="1.0" encoding="UTF-8"?>'
26
25
  f.puts '<artists>'
27
26
  num_records.times do |count|
28
- f.puts(element % [count, count, 5.sentences])
27
+ f.puts(element % [count, count])
29
28
  end
30
29
  f.puts '</artists>'
31
30
  end
@@ -1,5 +1,3 @@
1
- require 'nokogiri'
2
-
3
1
  require 'saxerator/version'
4
2
 
5
3
  require 'saxerator/full_document'
@@ -13,6 +11,7 @@ require 'saxerator/parser/element_name_latch'
13
11
  require 'saxerator/parser/depth_latch'
14
12
  require 'saxerator/parser/within_element_latch'
15
13
  require 'saxerator/parser/latched_accumulator'
14
+ require 'saxerator/parser/child_of_latch'
16
15
 
17
16
  module Saxerator
18
17
  extend self
@@ -1,4 +1,5 @@
1
1
  require 'saxerator/dsl'
2
+ require 'nokogiri'
2
3
 
3
4
  module Saxerator
4
5
  class DocumentFragment
@@ -13,10 +14,10 @@ module Saxerator
13
14
 
14
15
  def each(&block)
15
16
  reader = Parser::LatchedAccumulator.new(@config, @latches, block)
16
- parser = ::Nokogiri::XML::SAX::Parser.new(reader)
17
+ parser = Nokogiri::XML::SAX::Parser.new(reader)
17
18
 
18
19
  # Always have to start at the beginning of a File
19
- @source.rewind if(@source.is_a?(File))
20
+ @source.rewind if @source.respond_to?(:rewind)
20
21
 
21
22
  parser.parse(@source)
22
23
  end
@@ -1,15 +1,24 @@
1
1
  module Saxerator
2
2
  module DSL
3
3
  def for_tag(tag)
4
- DocumentFragment.new(@source, @config, @latches + [Parser::ElementNameLatch.new(tag.to_s)])
4
+ specify(Parser::ElementNameLatch.new(tag.to_s))
5
5
  end
6
6
 
7
7
  def at_depth(depth)
8
- DocumentFragment.new(@source, @config, @latches + [Parser::DepthLatch.new(depth.to_i)])
8
+ specify(Parser::DepthLatch.new(depth.to_i))
9
9
  end
10
10
 
11
11
  def within(tag)
12
- DocumentFragment.new(@source, @config, @latches + [Parser::WithinElementLatch.new(tag.to_s)])
12
+ specify(Parser::WithinElementLatch.new(tag.to_s))
13
+ end
14
+
15
+ def child_of(tag)
16
+ specify(Parser::ChildOfLatch.new(tag.to_s))
17
+ end
18
+
19
+ private
20
+ def specify(predicate)
21
+ DocumentFragment.new(@source, @config, @latches + [predicate])
13
22
  end
14
23
  end
15
24
  end
@@ -1,6 +1,8 @@
1
+ require 'nokogiri'
2
+
1
3
  module Saxerator
2
4
  module Parser
3
- class Accumulator < ::Nokogiri::XML::SAX::Document
5
+ class Accumulator < Nokogiri::XML::SAX::Document
4
6
  def initialize(config, block)
5
7
  @stack = []
6
8
  @config = config
@@ -25,6 +27,10 @@ module Saxerator
25
27
  end
26
28
 
27
29
  alias cdata_block characters
30
+
31
+ def accumulating?
32
+ @stack.size > 0
33
+ end
28
34
  end
29
35
  end
30
36
  end
@@ -0,0 +1,47 @@
1
+ require 'saxerator/parser/document_latch'
2
+
3
+ module Saxerator
4
+ module Parser
5
+ class ChildOfLatch < DocumentLatch
6
+ def initialize(name)
7
+ @name = name
8
+ @depths = []
9
+ @depth_within_element = 0
10
+ end
11
+
12
+ def start_element name, _
13
+ if depth_within_element > 0
14
+ increment_depth(1)
15
+ resolve_open_status
16
+ end
17
+ if @name == name
18
+ @depths.push 1
19
+ end
20
+ end
21
+
22
+ def end_element _
23
+ if depth_within_element > 0
24
+ increment_depth(-1)
25
+ @depths.pop if @depths.last == 0
26
+ resolve_open_status
27
+ end
28
+ end
29
+
30
+ def increment_depth(amount)
31
+ @depths.map! { |depth| depth + amount }
32
+ end
33
+
34
+ def depth_within_element
35
+ @depths.size > 0 ? @depths.last : 0
36
+ end
37
+
38
+ def resolve_open_status
39
+ if depth_within_element == 2
40
+ open
41
+ else
42
+ close
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
@@ -4,19 +4,20 @@ module Saxerator
4
4
  module Parser
5
5
  class DepthLatch < DocumentLatch
6
6
  def initialize(depth)
7
- @depth = depth
8
- @actual_depth = 0
7
+ @target_depth = depth
8
+ @current_depth = 0
9
9
  end
10
10
 
11
11
  def start_element(_, __)
12
- @actual_depth += 1
13
- if @actual_depth == @depth
14
- open
15
- end
12
+ @current_depth += 1
16
13
  end
17
14
 
18
15
  def end_element(_)
19
- @actual_depth -= 1
16
+ @current_depth -= 1
17
+ end
18
+
19
+ def open?
20
+ @current_depth == @target_depth
20
21
  end
21
22
  end
22
23
  end
@@ -1,6 +1,8 @@
1
+ require 'nokogiri'
2
+
1
3
  module Saxerator
2
4
  module Parser
3
- class DocumentLatch < ::Nokogiri::XML::SAX::Document
5
+ class DocumentLatch < Nokogiri::XML::SAX::Document
4
6
  def open
5
7
  @open = true
6
8
  end
@@ -12,10 +14,6 @@ module Saxerator
12
14
  def open?
13
15
  @open
14
16
  end
15
-
16
- def reset
17
- close
18
- end
19
17
  end
20
18
  end
21
19
  end
@@ -7,10 +7,12 @@ module Saxerator
7
7
  @name = name
8
8
  end
9
9
 
10
- def start_element(name, _)
11
- if name == @name
12
- open
13
- end
10
+ def start_element name, _
11
+ name == @name ? open : close
12
+ end
13
+
14
+ def end_element name
15
+ close if name == @name
14
16
  end
15
17
  end
16
18
  end
@@ -1,25 +1,18 @@
1
+ require 'nokogiri'
2
+
1
3
  module Saxerator
2
4
  module Parser
3
- class LatchedAccumulator < ::Nokogiri::XML::SAX::Document
5
+ class LatchedAccumulator < Nokogiri::XML::SAX::Document
4
6
  def initialize(config, latches, block)
5
7
  @latches = latches
6
- block_and_reset = Proc.new do |x|
7
- block.call(x)
8
- reset_latches
9
- end
10
- @accumulator = Accumulator.new(config, block_and_reset)
11
- end
12
-
13
- def reset_latches
14
- @latches.each { |latch| latch.reset }
8
+ @accumulator = Accumulator.new(config, block)
15
9
  end
16
10
 
17
11
  def check_latches_and_passthrough(method, *args)
18
12
  @latches.each { |latch| latch.send(method, *args) }
19
- if @latches.all? { |latch| latch.open? }
13
+ if @accumulator.accumulating? ||
14
+ @latches.all? { |latch| latch.open? }
20
15
  @accumulator.send(method, *args)
21
- else
22
- reset_latches
23
16
  end
24
17
  end
25
18
 
@@ -5,28 +5,22 @@ module Saxerator
5
5
  class WithinElementLatch < DocumentLatch
6
6
  def initialize(name)
7
7
  @name = name
8
- @inner_depth = 0
8
+ @depth_within_element = 0
9
9
  end
10
10
 
11
11
  def start_element name, _
12
- if @inner_depth == 0
13
- if name == @name
14
- @inner_depth += 1
15
- end
16
- else
17
- open if @inner_depth == 1
18
- @inner_depth += 1
12
+ if name == @name || @depth_within_element > 0
13
+ @depth_within_element += 1
14
+ open if @depth_within_element == 2
19
15
  end
20
16
  end
21
17
 
22
18
  def end_element _
23
- if @inner_depth > 0
24
- @inner_depth -= 1
25
- close if @inner_depth == 0
19
+ if @depth_within_element > 0
20
+ @depth_within_element -= 1
21
+ close if @depth_within_element == 1
26
22
  end
27
23
  end
28
-
29
- def reset; end
30
24
  end
31
25
  end
32
26
  end
@@ -1,3 +1,3 @@
1
1
  module Saxerator
2
- VERSION = "0.1.4"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -31,6 +31,7 @@ module Saxerator
31
31
  @children.each do |child|
32
32
  name = child.name
33
33
  element = child.to_hash
34
+
34
35
  if out[name]
35
36
  if !out[name].is_a?(Array)
36
37
  out[name] = [out[name]]
@@ -40,6 +41,7 @@ module Saxerator
40
41
  out[name] = element
41
42
  end
42
43
  end
44
+
43
45
  out
44
46
  end
45
47
  end
@@ -25,7 +25,6 @@ Gem::Specification.new do |s|
25
25
  'saxerator.gemspec',
26
26
  'Gemfile',
27
27
  'Rakefile',
28
- '.rvmrc',
29
28
  '.gitignore',
30
29
  '.travis.yml'
31
30
  ] +
@@ -11,28 +11,28 @@ describe Saxerator do
11
11
  subject { parser }
12
12
  let(:parser) { Saxerator.parser(xml) }
13
13
 
14
- context "with a string with blurbs" do
15
- let(:xml) { "<blurbs><blurb>one</blurb><blurb>two</blurb><blurb>three</blurb></blurbs>" }
14
+ context "with a string with blurbs and one non-blurb" do
15
+ let(:xml) do
16
+ <<-eos
17
+ <blurbs>
18
+ <blurb>one</blurb>
19
+ <blurb>two</blurb>
20
+ <blurb>three</blurb>
21
+ <notablurb>four</notablurb>
22
+ </blurbs>
23
+ eos
24
+ end
16
25
 
17
26
  it "should parse simple strings" do
18
- results = []
19
- subject.for_tag(:blurb).each { |x| results << x }
20
- results.should == ['one', 'two', 'three']
27
+ subject.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
21
28
  end
22
29
 
23
- it "should allow you to parse an entire document" do
24
- subject.all.should == {'blurb' => ['one', 'two', 'three']}
30
+ it "should only parse the requested tag" do
31
+ subject.for_tag(:notablurb).inject([], :<<).should == ['four']
25
32
  end
26
33
 
27
- context "and one non-blurb" do
28
- let(:xml) { "<blurbs><blurb>one</blurb><blurb>two</blurb><blurb>three</blurb><notablurb>four</notablurb></blurbs>" }
29
- it "should only parse the requested tag" do
30
- results = []
31
- subject.for_tag(:blurb).each { |x| results << x }
32
- results.should == ['one', 'two', 'three']
33
- subject.for_tag(:notablurb).each { |x| results << x }
34
- results.should == ['one', 'two', 'three', 'four']
35
- end
34
+ it "should allow you to parse an entire document" do
35
+ subject.all.should == {'blurb' => ['one', 'two', 'three'], 'notablurb' => 'four'}
36
36
  end
37
37
  end
38
38
 
@@ -64,9 +64,7 @@ describe Saxerator do
64
64
  end
65
65
 
66
66
  it "should only parse the requested tag depth" do
67
- results = []
68
- subject.at_depth(3).each { |x| results << x }
69
- results.should == [
67
+ subject.at_depth(3).inject([], :<<).should == [
70
68
  'How to eat an airplane', { 'name' => ['Leviticus Alabaster', 'Eunice Diesel'] },
71
69
  'To wallop a horse in the face', { 'name' => 'Jeanne Clarewood' },
72
70
  'Is our children learning?', { 'name' => 'Hazel Nutt' }
@@ -74,21 +72,58 @@ describe Saxerator do
74
72
  end
75
73
 
76
74
  it "should only parse the requested tag depth and tag" do
77
- results = []
78
- subject.at_depth(3).for_tag(:name).each { |x| results << x }
79
- results.should == ['How to eat an airplane', 'To wallop a horse in the face', 'Is our children learning?']
75
+ subject.at_depth(3).for_tag(:name).inject([], :<<).should == [
76
+ 'How to eat an airplane',
77
+ 'To wallop a horse in the face',
78
+ 'Is our children learning?'
79
+ ]
80
80
  end
81
81
 
82
82
  it "should only parse tags nested inside the specified tag" do
83
- results = []
84
- subject.within(:article).each { |x| results << x }
85
- results.should == ['Is our children learning?', { 'name' => 'Hazel Nutt' }]
83
+ subject.within(:article).inject([], :<<).should == [
84
+ 'Is our children learning?',
85
+ { 'name' => 'Hazel Nutt' }
86
+ ]
87
+ end
88
+
89
+ it "should combine #for_tag and #within to parse the specified elements" do
90
+ subject.for_tag(:name).within(:article).inject([], :<<).should == [
91
+ 'Is our children learning?',
92
+ 'Hazel Nutt'
93
+ ]
94
+ end
95
+ end
96
+
97
+ context "with a grand child" do
98
+ let(:xml) do
99
+ <<-eos
100
+ <root>
101
+ <children>
102
+ <name>Rudy McMannis</name>
103
+ <children>
104
+ <name>Tom McMannis</name>
105
+ </children>
106
+ <grandchildren>
107
+ <name>Mildred Marston</name>
108
+ </grandchildren>
109
+ <name>Anne Welsh</name>
110
+ </children>
111
+ </root>
112
+ eos
86
113
  end
87
114
 
88
- it "should only parse specified tags nested inside a specified tag" do
89
- results = []
90
- subject.for_tag(:name).within(:article).each { |x| results << x }
91
- results.should == ['Is our children learning?', 'Hazel Nutt' ]
115
+ it "should only parse children of the specified tag" do
116
+ subject.child_of(:grandchildren).inject([], :<<).should == [
117
+ 'Mildred Marston'
118
+ ]
119
+ end
120
+
121
+ it "should combine #for_tag and #child_of" do
122
+ subject.for_tag(:name).child_of(:children).inject([], :<<).should == [
123
+ 'Rudy McMannis',
124
+ 'Tom McMannis',
125
+ 'Anne Welsh'
126
+ ]
92
127
  end
93
128
  end
94
129
 
@@ -96,9 +131,7 @@ describe Saxerator do
96
131
  let(:xml) { fixture_file('flat_blurbs.xml') }
97
132
 
98
133
  it "should parse simple strings" do
99
- results = []
100
- subject.for_tag(:blurb).each { |x| results << x }
101
- results.should == ['one', 'two', 'three']
134
+ subject.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
102
135
  end
103
136
 
104
137
  it "should allow multiple operations on the same parser" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: saxerator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-17 00:00:00.000000000 Z
12
+ date: 2012-08-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -75,7 +75,6 @@ files:
75
75
  - saxerator.gemspec
76
76
  - Gemfile
77
77
  - Rakefile
78
- - .rvmrc
79
78
  - .gitignore
80
79
  - .travis.yml
81
80
  - lib/saxerator/document_fragment.rb
@@ -83,6 +82,7 @@ files:
83
82
  - lib/saxerator/full_document.rb
84
83
  - lib/saxerator/hash_with_attributes.rb
85
84
  - lib/saxerator/parser/accumulator.rb
85
+ - lib/saxerator/parser/child_of_latch.rb
86
86
  - lib/saxerator/parser/depth_latch.rb
87
87
  - lib/saxerator/parser/document_latch.rb
88
88
  - lib/saxerator/parser/element_name_latch.rb
@@ -98,7 +98,6 @@ files:
98
98
  - spec/spec_helper.rb
99
99
  - benchmark/benchmark.rb
100
100
  - benchmark/generate_sample_file.rb
101
- - benchmark/profile.rb
102
101
  homepage: https://github.com/soulcutter/saxerator
103
102
  licenses:
104
103
  - MIT
@@ -120,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
120
119
  version: '0'
121
120
  requirements: []
122
121
  rubyforge_project: saxerator
123
- rubygems_version: 1.8.21
122
+ rubygems_version: 1.8.24
124
123
  signing_key:
125
124
  specification_version: 3
126
125
  summary: A SAX-based XML-to-hash parser for parsing large files into manageable chunks
data/.rvmrc DELETED
@@ -1,38 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
- # development environment upon cd'ing into the directory
5
-
6
- # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional.
7
- environment_id="1.9.3@saxerator"
8
-
9
- #
10
- # First we attempt to load the desired environment directly from the environment
11
- # file. This is very fast and efficient compared to running through the entire
12
- # CLI and selector. If you want feedback on which environment was used then
13
- # insert the word 'use' after --create as this triggers verbose mode.
14
- #
15
- if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
16
- && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
17
- then
18
- \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
19
-
20
- if [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]]
21
- then
22
- . "${rvm_path:-$HOME/.rvm}/hooks/after_use"
23
- fi
24
- else
25
- # If the environment file has not yet been created, use the RVM CLI to select.
26
- if ! rvm --create "$environment_id"
27
- then
28
- echo "Failed to create RVM environment '${environment_id}'."
29
- return 1
30
- fi
31
- fi
32
-
33
- if [[ $- == *i* ]] # check for interactive shells
34
- then
35
- echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
36
- else
37
- echo "Using: $GEM_HOME" # don't use colors in interactive shells
38
- fi
@@ -1,21 +0,0 @@
1
- $:.push File.expand_path('../../lib', __FILE__)
2
- require 'saxerator'
3
- require 'ruby-prof'
4
-
5
- file = ARGV.shift
6
- if !File.exists?(file)
7
- puts "Cannot find file #{file}"
8
- exit 1
9
- end
10
- file = File.new(file)
11
-
12
- count = 0
13
- RubyProf.start
14
-
15
- Saxerator.parser(file).for_tag(:artist).each { count = count + 1 }
16
-
17
- result = RubyProf.stop
18
- printer = RubyProf::FlatPrinter.new(result)
19
- printer.print(STDOUT)
20
-
21
- puts "for_tag: #{count} artist elements parsed"