saxerator 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -5,4 +5,4 @@ pkg/*
5
5
  .DS_Store
6
6
  .idea/
7
7
  coverage/
8
-
8
+ .rvmrc
@@ -1,5 +1,5 @@
1
1
  language: ruby
2
- bundler_args: --without coverage benchmark
2
+ bundler_args: --without coverage
3
3
  rvm:
4
4
  - 1.8.7
5
5
  - 1.9.2
data/Gemfile CHANGED
@@ -3,10 +3,6 @@ source "http://rubygems.org"
3
3
  # Specify your gem's dependencies in saxerator.gemspec
4
4
  gemspec
5
5
 
6
- group :benchmark do
7
- gem 'ipsum'
8
- end
9
-
10
6
  group :coverage do
11
7
  gem 'simplecov'
12
8
  end
data/README.md CHANGED
@@ -40,9 +40,26 @@ primary_authors = parser.for_tag(:author).select { |author| author.attributes['t
40
40
  You can combine predicates to isolate just the tags you want.
41
41
 
42
42
  ```ruby
43
- parser.for_tag(:name).each { |x| all_the_names_in_a_document << x }
44
- parser.for_tag(:name).at_depth(2).each { |x| names_nested_under_document_root << x }
45
- parser.for_tag(:name).within(:author).each { |x| author_names << x }
43
+ require 'saxerator'
44
+
45
+ parser = Saxerator.parser(bookshelf_xml)
46
+
47
+ # You can chain predicates
48
+ parser.for_tag(:name).within(:book).each { |book_name| puts book_name }
49
+
50
+ # You can re-use intermediary predicates
51
+ bookshelf_contents = parser.within(:bookshelf)
52
+
53
+ books = bookshelf_contents.for_tag(:book)
54
+ magazines = bookshelf_contents.for_tag(:magazine)
55
+
56
+ books.each do |book|
57
+ # Some processing on a book
58
+ end
59
+
60
+ magazines.each do |magazine|
61
+ # Do some work with a magazine
62
+ end
46
63
  ```
47
64
 
48
65
  Don't care about memory/streaming and just want your XML in one big hash? Saxerator can do that too.
@@ -75,7 +92,7 @@ Why not DOM parsing?
75
92
  > document is very large, this can be an important consideration.
76
93
 
77
94
  ### Acknowledgements ###
78
- Saxerator was inspired by [Nori](https://github.com/rubiii/nori) and [Gregory Brown](http://majesticseacreature.com/)'s
95
+ Saxerator was inspired by - but not affiliated with - [nori](https://github.com/rubiii/nori) and [Gregory Brown](http://majesticseacreature.com/)'s
79
96
  [Practicing Ruby](http://practicingruby.com/)
80
97
 
81
98
  #### Legal Stuff ####
@@ -1,5 +1,4 @@
1
- require 'ipsum'
2
-
1
+ #!/usr/bin/env ruby
3
2
  filename = ARGV.shift
4
3
 
5
4
  unless filename
@@ -14,7 +13,7 @@ element = <<-eos
14
13
  <artist id="%i">
15
14
  <name>Rock Star %i</name>
16
15
  <active>true</active>
17
- <description>%s</description>
16
+ <description>...</description>
18
17
  <genre>Rock,Metal</genre>
19
18
  </artist>
20
19
  eos
@@ -25,7 +24,7 @@ File.open(filename, 'w') do |f|
25
24
  f.puts '<?xml version="1.0" encoding="UTF-8"?>'
26
25
  f.puts '<artists>'
27
26
  num_records.times do |count|
28
- f.puts(element % [count, count, 5.sentences])
27
+ f.puts(element % [count, count])
29
28
  end
30
29
  f.puts '</artists>'
31
30
  end
@@ -1,5 +1,3 @@
1
- require 'nokogiri'
2
-
3
1
  require 'saxerator/version'
4
2
 
5
3
  require 'saxerator/full_document'
@@ -13,6 +11,7 @@ require 'saxerator/parser/element_name_latch'
13
11
  require 'saxerator/parser/depth_latch'
14
12
  require 'saxerator/parser/within_element_latch'
15
13
  require 'saxerator/parser/latched_accumulator'
14
+ require 'saxerator/parser/child_of_latch'
16
15
 
17
16
  module Saxerator
18
17
  extend self
@@ -1,4 +1,5 @@
1
1
  require 'saxerator/dsl'
2
+ require 'nokogiri'
2
3
 
3
4
  module Saxerator
4
5
  class DocumentFragment
@@ -13,10 +14,10 @@ module Saxerator
13
14
 
14
15
  def each(&block)
15
16
  reader = Parser::LatchedAccumulator.new(@config, @latches, block)
16
- parser = ::Nokogiri::XML::SAX::Parser.new(reader)
17
+ parser = Nokogiri::XML::SAX::Parser.new(reader)
17
18
 
18
19
  # Always have to start at the beginning of a File
19
- @source.rewind if(@source.is_a?(File))
20
+ @source.rewind if @source.respond_to?(:rewind)
20
21
 
21
22
  parser.parse(@source)
22
23
  end
@@ -1,15 +1,24 @@
1
1
  module Saxerator
2
2
  module DSL
3
3
  def for_tag(tag)
4
- DocumentFragment.new(@source, @config, @latches + [Parser::ElementNameLatch.new(tag.to_s)])
4
+ specify(Parser::ElementNameLatch.new(tag.to_s))
5
5
  end
6
6
 
7
7
  def at_depth(depth)
8
- DocumentFragment.new(@source, @config, @latches + [Parser::DepthLatch.new(depth.to_i)])
8
+ specify(Parser::DepthLatch.new(depth.to_i))
9
9
  end
10
10
 
11
11
  def within(tag)
12
- DocumentFragment.new(@source, @config, @latches + [Parser::WithinElementLatch.new(tag.to_s)])
12
+ specify(Parser::WithinElementLatch.new(tag.to_s))
13
+ end
14
+
15
+ def child_of(tag)
16
+ specify(Parser::ChildOfLatch.new(tag.to_s))
17
+ end
18
+
19
+ private
20
+ def specify(predicate)
21
+ DocumentFragment.new(@source, @config, @latches + [predicate])
13
22
  end
14
23
  end
15
24
  end
@@ -1,6 +1,8 @@
1
+ require 'nokogiri'
2
+
1
3
  module Saxerator
2
4
  module Parser
3
- class Accumulator < ::Nokogiri::XML::SAX::Document
5
+ class Accumulator < Nokogiri::XML::SAX::Document
4
6
  def initialize(config, block)
5
7
  @stack = []
6
8
  @config = config
@@ -25,6 +27,10 @@ module Saxerator
25
27
  end
26
28
 
27
29
  alias cdata_block characters
30
+
31
+ def accumulating?
32
+ @stack.size > 0
33
+ end
28
34
  end
29
35
  end
30
36
  end
@@ -0,0 +1,47 @@
1
+ require 'saxerator/parser/document_latch'
2
+
3
+ module Saxerator
4
+ module Parser
5
+ class ChildOfLatch < DocumentLatch
6
+ def initialize(name)
7
+ @name = name
8
+ @depths = []
9
+ @depth_within_element = 0
10
+ end
11
+
12
+ def start_element name, _
13
+ if depth_within_element > 0
14
+ increment_depth(1)
15
+ resolve_open_status
16
+ end
17
+ if @name == name
18
+ @depths.push 1
19
+ end
20
+ end
21
+
22
+ def end_element _
23
+ if depth_within_element > 0
24
+ increment_depth(-1)
25
+ @depths.pop if @depths.last == 0
26
+ resolve_open_status
27
+ end
28
+ end
29
+
30
+ def increment_depth(amount)
31
+ @depths.map! { |depth| depth + amount }
32
+ end
33
+
34
+ def depth_within_element
35
+ @depths.size > 0 ? @depths.last : 0
36
+ end
37
+
38
+ def resolve_open_status
39
+ if depth_within_element == 2
40
+ open
41
+ else
42
+ close
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
@@ -4,19 +4,20 @@ module Saxerator
4
4
  module Parser
5
5
  class DepthLatch < DocumentLatch
6
6
  def initialize(depth)
7
- @depth = depth
8
- @actual_depth = 0
7
+ @target_depth = depth
8
+ @current_depth = 0
9
9
  end
10
10
 
11
11
  def start_element(_, __)
12
- @actual_depth += 1
13
- if @actual_depth == @depth
14
- open
15
- end
12
+ @current_depth += 1
16
13
  end
17
14
 
18
15
  def end_element(_)
19
- @actual_depth -= 1
16
+ @current_depth -= 1
17
+ end
18
+
19
+ def open?
20
+ @current_depth == @target_depth
20
21
  end
21
22
  end
22
23
  end
@@ -1,6 +1,8 @@
1
+ require 'nokogiri'
2
+
1
3
  module Saxerator
2
4
  module Parser
3
- class DocumentLatch < ::Nokogiri::XML::SAX::Document
5
+ class DocumentLatch < Nokogiri::XML::SAX::Document
4
6
  def open
5
7
  @open = true
6
8
  end
@@ -12,10 +14,6 @@ module Saxerator
12
14
  def open?
13
15
  @open
14
16
  end
15
-
16
- def reset
17
- close
18
- end
19
17
  end
20
18
  end
21
19
  end
@@ -7,10 +7,12 @@ module Saxerator
7
7
  @name = name
8
8
  end
9
9
 
10
- def start_element(name, _)
11
- if name == @name
12
- open
13
- end
10
+ def start_element name, _
11
+ name == @name ? open : close
12
+ end
13
+
14
+ def end_element name
15
+ close if name == @name
14
16
  end
15
17
  end
16
18
  end
@@ -1,25 +1,18 @@
1
+ require 'nokogiri'
2
+
1
3
  module Saxerator
2
4
  module Parser
3
- class LatchedAccumulator < ::Nokogiri::XML::SAX::Document
5
+ class LatchedAccumulator < Nokogiri::XML::SAX::Document
4
6
  def initialize(config, latches, block)
5
7
  @latches = latches
6
- block_and_reset = Proc.new do |x|
7
- block.call(x)
8
- reset_latches
9
- end
10
- @accumulator = Accumulator.new(config, block_and_reset)
11
- end
12
-
13
- def reset_latches
14
- @latches.each { |latch| latch.reset }
8
+ @accumulator = Accumulator.new(config, block)
15
9
  end
16
10
 
17
11
  def check_latches_and_passthrough(method, *args)
18
12
  @latches.each { |latch| latch.send(method, *args) }
19
- if @latches.all? { |latch| latch.open? }
13
+ if @accumulator.accumulating? ||
14
+ @latches.all? { |latch| latch.open? }
20
15
  @accumulator.send(method, *args)
21
- else
22
- reset_latches
23
16
  end
24
17
  end
25
18
 
@@ -5,28 +5,22 @@ module Saxerator
5
5
  class WithinElementLatch < DocumentLatch
6
6
  def initialize(name)
7
7
  @name = name
8
- @inner_depth = 0
8
+ @depth_within_element = 0
9
9
  end
10
10
 
11
11
  def start_element name, _
12
- if @inner_depth == 0
13
- if name == @name
14
- @inner_depth += 1
15
- end
16
- else
17
- open if @inner_depth == 1
18
- @inner_depth += 1
12
+ if name == @name || @depth_within_element > 0
13
+ @depth_within_element += 1
14
+ open if @depth_within_element == 2
19
15
  end
20
16
  end
21
17
 
22
18
  def end_element _
23
- if @inner_depth > 0
24
- @inner_depth -= 1
25
- close if @inner_depth == 0
19
+ if @depth_within_element > 0
20
+ @depth_within_element -= 1
21
+ close if @depth_within_element == 1
26
22
  end
27
23
  end
28
-
29
- def reset; end
30
24
  end
31
25
  end
32
26
  end
@@ -1,3 +1,3 @@
1
1
  module Saxerator
2
- VERSION = "0.1.4"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -31,6 +31,7 @@ module Saxerator
31
31
  @children.each do |child|
32
32
  name = child.name
33
33
  element = child.to_hash
34
+
34
35
  if out[name]
35
36
  if !out[name].is_a?(Array)
36
37
  out[name] = [out[name]]
@@ -40,6 +41,7 @@ module Saxerator
40
41
  out[name] = element
41
42
  end
42
43
  end
44
+
43
45
  out
44
46
  end
45
47
  end
@@ -25,7 +25,6 @@ Gem::Specification.new do |s|
25
25
  'saxerator.gemspec',
26
26
  'Gemfile',
27
27
  'Rakefile',
28
- '.rvmrc',
29
28
  '.gitignore',
30
29
  '.travis.yml'
31
30
  ] +
@@ -11,28 +11,28 @@ describe Saxerator do
11
11
  subject { parser }
12
12
  let(:parser) { Saxerator.parser(xml) }
13
13
 
14
- context "with a string with blurbs" do
15
- let(:xml) { "<blurbs><blurb>one</blurb><blurb>two</blurb><blurb>three</blurb></blurbs>" }
14
+ context "with a string with blurbs and one non-blurb" do
15
+ let(:xml) do
16
+ <<-eos
17
+ <blurbs>
18
+ <blurb>one</blurb>
19
+ <blurb>two</blurb>
20
+ <blurb>three</blurb>
21
+ <notablurb>four</notablurb>
22
+ </blurbs>
23
+ eos
24
+ end
16
25
 
17
26
  it "should parse simple strings" do
18
- results = []
19
- subject.for_tag(:blurb).each { |x| results << x }
20
- results.should == ['one', 'two', 'three']
27
+ subject.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
21
28
  end
22
29
 
23
- it "should allow you to parse an entire document" do
24
- subject.all.should == {'blurb' => ['one', 'two', 'three']}
30
+ it "should only parse the requested tag" do
31
+ subject.for_tag(:notablurb).inject([], :<<).should == ['four']
25
32
  end
26
33
 
27
- context "and one non-blurb" do
28
- let(:xml) { "<blurbs><blurb>one</blurb><blurb>two</blurb><blurb>three</blurb><notablurb>four</notablurb></blurbs>" }
29
- it "should only parse the requested tag" do
30
- results = []
31
- subject.for_tag(:blurb).each { |x| results << x }
32
- results.should == ['one', 'two', 'three']
33
- subject.for_tag(:notablurb).each { |x| results << x }
34
- results.should == ['one', 'two', 'three', 'four']
35
- end
34
+ it "should allow you to parse an entire document" do
35
+ subject.all.should == {'blurb' => ['one', 'two', 'three'], 'notablurb' => 'four'}
36
36
  end
37
37
  end
38
38
 
@@ -64,9 +64,7 @@ describe Saxerator do
64
64
  end
65
65
 
66
66
  it "should only parse the requested tag depth" do
67
- results = []
68
- subject.at_depth(3).each { |x| results << x }
69
- results.should == [
67
+ subject.at_depth(3).inject([], :<<).should == [
70
68
  'How to eat an airplane', { 'name' => ['Leviticus Alabaster', 'Eunice Diesel'] },
71
69
  'To wallop a horse in the face', { 'name' => 'Jeanne Clarewood' },
72
70
  'Is our children learning?', { 'name' => 'Hazel Nutt' }
@@ -74,21 +72,58 @@ describe Saxerator do
74
72
  end
75
73
 
76
74
  it "should only parse the requested tag depth and tag" do
77
- results = []
78
- subject.at_depth(3).for_tag(:name).each { |x| results << x }
79
- results.should == ['How to eat an airplane', 'To wallop a horse in the face', 'Is our children learning?']
75
+ subject.at_depth(3).for_tag(:name).inject([], :<<).should == [
76
+ 'How to eat an airplane',
77
+ 'To wallop a horse in the face',
78
+ 'Is our children learning?'
79
+ ]
80
80
  end
81
81
 
82
82
  it "should only parse tags nested inside the specified tag" do
83
- results = []
84
- subject.within(:article).each { |x| results << x }
85
- results.should == ['Is our children learning?', { 'name' => 'Hazel Nutt' }]
83
+ subject.within(:article).inject([], :<<).should == [
84
+ 'Is our children learning?',
85
+ { 'name' => 'Hazel Nutt' }
86
+ ]
87
+ end
88
+
89
+ it "should combine #for_tag and #within to parse the specified elements" do
90
+ subject.for_tag(:name).within(:article).inject([], :<<).should == [
91
+ 'Is our children learning?',
92
+ 'Hazel Nutt'
93
+ ]
94
+ end
95
+ end
96
+
97
+ context "with a grand child" do
98
+ let(:xml) do
99
+ <<-eos
100
+ <root>
101
+ <children>
102
+ <name>Rudy McMannis</name>
103
+ <children>
104
+ <name>Tom McMannis</name>
105
+ </children>
106
+ <grandchildren>
107
+ <name>Mildred Marston</name>
108
+ </grandchildren>
109
+ <name>Anne Welsh</name>
110
+ </children>
111
+ </root>
112
+ eos
86
113
  end
87
114
 
88
- it "should only parse specified tags nested inside a specified tag" do
89
- results = []
90
- subject.for_tag(:name).within(:article).each { |x| results << x }
91
- results.should == ['Is our children learning?', 'Hazel Nutt' ]
115
+ it "should only parse children of the specified tag" do
116
+ subject.child_of(:grandchildren).inject([], :<<).should == [
117
+ 'Mildred Marston'
118
+ ]
119
+ end
120
+
121
+ it "should combine #for_tag and #child_of" do
122
+ subject.for_tag(:name).child_of(:children).inject([], :<<).should == [
123
+ 'Rudy McMannis',
124
+ 'Tom McMannis',
125
+ 'Anne Welsh'
126
+ ]
92
127
  end
93
128
  end
94
129
 
@@ -96,9 +131,7 @@ describe Saxerator do
96
131
  let(:xml) { fixture_file('flat_blurbs.xml') }
97
132
 
98
133
  it "should parse simple strings" do
99
- results = []
100
- subject.for_tag(:blurb).each { |x| results << x }
101
- results.should == ['one', 'two', 'three']
134
+ subject.for_tag(:blurb).inject([], :<<).should == ['one', 'two', 'three']
102
135
  end
103
136
 
104
137
  it "should allow multiple operations on the same parser" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: saxerator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-17 00:00:00.000000000 Z
12
+ date: 2012-08-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -75,7 +75,6 @@ files:
75
75
  - saxerator.gemspec
76
76
  - Gemfile
77
77
  - Rakefile
78
- - .rvmrc
79
78
  - .gitignore
80
79
  - .travis.yml
81
80
  - lib/saxerator/document_fragment.rb
@@ -83,6 +82,7 @@ files:
83
82
  - lib/saxerator/full_document.rb
84
83
  - lib/saxerator/hash_with_attributes.rb
85
84
  - lib/saxerator/parser/accumulator.rb
85
+ - lib/saxerator/parser/child_of_latch.rb
86
86
  - lib/saxerator/parser/depth_latch.rb
87
87
  - lib/saxerator/parser/document_latch.rb
88
88
  - lib/saxerator/parser/element_name_latch.rb
@@ -98,7 +98,6 @@ files:
98
98
  - spec/spec_helper.rb
99
99
  - benchmark/benchmark.rb
100
100
  - benchmark/generate_sample_file.rb
101
- - benchmark/profile.rb
102
101
  homepage: https://github.com/soulcutter/saxerator
103
102
  licenses:
104
103
  - MIT
@@ -120,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
120
119
  version: '0'
121
120
  requirements: []
122
121
  rubyforge_project: saxerator
123
- rubygems_version: 1.8.21
122
+ rubygems_version: 1.8.24
124
123
  signing_key:
125
124
  specification_version: 3
126
125
  summary: A SAX-based XML-to-hash parser for parsing large files into manageable chunks
data/.rvmrc DELETED
@@ -1,38 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
- # development environment upon cd'ing into the directory
5
-
6
- # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional.
7
- environment_id="1.9.3@saxerator"
8
-
9
- #
10
- # First we attempt to load the desired environment directly from the environment
11
- # file. This is very fast and efficient compared to running through the entire
12
- # CLI and selector. If you want feedback on which environment was used then
13
- # insert the word 'use' after --create as this triggers verbose mode.
14
- #
15
- if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
16
- && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
17
- then
18
- \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
19
-
20
- if [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]]
21
- then
22
- . "${rvm_path:-$HOME/.rvm}/hooks/after_use"
23
- fi
24
- else
25
- # If the environment file has not yet been created, use the RVM CLI to select.
26
- if ! rvm --create "$environment_id"
27
- then
28
- echo "Failed to create RVM environment '${environment_id}'."
29
- return 1
30
- fi
31
- fi
32
-
33
- if [[ $- == *i* ]] # check for interactive shells
34
- then
35
- echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
36
- else
37
- echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
38
- fi
@@ -1,21 +0,0 @@
1
- $:.push File.expand_path('../../lib', __FILE__)
2
- require 'saxerator'
3
- require 'ruby-prof'
4
-
5
- file = ARGV.shift
6
- if !File.exists?(file)
7
- puts "Cannot find file #{file}"
8
- exit 1
9
- end
10
- file = File.new(file)
11
-
12
- count = 0
13
- RubyProf.start
14
-
15
- Saxerator.parser(file).for_tag(:artist).each { count = count + 1 }
16
-
17
- result = RubyProf.stop
18
- printer = RubyProf::FlatPrinter.new(result)
19
- printer.print(STDOUT)
20
-
21
- puts "for_tag: #{count} artist elements parsed"