saxerator 0.0.3 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
+ language: ruby
2
+ bundler_args: --without coverage benchmark
3
+ rvm:
4
+ - 1.8.7
5
+ - 1.9.2
6
+ - 1.9.3
7
+ - jruby-18mode
8
+ - jruby-19mode
9
+ - rbx-18mode
10
+ - rbx-19mode
11
+ matrix:
12
+ allow_failures:
13
+ - rvm: jruby-18mode
14
+ - rvm: jruby-19mode
data/Gemfile CHANGED
@@ -2,3 +2,11 @@ source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in saxerator.gemspec
4
4
  gemspec
5
+
6
+ group :benchmark do
7
+ gem 'ipsum'
8
+ end
9
+
10
+ group :coverage do
11
+ gem 'simplecov'
12
+ end
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- Saxerator
1
+ Saxerator [![Build Status](https://secure.travis-ci.org/soulcutter/saxerator.png?branch=master)](http://travis-ci.org/soulcutter/saxerator)
2
2
  =========
3
3
 
4
4
  Saxerator is a SAX-based xml-to-hash parser designed for parsing very large files into manageable chunks. Rather than
@@ -10,6 +10,12 @@ Each xml chunk is parsed into a JSON-like Ruby Hash structure for consumption.
10
10
 
11
11
  Examples
12
12
  --------
13
+ You can parse any valid xml in 3 simple steps.
14
+
15
+ 1. Initialize the parser
16
+ 1. Tell it which tag you care about
17
+ 1. Perform your work in an 'each' block, or using any [Enumerable](http://apidock.com/ruby/Enumerable)
18
+ method
13
19
 
14
20
  ```ruby
15
21
  parser = Saxerator.parser(File.new("rss.xml"))
@@ -24,25 +30,24 @@ end
24
30
  puts "First title: #{parser.for_tag(:title).first}"
25
31
  ```
26
32
 
27
- ```ruby
28
- # attributes are stored as a part of the Hash or String object they relate to
33
+ Attributes are stored as a part of the Hash or String object they relate to.
29
34
 
30
- # author behaves like a String here, but also responds to .attributes
35
+ ```ruby
36
+ # author is a String here, but also responds to .attributes
31
37
  primary_authors = parser.for_tag(:author).select { |author| author.attributes['type'] == 'primary' }
32
- puts "Primary authors: #{primary_authors.join(", ")}"
33
-
34
- # item behaves like a Hash, but also responds to .attributes
35
- favorite_items = parser.for_tag(:item).select { |item| item.attributes['favorite'] }
36
- puts "First favorite title: #{favorite_items[0]['title']}"
37
38
  ```
38
39
 
39
- Compatibility
40
- -------------
41
- Known compatible rubies:
40
+ You can combine predicates to isolate just the tags you want.
42
41
 
43
- * MRI 1.9.3-p125
44
- * MRI 1.9.2-p318
45
- * JRuby 1.6.7 (with JRUBY_OPTS=--1.9)
42
+ ```ruby
43
+ parser.for_tag(:name).each { |x| all_the_names_in_a_document << x }
44
+ parser.for_tag(:name).at_depth(2).each { |x| names_nested_under_document_root << x }
45
+ ```
46
+
47
+ Known Issues
48
+ ------------
49
+ * JRuby closes the file stream at the end of parsing; therefore, to perform multiple operations
50
+ which parse a file you will need to instantiate a new parser with a new File object.
46
51
 
47
52
  FAQ
48
53
  ---
@@ -64,4 +69,7 @@ Why not DOM parsing?
64
69
 
65
70
  ### Acknowledgements ###
66
71
  Saxerator was inspired by [Nori](https://github.com/rubiii/nori) and [Gregory Brown](http://majesticseacreature.com/)'s
67
- [Practicing Ruby](http://practicingruby.com/)
72
+ [Practicing Ruby](http://practicingruby.com/)
73
+
74
+ #### Legal Stuff ####
75
+ Copyright © Bradley Schaefer. MIT License (see LICENSE file).
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
- require "bundler/gem_tasks"
1
+ require 'rake/dsl_definition'
2
+ require 'bundler/gem_tasks'
2
3
  require 'rspec/core/rake_task'
3
4
 
4
5
  RSpec::Core::RakeTask.new(:spec)
@@ -1,16 +1,21 @@
1
- require "saxerator/version"
2
- require 'saxerator/configuration'
1
+ require 'nokogiri'
3
2
 
3
+ require "saxerator/version"
4
+ require 'saxerator/document'
4
5
  require 'saxerator/string_with_attributes'
5
6
  require 'saxerator/hash_with_attributes'
6
-
7
7
  require 'saxerator/xml_node'
8
- require 'saxerator/parser/nokogiri'
8
+
9
+ require 'saxerator/parser/accumulator'
10
+ require 'saxerator/parser/document_latch'
11
+ require 'saxerator/parser/element_name_latch'
12
+ require 'saxerator/parser/depth_latch'
13
+ require 'saxerator/parser/latched_accumulator'
9
14
 
10
15
  module Saxerator
11
16
  extend self
12
17
 
13
18
  def parser(xml)
14
- Saxerator::Configuration.new(xml)
19
+ Saxerator::Document.new(xml)
15
20
  end
16
21
  end
@@ -0,0 +1,29 @@
1
+ module Saxerator
2
+ class Document
3
+ include Enumerable
4
+
5
+ def initialize(source, config = nil, latches = [])
6
+ @source = source
7
+ @latches = latches
8
+ @config = config
9
+ end
10
+
11
+ def for_tag(tag)
12
+ Document.new(@source, @config, @latches + [Parser::ElementNameLatch.new(tag.to_s)])
13
+ end
14
+
15
+ def at_depth(depth)
16
+ Document.new(@source, @config, @latches + [Parser::DepthLatch.new(depth.to_i)])
17
+ end
18
+
19
+ def each(&block)
20
+ document = Parser::LatchedAccumulator.new(@config, @latches, block)
21
+ parser = ::Nokogiri::XML::SAX::Parser.new document
22
+
23
+ # Always have to start at the beginning of a File
24
+ @source.rewind if(@source.is_a?(File))
25
+
26
+ parser.parse(@source)
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,30 @@
1
+ module Saxerator
2
+ module Parser
3
+ class Accumulator < ::Nokogiri::XML::SAX::Document
4
+ def initialize(config, block)
5
+ @stack = []
6
+ @config = config
7
+ @block = block
8
+ end
9
+
10
+ def start_element(name, attrs = [])
11
+ @stack.push XmlNode.new(@config, name, Hash[*attrs.flatten])
12
+ end
13
+
14
+ def end_element(_)
15
+ if @stack.size > 1
16
+ last = @stack.pop
17
+ @stack.last.add_node last
18
+ else
19
+ @block.call(@stack.pop.to_hash)
20
+ end
21
+ end
22
+
23
+ def characters(string)
24
+ @stack.last.add_node(string) unless string.strip.length == 0
25
+ end
26
+
27
+ alias cdata_block characters
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,21 @@
1
+ module Saxerator
2
+ module Parser
3
+ class DepthLatch < DocumentLatch
4
+ def initialize(depth)
5
+ @depth = depth
6
+ @actual_depth = 0
7
+ end
8
+
9
+ def start_element name, attrs = []
10
+ @actual_depth += 1
11
+ if @actual_depth == @depth
12
+ open
13
+ end
14
+ end
15
+
16
+ def end_element name
17
+ @actual_depth -= 1
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ module Saxerator
2
+ module Parser
3
+ class DocumentLatch < ::Nokogiri::XML::SAX::Document
4
+ def open
5
+ @open = true
6
+ end
7
+
8
+ def close
9
+ @open = false
10
+ end
11
+
12
+ def open?
13
+ @open
14
+ end
15
+
16
+ def reset
17
+ close
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,15 @@
1
+ module Saxerator
2
+ module Parser
3
+ class ElementNameLatch < DocumentLatch
4
+ def initialize(name)
5
+ @name = name
6
+ end
7
+
8
+ def start_element(name, _)
9
+ if name == @name
10
+ open
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,75 @@
1
+ module Saxerator
2
+ module Parser
3
+ class LatchedAccumulator < ::Nokogiri::XML::SAX::Document
4
+ def initialize(config, latches, block)
5
+ @latches = latches
6
+ block_and_reset = Proc.new do |x|
7
+ block.call(x)
8
+ reset_latches
9
+ end
10
+ @accumulator = Accumulator.new(config, block_and_reset)
11
+ end
12
+
13
+ def reset_latches
14
+ @latches.each(&:reset)
15
+ end
16
+
17
+ def check_latches_and_passthrough(method, *args)
18
+ @latches.each { |latch| latch.send(method, *args) }
19
+ if @latches.all?(&:open?)
20
+ @accumulator.send(method, *args)
21
+ else
22
+ reset_latches
23
+ end
24
+ end
25
+
26
+ def xmldecl version, encoding, standalone
27
+ check_latches_and_passthrough(:xmldecl, version, encoding, standalone)
28
+ end
29
+
30
+ def start_document
31
+ check_latches_and_passthrough(:start_document)
32
+ end
33
+
34
+ def end_document
35
+ check_latches_and_passthrough(:end_document)
36
+ end
37
+
38
+ def start_element name, attrs = []
39
+ check_latches_and_passthrough(:start_element, name, attrs)
40
+ end
41
+
42
+ def end_element name
43
+ check_latches_and_passthrough(:end_element, name)
44
+ end
45
+
46
+ def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = []
47
+ check_latches_and_passthrough(:start_element_namespace, name, attrs, prefix, uri, ns)
48
+ end
49
+
50
+ def end_element_namespace name, prefix = nil, uri = nil
51
+ check_latches_and_passthrough(:end_element_namespace, name, prefix, uri)
52
+ end
53
+
54
+ def characters string
55
+ check_latches_and_passthrough(:characters, string)
56
+ end
57
+
58
+ def comment string
59
+ check_latches_and_passthrough(:comment, string)
60
+ end
61
+
62
+ def warning string
63
+ check_latches_and_passthrough(:warning, string)
64
+ end
65
+
66
+ def error string
67
+ check_latches_and_passthrough(:error, string)
68
+ end
69
+
70
+ def cdata_block string
71
+ check_latches_and_passthrough(:cdata_block, string)
72
+ end
73
+ end
74
+ end
75
+ end
@@ -1,3 +1,3 @@
1
1
  module Saxerator
2
- VERSION = "0.0.3"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -1,23 +1,23 @@
1
1
  module Saxerator
2
2
  class XmlNode
3
- attr_accessor :name, :attributes, :children, :type
3
+ attr_reader :name
4
4
 
5
5
  def initialize(config, name, attributes)
6
6
  @config = config
7
- self.name = name
8
- self.attributes = attributes
9
- self.children = []
7
+ @name = name
8
+ @attributes = attributes
9
+ @children = []
10
10
  @text = false
11
11
  end
12
12
 
13
13
  def add_node(node)
14
14
  @text = true if node.is_a? String
15
- children << node
15
+ @children << node
16
16
  end
17
17
 
18
18
  def to_s
19
- string = StringWithAttributes.new(@text ? children.join : children.to_s)
20
- string.attributes = attributes
19
+ string = StringWithAttributes.new(@children.join)
20
+ string.attributes = @attributes
21
21
  string
22
22
  end
23
23
 
@@ -26,7 +26,7 @@ module Saxerator
26
26
  to_s
27
27
  else
28
28
  out = HashWithAttributes.new
29
- out.attributes = attributes
29
+ out.attributes = @attributes
30
30
 
31
31
  @children.each do |child|
32
32
  name = child.name
@@ -1,7 +1,6 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
  $:.push File.expand_path('../lib', __FILE__)
3
3
  require 'saxerator/version'
4
- require 'rake' # for FileList
5
4
 
6
5
  Gem::Specification.new do |s|
7
6
  s.name = 'saxerator'
@@ -20,26 +19,25 @@ Gem::Specification.new do |s|
20
19
 
21
20
  s.rubyforge_project = 'saxerator'
22
21
 
23
- s.files = FileList[
22
+ s.files = [
24
23
  'LICENSE',
25
24
  'README.md',
26
25
  'saxerator.gemspec',
27
- 'lib/**/*.rb',
28
- 'spec/**/*.*',
29
- 'benchmark/**/*.rb',
30
26
  'Gemfile',
31
27
  'Rakefile',
32
28
  '.rvmrc',
33
- '.gitignore'
34
- ]
35
- s.test_files = FileList['spec/**/*.*']
29
+ '.gitignore',
30
+ '.travis.yml'
31
+ ] +
32
+ Dir.glob('lib/**/*.rb') +
33
+ Dir.glob('spec/**/*.*') +
34
+ Dir.glob('benchmark/**/*.rb')
35
+ s.test_files = Dir.glob('spec/**/*.*')
36
36
  s.executables = []
37
37
  s.require_paths = ['lib']
38
38
 
39
- s.add_runtime_dependency 'nokogiri'
39
+ s.add_runtime_dependency 'nokogiri', '>= 1.4.0'
40
40
 
41
41
  s.add_development_dependency 'rake'
42
42
  s.add_development_dependency 'rspec'
43
- s.add_development_dependency 'simplecov'
44
- s.add_development_dependency 'ipsum'
45
43
  end
@@ -32,6 +32,43 @@ describe Saxerator do
32
32
  end
33
33
  end
34
34
 
35
+ context "with a string with an element at multiple depths" do
36
+ let(:xml) do
37
+ <<-eos
38
+ <books>
39
+ <book>
40
+ <name>How to eat an airplane</name>
41
+ <author>
42
+ <name>Leviticus Alabaster</name>
43
+ <name>Eunice Diesel</name>
44
+ </author>
45
+ </book>
46
+ <book>
47
+ <name>To wallop a horse in the face</name>
48
+ <author>
49
+ <name>Jeanne Clarewood</name>
50
+ </author>
51
+ </book>
52
+ </books>
53
+ eos
54
+ end
55
+
56
+ it "should only parse the requested tag depth" do
57
+ results = []
58
+ subject.at_depth(3).each { |x| results << x }
59
+ results.should == [
60
+ 'How to eat an airplane', {'name' => ['Leviticus Alabaster', 'Eunice Diesel']},
61
+ 'To wallop a horse in the face', {'name' => 'Jeanne Clarewood'}
62
+ ]
63
+ end
64
+
65
+ it "should only parse the requested tag depth and tag" do
66
+ results = []
67
+ subject.at_depth(3).for_tag(:name).each { |x| results << x }
68
+ results.should == ['How to eat an airplane', 'To wallop a horse in the face']
69
+ end
70
+ end
71
+
35
72
  context "with a file with blurbs" do
36
73
  let(:xml) { fixture_file('flat_blurbs.xml') }
37
74
 
@@ -47,7 +84,6 @@ describe Saxerator do
47
84
  subject.for_tag(:blurb).first.should == 'one'
48
85
  subject.for_tag(:blurb).first.should == 'one'
49
86
  end
50
-
51
87
  end
52
88
 
53
89
  context "with a file with nested elements" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: saxerator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-12 00:00:00.000000000 Z
12
+ date: 2012-04-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: 1.4.0
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
- version: '0'
29
+ version: 1.4.0
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: rake
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -59,38 +59,6 @@ dependencies:
59
59
  - - ! '>='
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
- - !ruby/object:Gem::Dependency
63
- name: simplecov
64
- requirement: !ruby/object:Gem::Requirement
65
- none: false
66
- requirements:
67
- - - ! '>='
68
- - !ruby/object:Gem::Version
69
- version: '0'
70
- type: :development
71
- prerelease: false
72
- version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
- requirements:
75
- - - ! '>='
76
- - !ruby/object:Gem::Version
77
- version: '0'
78
- - !ruby/object:Gem::Dependency
79
- name: ipsum
80
- requirement: !ruby/object:Gem::Requirement
81
- none: false
82
- requirements:
83
- - - ! '>='
84
- - !ruby/object:Gem::Version
85
- version: '0'
86
- type: :development
87
- prerelease: false
88
- version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
- requirements:
91
- - - ! '>='
92
- - !ruby/object:Gem::Version
93
- version: '0'
94
62
  description: ! " Saxerator is a SAX-based xml-to-hash parser designed for parsing
95
63
  very large files into manageable chunks. Rather than\n dealing directly with
96
64
  SAX callback methods, Saxerator gives you Enumerable access to chunks of an xml
@@ -105,9 +73,18 @@ files:
105
73
  - LICENSE
106
74
  - README.md
107
75
  - saxerator.gemspec
108
- - lib/saxerator/configuration.rb
76
+ - Gemfile
77
+ - Rakefile
78
+ - .rvmrc
79
+ - .gitignore
80
+ - .travis.yml
81
+ - lib/saxerator/document.rb
109
82
  - lib/saxerator/hash_with_attributes.rb
110
- - lib/saxerator/parser/nokogiri.rb
83
+ - lib/saxerator/parser/accumulator.rb
84
+ - lib/saxerator/parser/depth_latch.rb
85
+ - lib/saxerator/parser/document_latch.rb
86
+ - lib/saxerator/parser/element_name_latch.rb
87
+ - lib/saxerator/parser/latched_accumulator.rb
111
88
  - lib/saxerator/string_with_attributes.rb
112
89
  - lib/saxerator/version.rb
113
90
  - lib/saxerator/xml_node.rb
@@ -118,10 +95,6 @@ files:
118
95
  - spec/spec_helper.rb
119
96
  - benchmark/benchmark.rb
120
97
  - benchmark/generate_sample_file.rb
121
- - Gemfile
122
- - Rakefile
123
- - .rvmrc
124
- - .gitignore
125
98
  homepage: https://github.com/soulcutter/saxerator
126
99
  licenses:
127
100
  - MIT
@@ -1,13 +0,0 @@
1
- module Saxerator
2
- class Configuration
3
- attr_reader :source
4
-
5
- def initialize(source)
6
- @source = source
7
- end
8
-
9
- def for_tag(tag)
10
- Saxerator::Parser::Nokogiri.new(self, source, tag.to_s)
11
- end
12
- end
13
- end
@@ -1,58 +0,0 @@
1
- require 'nokogiri'
2
-
3
- module Saxerator
4
- module Parser
5
- class Nokogiri
6
- include Enumerable
7
-
8
- def initialize(config, source, tag)
9
- @config = config
10
- @source = source
11
- @tag = tag
12
- end
13
-
14
- def each(&block)
15
- document = Document.new(@config, @tag, block)
16
- parser = ::Nokogiri::XML::SAX::Parser.new document
17
-
18
- # Always have to start at the beginning of a File
19
- @source.rewind if(@source.is_a?(File))
20
-
21
- parser.parse(@source)
22
- end
23
-
24
- class Document < ::Nokogiri::XML::SAX::Document
25
- attr_accessor :stack
26
-
27
- def initialize(config, tag, block)
28
- @config = config
29
- @tag = tag
30
- @stack = []
31
- @block = block
32
- end
33
-
34
- def start_element(name, attrs = [])
35
- if stack.size > 0 || name == @tag
36
- stack.push XmlNode.new(@config, name, Hash[*attrs.flatten])
37
- end
38
- end
39
-
40
- def end_element(name)
41
- if stack.size > 1
42
- last = stack.pop
43
- stack.last.add_node last
44
- elsif stack.size == 1
45
- @block.yield(stack.pop.to_hash)
46
- end
47
- end
48
-
49
- def characters(string)
50
- stack.last.add_node(string) unless string.strip.length == 0 || stack.empty?
51
- end
52
-
53
- alias cdata_block characters
54
-
55
- end
56
- end
57
- end
58
- end