xml_split 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in xml_split.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Seamus Abshere
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # XmlSplit
2
+
3
+ Split XML files on an element, yielding (streaming, so constant memory usage) each node in turn.
4
+
5
+ Uses [sgrep](http://www.cs.helsinki.fi/u/jjaakkol/sgrepman.html) internally, so an `sgrep` (or `sgrep2`) executable must be installed and on your `PATH`.
6
+
7
+ As seen on ["Split XML files with `sgrep`, a classic UNIX utility from 1995"](http://numbers.brighterplanet.com/2012/09/11/split-xml-files-with-unix-utility-sgrep/)
8
+
9
+ ## Usage
10
+
11
+ >> require 'xml_split'
12
+ => true
13
+ >> x = XmlSplit.new('15MinLP_15Days.xml', 'IntervalReading')
14
+ => #<XmlSplit:0x0000010395ce60 @nodes=[], @cache_full=false, @path="/tmp/scratch/15MinLP_15Days.xml", @element="IntervalReading", @caching=false>
15
+ >> x.each { |node| puts node }
16
+ <IntervalReading>
17
+ <cost>907</cost>
18
+ <timePeriod>
19
+ <duration>900</duration>
20
+ <start>1330578000</start>
21
+ <!-- 3/1/2012 5:00:00 AM -->
22
+ </timePeriod>
23
+ <value>302</value>
24
+ </IntervalReading>
25
+ [...]
26
+
27
+ ## Copyright
28
+
29
+ Copyright 2012 Brighter Planet, Inc.
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'cucumber/rake/task'
4
+ Cucumber::Rake::Task.new(:cucumber) do |t|
5
+ t.cucumber_opts = %w{--format pretty}
6
+ end
7
+
8
+ task :default => [:cucumber]
9
+
10
+ require 'yard'
11
+ YARD::Rake::YardocTask.new
data/bin/xml_split ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'xml_split'
@@ -0,0 +1,13 @@
1
+ require 'tempfile'
2
+
3
# Writes the scenario's XML to a temporary file and builds the XmlSplit under
# test (@object) so that it splits that file on the given element.
Given /the element to split by "(.*)" and XML/ do |element, xml|
  # Hold the Tempfile in an instance variable for the rest of the scenario.
  # XmlSplit only remembers the path string, and a Tempfile that becomes
  # unreachable may be garbage-collected, which deletes the file before the
  # Then step gets a chance to read it.
  @xml_tempfile = Tempfile.new(element)
  @xml_tempfile.write xml
  # Flush so the external reader sees the complete document on disk.
  @xml_tempfile.flush
  @object = XmlSplit.new(@xml_tempfile.path, element)
end

# Calls the named method on the XmlSplit built by the Given step and compares
# the result with the expected value written in the feature file.
Then /calling :(.*) must give (.*)/ do |method_id, raw_nodes|
  # NOTE(review): eval executes whatever Ruby expression the feature file
  # contains. Acceptable only because feature files are project-authored;
  # never route untrusted text through this step.
  nodes = eval raw_nodes
  @object.send(method_id).should == nodes
end
@@ -0,0 +1,5 @@
1
+ require 'bundler/setup'
2
+
3
+ require 'rspec/expectations'
4
+
5
+ require 'xml_split'
@@ -0,0 +1,8 @@
1
+ Feature: XmlSplit
2
+
3
+ Scenario:
4
+ Given the element to split by "foo" and XML
5
+ """
6
+ <foo>bar</foo><foo>baz</foo>
7
+ """
8
+ Then calling :to_a must give ["<foo>bar</foo>", "<foo>baz</foo>"]
@@ -0,0 +1,3 @@
1
+ class XmlSplit
2
+ VERSION = "0.0.1"
3
+ end
data/lib/xml_split.rb ADDED
@@ -0,0 +1,64 @@
1
+ require "xml_split/version"
2
+
3
# Splits an XML file on one element name and yields each matching node as a
# String. The file is never parsed in Ruby: an external sgrep process extracts
# the regions and its output is consumed in fixed-size chunks.
#
# @example
#   XmlSplit.new('readings.xml', 'IntervalReading').each { |node| puts node }
class XmlSplit
  class << self
    # Name of the first executable from POSSIBLE_BIN_NAMES that `which` can
    # locate. A successful lookup is memoized for the life of the process.
    #
    # @return [String]
    # @raise [RuntimeError] if none of the candidate names is on the PATH
    def sgrep_bin
      @sgrep_bin ||= POSSIBLE_BIN_NAMES.detect do |bin|
        `which #{bin}`
        $?.success?
      end
      @sgrep_bin || raise("Can't find any of #{POSSIBLE_BIN_NAMES.inspect} in your PATH")
    end
  end

  # Executable names tried, in order, by XmlSplit.sgrep_bin.
  POSSIBLE_BIN_NAMES = %w{ sgrep sgrep2 }.freeze
  # Sentinels placed around every region through sgrep's -o output format so
  # that node boundaries can be recovered from the raw output stream.
  MAGIC_START = 'n8frNy6J'.freeze
  MAGIC_STOP = 'H6py5pxG'.freeze
  # Number of bytes requested from the sgrep pipe per read.
  CHUNK_SIZE = 65536

  include Enumerable

  attr_reader :path
  attr_reader :element
  attr_reader :caching

  # @param path [String] XML file to split; stored as an absolute path
  # @param element [String] name of the element to split on
  # @param options [Hash] +:caching+ (default false) keeps the nodes of the
  #   first complete pass in memory and replays them on later passes
  def initialize(path, element, options = {})
    @nodes = []
    @cache_full = false

    @path = File.expand_path path
    @element = element
    @caching = options.fetch :caching, false
  end

  # Yields every node for the configured element, in file order.
  #
  # With caching enabled, nodes are remembered during the first complete pass
  # and later passes replay them without running sgrep again.
  #
  # @yieldparam node [String] one element, from its opening "<name" through
  #   the ">" of its closing tag
  # @return [XmlSplit, Enumerator] self, or an Enumerator when no block is given
  def each(&blk)
    return enum_for(:each) unless block_given?

    if caching && @cache_full
      @nodes.each(&blk)
      return self
    end

    # Start from an empty cache: an earlier pass may have been abandoned
    # half-way (the caller broke out), leaving partial results behind.
    @nodes = []
    leftover = ''
    region = %{"#{start}" .. "#{stop}"}
    command = [ XmlSplit.sgrep_bin, '-n', '-o', "#{MAGIC_START}%r#{MAGIC_STOP}", region, path ]

    IO.popen(command) do |io|
      while (chunk = io.read(CHUNK_SIZE))
        # A node or a sentinel may straddle two reads, so unconsumed output
        # from the previous chunk is prepended before scanning.
        buffer = leftover + chunk
        pos = 0
        loop do
          head = buffer.index(MAGIC_START, pos)
          break unless head
          body_from = head + MAGIC_START.length
          tail = buffer.index(MAGIC_STOP, body_from)
          break unless tail
          # The region ends at "</name", so the closing ">" is restored here.
          node = buffer[body_from...tail] + '>'
          @nodes << node if caching
          yield node
          pos = tail + MAGIC_STOP.length
        end
        leftover = buffer[pos..-1]
      end
    end

    # Only reached when the stream was consumed to the end.
    @cache_full = true
    self
  end

  private

  # Text that opens a region: the start of the element's opening tag.
  def start
    "<#{element}"
  end

  # Text that closes a region: the closing tag without its final ">".
  def stop
    "</#{element}"
  end

end
data/xml_split.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'xml_split/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "xml_split"
8
+ gem.version = XmlSplit::VERSION
9
+ gem.authors = ["Seamus Abshere"]
10
+ gem.email = ["seamus@abshere.net"]
11
+ gem.description = %q{Split XML files on an element, yielding (streaming, so constant memory usage) each node in turn. Uses sgrep internally.}
12
+ gem.summary = %q{Split XML files on an element, yielding (streaming, so constant memory usage) each node in turn.}
13
+ gem.homepage = "https://github.com/seamusabshere/xml_split"
14
+
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
19
+
20
+ gem.add_development_dependency 'rspec-core'
21
+ gem.add_development_dependency 'rspec-expectations'
22
+ gem.add_development_dependency 'rspec-mocks'
23
+ gem.add_development_dependency 'cucumber'
24
+ gem.add_development_dependency 'yard'
25
+ end
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xml_split
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Seamus Abshere
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-02 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec-core
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec-expectations
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec-mocks
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: cucumber
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: yard
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Split XML files on an element, yielding (streaming, so constant memory
95
+ usage) each node in turn. Uses sgrep internally.
96
+ email:
97
+ - seamus@abshere.net
98
+ executables:
99
+ - xml_split
100
+ extensions: []
101
+ extra_rdoc_files: []
102
+ files:
103
+ - .gitignore
104
+ - Gemfile
105
+ - LICENSE.txt
106
+ - README.md
107
+ - Rakefile
108
+ - bin/xml_split
109
+ - features/step_definitions/xml_split_steps.rb
110
+ - features/support/env.rb
111
+ - features/xml_split.feature
112
+ - lib/xml_split.rb
113
+ - lib/xml_split/version.rb
114
+ - xml_split.gemspec
115
+ homepage: https://github.com/seamusabshere/xml_split
116
+ licenses: []
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ! '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
128
+ none: false
129
+ requirements:
130
+ - - ! '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 1.8.24
136
+ signing_key:
137
+ specification_version: 3
138
+ summary: Split XML files on an element, yielding (streaming, so constant memory usage)
139
+ each node in turn.
140
+ test_files:
141
+ - features/step_definitions/xml_split_steps.rb
142
+ - features/support/env.rb
143
+ - features/xml_split.feature
144
+ has_rdoc: