xml_split 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in xml_split.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Seamus Abshere
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # XmlSplit
2
+
3
+ Split XML files on an element, yielding (streaming, so constant memory usage) each node in turn.
4
+
5
+ Uses [sgrep](http://www.cs.helsinki.fi/u/jjaakkol/sgrepman.html) internally, so an `sgrep` (or `sgrep2`) executable must be installed and available on your `PATH`.
6
+
7
+ As seen on ["Split XML files with `sgrep`, a classic UNIX utility from 1995"](http://numbers.brighterplanet.com/2012/09/11/split-xml-files-with-unix-utility-sgrep/)
8
+
9
+ ## Usage
10
+
11
+ >> require 'xml_split'
12
+ => true
13
+ >> x = XmlSplit.new('15MinLP_15Days.xml', 'IntervalReading')
14
+ => #<XmlSplit:0x0000010395ce60 @nodes=[], @cache_full=false, @path="/tmp/scratch/15MinLP_15Days.xml", @element="IntervalReading", @caching=false>
15
+ >> x.each { |node| puts node }
16
+ <IntervalReading>
17
+ <cost>907</cost>
18
+ <timePeriod>
19
+ <duration>900</duration>
20
+ <start>1330578000</start>
21
+ <!-- 3/1/2012 5:00:00 AM -->
22
+ </timePeriod>
23
+ <value>302</value>
24
+ </IntervalReading>
25
+ [...]
26
+
27
+ ## Copyright
28
+
29
+ Copyright 2012 Brighter Planet, Inc.
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
require "bundler/gem_tasks"

# Cucumber feature suite, rendered with the "pretty" formatter.
require 'cucumber/rake/task'
Cucumber::Rake::Task.new(:cucumber) { |cucumber| cucumber.cucumber_opts = ['--format', 'pretty'] }

# A bare `rake` runs the Cucumber features.
task :default => [:cucumber]

# YARD documentation task, created with YARD's own defaults.
require 'yard'
YARD::Rake::YardocTask.new
data/bin/xml_split ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'xml_split'
@@ -0,0 +1,13 @@
1
require 'tempfile'

# Writes the scenario's XML docstring to a temporary file and builds the
# XmlSplit instance under test, splitting on +element+.
Given /the element to split by "(.*)" and XML/ do |element, xml|
  # Keep the Tempfile in an instance variable: a Tempfile's backing file is
  # deleted once the object is garbage collected, and the file is only read
  # later, when the Then step iterates @object. A step-local reference could
  # be collected in between.
  @tempfile = Tempfile.new(element)
  @tempfile.write xml
  # Flush so the external reader sees the full content on disk.
  @tempfile.flush
  @object = XmlSplit.new(@tempfile.path, element)
end

# Calls +method_id+ on the object built by the Given step and compares the
# result with the Ruby literal written in the feature file.
Then /calling :(.*) must give (.*)/ do |method_id, raw_nodes|
  # NOTE(review): eval executes arbitrary Ruby taken from the feature text.
  # That is tolerable only because feature files are trusted, checked-in test
  # fixtures; never feed this step text from an untrusted source.
  nodes = eval raw_nodes
  @object.send(method_id).should == nodes
end
@@ -0,0 +1,5 @@
1
+ require 'bundler/setup'
2
+
3
+ require 'rspec/expectations'
4
+
5
+ require 'xml_split'
@@ -0,0 +1,8 @@
1
+ Feature: XmlSplit
2
+
3
+ Scenario:
4
+ Given the element to split by "foo" and XML
5
+ """
6
+ <foo>bar</foo><foo>baz</foo>
7
+ """
8
+ Then calling :to_a must give ["<foo>bar</foo>", "<foo>baz</foo>"]
@@ -0,0 +1,3 @@
1
class XmlSplit
  # Gem version string, read by the gemspec. Frozen so it cannot be mutated
  # in place at runtime.
  VERSION = "0.0.1".freeze
end
data/lib/xml_split.rb ADDED
@@ -0,0 +1,64 @@
1
+ require "xml_split/version"
2
+
3
# Splits an XML file on one element name, yielding each matching node as a
# String. The file is never loaded whole: an external sgrep process extracts
# the regions and its output is consumed in CHUNK_SIZE pieces.
#
# @example
#   XmlSplit.new('readings.xml', 'IntervalReading').each { |node| puts node }
class XmlSplit
  # Executable names tried, in order, when locating sgrep.
  POSSIBLE_BIN_NAMES = %w{ sgrep sgrep2 }
  # Random markers wrapped around every region in sgrep's output so that
  # individual nodes can be recovered from the output stream.
  MAGIC_START = 'n8frNy6J'
  MAGIC_STOP = 'H6py5pxG'
  # Number of bytes requested from the sgrep pipe per read.
  CHUNK_SIZE = 65536

  class << self
    # Name of the first sgrep executable that `which` can find. The result is
    # memoized on the class.
    #
    # @return [String] one of POSSIBLE_BIN_NAMES
    # @raise [RuntimeError] when none of the candidates is on the PATH
    def sgrep_bin
      @sgrep_bin ||= begin
        found = POSSIBLE_BIN_NAMES.detect { |bin| `which #{bin}`; $?.success? }
        found or raise("Can't find any of #{POSSIBLE_BIN_NAMES.inspect} in your PATH")
      end
    end
  end

  include Enumerable

  attr_reader :path
  attr_reader :element
  attr_reader :caching

  # @param path [String] XML file to split; stored as an absolute path
  # @param element [String] element name to split on
  # @param options [Hash] +:caching+ (default false) keeps the nodes of the
  #   first complete pass in memory and serves later passes from there
  def initialize(path, element, options = {})
    @nodes = []
    @cache_full = false

    @path = File.expand_path path
    @element = element
    @caching = options.fetch :caching, false
  end

  # Yields every matching node, in document order, as a String.
  #
  # With caching enabled, the first pass that runs to completion is stored
  # and later calls replay it without starting sgrep again. Called without a
  # block, returns an Enumerator.
  #
  # @yieldparam node [String] the element's text, from its opening "<" up to
  #   and including the ">" of its closing tag
  # @return [Object] not meaningful; rely on the yielded nodes
  def each(&blk)
    return enum_for(:each) unless blk

    if caching && @cache_full
      @nodes.each(&blk)
    else
      # Collect into a scratch array and publish it only after a complete
      # pass. A consumer that breaks out early (e.g. #first) must not leave a
      # partial cache behind, nor cause duplicates on the next pass.
      collected = caching ? [] : nil
      each_streamed_node do |node|
        collected << node if collected
        yield node
      end
      @nodes = collected if collected
      @cache_full = true
    end
  end

  private

  # Runs sgrep over #path and yields each extracted node as soon as both of
  # its markers have arrived.
  def each_streamed_node
    # NOTE(review): '-o' presumably sets sgrep's output format with %r
    # standing for the region text, and '-n' presumably suppresses the
    # trailing newline; confirm against the sgrep man page.
    command = [ XmlSplit.sgrep_bin, '-n', '-o', "#{MAGIC_START}%r#{MAGIC_STOP}", %{"#{start}" .. "#{stop}"}, path ]
    leftover = ''
    IO.popen(command) do |io|
      while (chunk = io.read(CHUNK_SIZE))
        buffer = leftover + chunk
        loop do
          opens_at = buffer.index(MAGIC_START)
          break unless opens_at
          body_at = opens_at + MAGIC_START.length
          # Only look for the stop marker after the start marker.
          closes_at = buffer.index(MAGIC_STOP, body_at)
          break unless closes_at
          # The region requested from sgrep ends at "</element" (see #stop),
          # so the closing ">" is appended here.
          yield buffer[body_at...closes_at] + '>'
          buffer = buffer[(closes_at + MAGIC_STOP.length)..-1]
        end
        # An incomplete node (or marker) waits for the next chunk.
        leftover = buffer
      end
    end
  end

  # Text that opens a region: the start of the element's opening tag.
  def start
    "<#{element}"
  end

  # Text that closes a region: the element's closing tag without its ">".
  def stop
    "</#{element}"
  end

end
data/xml_split.gemspec ADDED
@@ -0,0 +1,25 @@
1
# -*- encoding: utf-8 -*-
# Put this checkout's lib/ on the load path so the version constant can be
# required before the gem is installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'xml_split/version'

Gem::Specification.new do |gem|
  gem.name          = "xml_split"
  gem.version       = XmlSplit::VERSION
  gem.authors       = ["Seamus Abshere"]
  gem.email         = ["seamus@abshere.net"]
  gem.description   = %q{Split XML files on an element, yielding (streaming, so constant memory usage) each node in turn. Uses sgrep internally.}
  gem.summary       = %q{Split XML files on an element, yielding (streaming, so constant memory usage) each node in turn.}
  gem.homepage      = "https://github.com/seamusabshere/xml_split"
  # Matches the MIT text shipped in LICENSE.txt; without this the built gem
  # advertises no license at all.
  gem.license       = "MIT"

  # Package exactly what git tracks; executables and test files are derived
  # from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rspec-core'
  gem.add_development_dependency 'rspec-expectations'
  gem.add_development_dependency 'rspec-mocks'
  gem.add_development_dependency 'cucumber'
  gem.add_development_dependency 'yard'
end
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xml_split
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Seamus Abshere
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-02 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec-core
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec-expectations
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec-mocks
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: cucumber
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: yard
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Split XML files on an element, yielding (streaming, so constant memory
95
+ usage) each node in turn. Uses sgrep internally.
96
+ email:
97
+ - seamus@abshere.net
98
+ executables:
99
+ - xml_split
100
+ extensions: []
101
+ extra_rdoc_files: []
102
+ files:
103
+ - .gitignore
104
+ - Gemfile
105
+ - LICENSE.txt
106
+ - README.md
107
+ - Rakefile
108
+ - bin/xml_split
109
+ - features/step_definitions/xml_split_steps.rb
110
+ - features/support/env.rb
111
+ - features/xml_split.feature
112
+ - lib/xml_split.rb
113
+ - lib/xml_split/version.rb
114
+ - xml_split.gemspec
115
+ homepage: https://github.com/seamusabshere/xml_split
116
+ licenses: []
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ! '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
128
+ none: false
129
+ requirements:
130
+ - - ! '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ requirements: []
134
+ rubyforge_project:
135
+ rubygems_version: 1.8.24
136
+ signing_key:
137
+ specification_version: 3
138
+ summary: Split XML files on an element, yielding (streaming, so constant memory usage)
139
+ each node in turn.
140
+ test_files:
141
+ - features/step_definitions/xml_split_steps.rb
142
+ - features/support/env.rb
143
+ - features/xml_split.feature
144
+ has_rdoc: