saxony 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES.txt +9 -0
- data/README.md +15 -2
- data/lib/saxony.rb +40 -26
- data/saxony.gemspec +1 -1
- metadata +2 -2
data/CHANGES.txt
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
SAXONY, CHANGES
|
2
2
|
|
3
|
+
#### 0.3.1 (2010-02-03) ####
|
4
|
+
|
5
|
+
* FIXED: Oops! Bad paste :[
|
6
|
+
|
7
|
+
#### 0.3.0 (2010-02-03) ####
|
8
|
+
|
9
|
+
* CHANGE: Cleaner Saxony.fork syntax
|
10
|
+
* ADDED: Built-in support for one output file per input file.
|
11
|
+
|
3
12
|
#### 0.2.0 (2010-02-02) ####
|
4
13
|
|
5
14
|
* ADDED: Saxony.fork
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
## Saxony - 0.
|
1
|
+
## Saxony - 0.3 ##
|
2
2
|
|
3
3
|
**Parse gigantic XML files with pleasure and without running out of memory.**
|
4
4
|
|
@@ -11,8 +11,21 @@
|
|
11
11
|
total_count # => Total number of SomeObjects processed
|
12
12
|
elapsed_time # => time processing current batch
|
13
13
|
path # => Current file being processed
|
14
|
+
fh # => Output file handle
|
14
15
|
end
|
15
|
-
|
16
|
+
|
17
|
+
# Process multiple files in parallel using Kernel.fork.
|
18
|
+
# By default, the paths are split across 2 forked processes (:procs => 2).
|
19
|
+
Saxony.fork ['path/2/huge.xml', 'path/2/huger.xml'] do
|
20
|
+
# Inside the block, everything is the
|
21
|
+
# same as calling sax.parse above.
|
22
|
+
doc.xpath('//Listing').each do |listing|
|
23
|
+
type = listing.xpath("Type").first.text
|
24
|
+
fh.puts listing if type == 'some_criteria'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
|
16
29
|
## Credits
|
17
30
|
|
18
31
|
* Delano Mandelbaum (http://solutious.com)
|
data/lib/saxony.rb
CHANGED
@@ -3,19 +3,26 @@ require 'stringio'
|
|
3
3
|
|
4
4
|
|
5
5
|
class Saxony
|
6
|
-
VERSION = "0.
|
6
|
+
VERSION = "0.3.0".freeze unless defined?(Saxony::VERSION)
|
7
7
|
|
8
8
|
class Document < Nokogiri::XML::SAX::Document
|
9
9
|
attr_accessor :path
|
10
|
-
attr_reader :total_count, :granularity
|
11
|
-
def initialize(element, granularity, &processor)
|
10
|
+
attr_reader :total_count, :granularity, :suffix
|
11
|
+
def initialize(element, granularity, suffix=nil, idx=nil, &processor)
|
12
12
|
@root_element = nil
|
13
|
+
@suffix = suffix || '-saxony'
|
13
14
|
@start_time = Time.now
|
14
|
-
@element, @processor = element, processor
|
15
|
+
@element, @processor, @idx = element, processor, idx
|
15
16
|
@granularity, @total_count = granularity, 0
|
16
17
|
reset
|
17
18
|
end
|
18
|
-
|
19
|
+
def idx
|
20
|
+
@idx ||= Thread.current.object_id
|
21
|
+
end
|
22
|
+
def fh
|
23
|
+
@path.split
|
24
|
+
@fh ||= File.open([path, @suffix].join('-'), 'w')
|
25
|
+
end
|
19
26
|
def elapsed_time
|
20
27
|
Time.now - @start_time
|
21
28
|
end
|
@@ -25,7 +32,6 @@ class Saxony
|
|
25
32
|
def doc
|
26
33
|
@doc ||= Nokogiri::XML(xml)
|
27
34
|
end
|
28
|
-
|
29
35
|
def start_element(element, attributes)
|
30
36
|
if element == @element.to_s
|
31
37
|
@count += 1 and @total_count += 1
|
@@ -52,10 +58,17 @@ class Saxony
|
|
52
58
|
end
|
53
59
|
def end_document
|
54
60
|
process_objects unless @buffer.pos <= 0
|
61
|
+
fh.puts $/, "</#{@root_element}>"
|
62
|
+
fh.close
|
55
63
|
end
|
56
64
|
|
57
65
|
private
|
58
66
|
def process_objects
|
67
|
+
unless @started
|
68
|
+
puts "#{idx}: #{fh.path}"
|
69
|
+
fh.puts "<#{@root_element}>"
|
70
|
+
@started = true
|
71
|
+
end
|
59
72
|
self.instance_eval &@processor
|
60
73
|
reset
|
61
74
|
end
|
@@ -81,47 +94,48 @@ class Saxony
|
|
81
94
|
end
|
82
95
|
end
|
83
96
|
|
84
|
-
attr_reader :
|
85
|
-
def initialize(
|
86
|
-
@
|
97
|
+
attr_reader :opts, :total_count
|
98
|
+
def initialize(opts={})
|
99
|
+
@opts = opts
|
100
|
+
@total_count = 0
|
87
101
|
end
|
88
102
|
|
89
103
|
# * sources can be a list of file paths, IO objects, or XML strings
|
90
104
|
def parse *sources, &blk
|
91
105
|
sources.flatten!
|
92
|
-
@saxdoc = Saxony::Document.new @element, @granularity, &blk
|
93
106
|
sources.each do |src|
|
94
|
-
|
107
|
+
saxdoc = Saxony::Document.new @opts[:element], @opts[:batch], @opts[:suffix], &blk
|
108
|
+
parser = Nokogiri::XML::SAX::Parser.new(saxdoc)
|
95
109
|
if (String === src && File.exists?(src))
|
96
110
|
xml = File.open(src)
|
97
|
-
|
111
|
+
saxdoc.path = src
|
98
112
|
else
|
99
113
|
xml = src
|
100
|
-
|
114
|
+
saxdoc.path = src.class
|
101
115
|
end
|
102
116
|
parser.parse xml
|
117
|
+
@total_count += saxdoc.total_count
|
103
118
|
end
|
104
119
|
end
|
105
120
|
|
106
|
-
def
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
def Saxony.fork(procs,*paths,&logic)
|
112
|
-
puts
|
121
|
+
def Saxony.fork(paths, opts={}, &logic)
|
122
|
+
opts = {
|
123
|
+
:procs => 2,
|
124
|
+
:batch => 1000
|
125
|
+
}.merge! opts
|
113
126
|
paths.flatten!
|
114
|
-
|
115
|
-
|
116
|
-
|
127
|
+
sax = Saxony.new opts
|
128
|
+
if opts[:procs] > 1
|
129
|
+
path_chunks = paths.chunk(opts[:procs])
|
130
|
+
opts[:procs].times do |idx|
|
117
131
|
proc_paths = path_chunks[idx]
|
118
132
|
pid = Kernel.fork do
|
119
|
-
|
133
|
+
sax.parse *proc_paths, &logic
|
120
134
|
end
|
121
|
-
puts "PID #{pid} (#{idx+1}/#{procs}): #{proc_paths.join(', ')}"
|
135
|
+
puts "PID #{pid} (#{idx+1}/#{opts[:procs]}): #{proc_paths.join(', ')}"
|
122
136
|
end
|
123
137
|
else
|
124
|
-
|
138
|
+
sax.parse *paths, &logic
|
125
139
|
end
|
126
140
|
|
127
141
|
end
|
data/saxony.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
@spec = Gem::Specification.new do |s|
|
2
2
|
s.name = "saxony"
|
3
3
|
s.rubyforge_project = 'bone'
|
4
|
-
s.version = "0.
|
4
|
+
s.version = "0.3.0"
|
5
5
|
s.summary = "Parse gigantic XML files with pleasure and a without running out of memory."
|
6
6
|
s.description = s.summary
|
7
7
|
s.author = "Delano Mandelbaum"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: saxony
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Delano Mandelbaum
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-02-
|
12
|
+
date: 2010-02-03 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|