saxony 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES.txt +9 -0
- data/README.md +15 -2
- data/lib/saxony.rb +40 -26
- data/saxony.gemspec +1 -1
- metadata +2 -2
data/CHANGES.txt
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
SAXONY, CHANGES
|
2
2
|
|
3
|
+
#### 0.3.1 (2010-02-03) ####
|
4
|
+
|
5
|
+
* FIXED: Oops! Bad paste :[
|
6
|
+
|
7
|
+
#### 0.3.0 (2010-02-03) ####
|
8
|
+
|
9
|
+
* CHANGE: Cleaner Saxony.fork syntax
|
10
|
+
* ADDED: Built-in support for one output file per input file.
|
11
|
+
|
3
12
|
#### 0.2.0 (2010-02-02) ####
|
4
13
|
|
5
14
|
* ADDED: Saxony.fork
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
## Saxony - 0.2 ##
|
1
|
+
## Saxony - 0.3 ##
|
2
2
|
|
3
3
|
**Parse gigantic XML files with pleasure and without running out of memory.**
|
4
4
|
|
@@ -11,8 +11,21 @@
|
|
11
11
|
total_count # => Total number of SomeObjects processed
|
12
12
|
elapsed_time # => time processing current batch
|
13
13
|
path # => Current file being processed
|
14
|
+
fh # => Output file handle
|
14
15
|
end
|
15
|
-
|
16
|
+
|
17
|
+
# Process multiple files in parallel using Kernel.fork.
|
18
|
+
# By default, 2 processes are forked and objects are processed in batches of 1000.
|
19
|
+
Saxony.fork ['path/2/huge.xml', 'path/2/huger.xml'] do
|
20
|
+
# Inside the block, everything is the
|
21
|
+
# same as calling sax.parse above.
|
22
|
+
doc.xpath('//Listing').each do |listing|
|
23
|
+
type = listing.xpath("Type").first.text
|
24
|
+
fh.puts listing if type == 'some_criteria'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
|
16
29
|
## Credits
|
17
30
|
|
18
31
|
* Delano Mandelbaum (http://solutious.com)
|
data/lib/saxony.rb
CHANGED
@@ -3,19 +3,26 @@ require 'stringio'
|
|
3
3
|
|
4
4
|
|
5
5
|
class Saxony
|
6
|
-
VERSION = "0.2.0".freeze unless defined?(Saxony::VERSION)
|
6
|
+
VERSION = "0.3.0".freeze unless defined?(Saxony::VERSION)
|
7
7
|
|
8
8
|
class Document < Nokogiri::XML::SAX::Document
|
9
9
|
attr_accessor :path
|
10
|
-
attr_reader :total_count, :granularity
|
11
|
-
def initialize(element, granularity, &processor)
|
10
|
+
attr_reader :total_count, :granularity, :suffix
|
11
|
+
def initialize(element, granularity, suffix=nil, idx=nil, &processor)
|
12
12
|
@root_element = nil
|
13
|
+
@suffix = suffix || '-saxony'
|
13
14
|
@start_time = Time.now
|
14
|
-
@element, @processor = element, processor
|
15
|
+
@element, @processor, @idx = element, processor, idx
|
15
16
|
@granularity, @total_count = granularity, 0
|
16
17
|
reset
|
17
18
|
end
|
18
|
-
|
19
|
+
def idx
|
20
|
+
@idx ||= Thread.current.object_id
|
21
|
+
end
|
22
|
+
def fh
|
23
|
+
@path.split
|
24
|
+
@fh ||= File.open([path, @suffix].join('-'), 'w')
|
25
|
+
end
|
19
26
|
def elapsed_time
|
20
27
|
Time.now - @start_time
|
21
28
|
end
|
@@ -25,7 +32,6 @@ class Saxony
|
|
25
32
|
def doc
|
26
33
|
@doc ||= Nokogiri::XML(xml)
|
27
34
|
end
|
28
|
-
|
29
35
|
def start_element(element, attributes)
|
30
36
|
if element == @element.to_s
|
31
37
|
@count += 1 and @total_count += 1
|
@@ -52,10 +58,17 @@ class Saxony
|
|
52
58
|
end
|
53
59
|
def end_document
|
54
60
|
process_objects unless @buffer.pos <= 0
|
61
|
+
fh.puts $/, "</#{@root_element}>"
|
62
|
+
fh.close
|
55
63
|
end
|
56
64
|
|
57
65
|
private
|
58
66
|
def process_objects
|
67
|
+
unless @started
|
68
|
+
puts "#{idx}: #{fh.path}"
|
69
|
+
fh.puts "<#{@root_element}>"
|
70
|
+
@started = true
|
71
|
+
end
|
59
72
|
self.instance_eval &@processor
|
60
73
|
reset
|
61
74
|
end
|
@@ -81,47 +94,48 @@ class Saxony
|
|
81
94
|
end
|
82
95
|
end
|
83
96
|
|
84
|
-
attr_reader :
|
85
|
-
def initialize(
|
86
|
-
@
|
97
|
+
attr_reader :opts, :total_count
|
98
|
+
def initialize(opts={})
|
99
|
+
@opts = opts
|
100
|
+
@total_count = 0
|
87
101
|
end
|
88
102
|
|
89
103
|
# * sources can be a list of file paths, IO objects, or XML strings
|
90
104
|
def parse *sources, &blk
|
91
105
|
sources.flatten!
|
92
|
-
@saxdoc = Saxony::Document.new @element, @granularity, &blk
|
93
106
|
sources.each do |src|
|
94
|
-
|
107
|
+
saxdoc = Saxony::Document.new @opts[:element], @opts[:batch], @opts[:suffix], &blk
|
108
|
+
parser = Nokogiri::XML::SAX::Parser.new(saxdoc)
|
95
109
|
if (String === src && File.exists?(src))
|
96
110
|
xml = File.open(src)
|
97
|
-
|
111
|
+
saxdoc.path = src
|
98
112
|
else
|
99
113
|
xml = src
|
100
|
-
|
114
|
+
saxdoc.path = src.class
|
101
115
|
end
|
102
116
|
parser.parse xml
|
117
|
+
@total_count += saxdoc.total_count
|
103
118
|
end
|
104
119
|
end
|
105
120
|
|
106
|
-
def
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
def Saxony.fork(procs,*paths,&logic)
|
112
|
-
puts
|
121
|
+
def Saxony.fork(paths, opts={}, &logic)
|
122
|
+
opts = {
|
123
|
+
:procs => 2,
|
124
|
+
:batch => 1000
|
125
|
+
}.merge! opts
|
113
126
|
paths.flatten!
|
114
|
-
|
115
|
-
|
116
|
-
|
127
|
+
sax = Saxony.new opts
|
128
|
+
if opts[:procs] > 1
|
129
|
+
path_chunks = paths.chunk(opts[:procs])
|
130
|
+
opts[:procs].times do |idx|
|
117
131
|
proc_paths = path_chunks[idx]
|
118
132
|
pid = Kernel.fork do
|
119
|
-
|
133
|
+
sax.parse *proc_paths, &logic
|
120
134
|
end
|
121
|
-
puts "PID #{pid} (#{idx+1}/#{procs}): #{proc_paths.join(', ')}"
|
135
|
+
puts "PID #{pid} (#{idx+1}/#{opts[:procs]}): #{proc_paths.join(', ')}"
|
122
136
|
end
|
123
137
|
else
|
124
|
-
|
138
|
+
sax.parse *paths, &logic
|
125
139
|
end
|
126
140
|
|
127
141
|
end
|
data/saxony.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
@spec = Gem::Specification.new do |s|
|
2
2
|
s.name = "saxony"
|
3
3
|
s.rubyforge_project = 'bone'
|
4
|
-
s.version = "0.2.0"
|
4
|
+
s.version = "0.3.0"
|
5
5
|
s.summary = "Parse gigantic XML files with pleasure and without running out of memory."
|
6
6
|
s.description = s.summary
|
7
7
|
s.author = "Delano Mandelbaum"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: saxony
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Delano Mandelbaum
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-02-02 00:00:00 -05:00
|
12
|
+
date: 2010-02-03 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|