saxony 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. data/CHANGES.txt +9 -0
  2. data/README.md +15 -2
  3. data/lib/saxony.rb +40 -26
  4. data/saxony.gemspec +1 -1
  5. metadata +2 -2
@@ -1,5 +1,14 @@
1
1
  SAXONY, CHANGES
2
2
 
3
+ #### 0.3.1 (2010-02-03) ####
4
+
5
+ * FIXED: Oops! Bad paste :[
6
+
7
+ #### 0.3.0 (2010-02-03) ####
8
+
9
+ * CHANGE: Cleaner Saxony.fork syntax
10
+ * ADDED: Built-in support for one output file per input file.
11
+
3
12
  #### 0.2.0 (2010-02-02) ####
4
13
 
5
14
  * ADDED: Saxony.fork
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- ## Saxony - 0.2 ##
1
+ ## Saxony - 0.3 ##
2
2
 
3
3
  **Parse gigantic XML files with pleasure and without running out of memory.**
4
4
 
@@ -11,8 +11,21 @@
11
11
  total_count # => Total number of SomeObjects processed
12
12
  elapsed_time # => time processing current batch
13
13
  path # => Current file being processed
14
+ fh # => Output file handle
14
15
  end
15
-
16
+
17
+ # Process multiple files in parallel using Kernel.fork.
18
+ # By default, two processes are used (:procs => 2).
19
+ Saxony.fork ['path/2/huge.xml', 'path/2/huger.xml'] do
20
+ # Inside the block, everything is the
21
+ # same as calling sax.parse above.
22
+ doc.xpath('//Listing').each do |listing|
23
+ type = listing.xpath("Type").first.text
24
+ fh.puts listing if type == 'some_criteria'
25
+ end
26
+ end
27
+
28
+
16
29
  ## Credits
17
30
 
18
31
  * Delano Mandelbaum (http://solutious.com)
@@ -3,19 +3,26 @@ require 'stringio'
3
3
 
4
4
 
5
5
  class Saxony
6
- VERSION = "0.2.0".freeze unless defined?(Saxony::VERSION)
6
+ VERSION = "0.3.0".freeze unless defined?(Saxony::VERSION)
7
7
 
8
8
  class Document < Nokogiri::XML::SAX::Document
9
9
  attr_accessor :path
10
- attr_reader :total_count, :granularity
11
- def initialize(element, granularity, &processor)
10
+ attr_reader :total_count, :granularity, :suffix
11
+ def initialize(element, granularity, suffix=nil, idx=nil, &processor)
12
12
  @root_element = nil
13
+ @suffix = suffix || '-saxony'
13
14
  @start_time = Time.now
14
- @element, @processor = element, processor
15
+ @element, @processor, @idx = element, processor, idx
15
16
  @granularity, @total_count = granularity, 0
16
17
  reset
17
18
  end
18
-
19
+ def idx
20
+ @idx ||= Thread.current.object_id
21
+ end
22
+ def fh
23
+ @path.split
24
+ @fh ||= File.open([path, @suffix].join('-'), 'w')
25
+ end
19
26
  def elapsed_time
20
27
  Time.now - @start_time
21
28
  end
@@ -25,7 +32,6 @@ class Saxony
25
32
  def doc
26
33
  @doc ||= Nokogiri::XML(xml)
27
34
  end
28
-
29
35
  def start_element(element, attributes)
30
36
  if element == @element.to_s
31
37
  @count += 1 and @total_count += 1
@@ -52,10 +58,17 @@ class Saxony
52
58
  end
53
59
  def end_document
54
60
  process_objects unless @buffer.pos <= 0
61
+ fh.puts $/, "</#{@root_element}>"
62
+ fh.close
55
63
  end
56
64
 
57
65
  private
58
66
  def process_objects
67
+ unless @started
68
+ puts "#{idx}: #{fh.path}"
69
+ fh.puts "<#{@root_element}>"
70
+ @started = true
71
+ end
59
72
  self.instance_eval &@processor
60
73
  reset
61
74
  end
@@ -81,47 +94,48 @@ class Saxony
81
94
  end
82
95
  end
83
96
 
84
- attr_reader :granularity, :element
85
- def initialize(element, granularity=1000)
86
- @element, @granularity = element, granularity
97
+ attr_reader :opts, :total_count
98
+ def initialize(opts={})
99
+ @opts = opts
100
+ @total_count = 0
87
101
  end
88
102
 
89
103
  # * sources can be a list of file paths, IO objects, or XML strings
90
104
  def parse *sources, &blk
91
105
  sources.flatten!
92
- @saxdoc = Saxony::Document.new @element, @granularity, &blk
93
106
  sources.each do |src|
94
- parser = Nokogiri::XML::SAX::Parser.new(@saxdoc)
107
+ saxdoc = Saxony::Document.new @opts[:element], @opts[:batch], @opts[:suffix], &blk
108
+ parser = Nokogiri::XML::SAX::Parser.new(saxdoc)
95
109
  if (String === src && File.exists?(src))
96
110
  xml = File.open(src)
97
- @saxdoc.path = src
111
+ saxdoc.path = src
98
112
  else
99
113
  xml = src
100
- @saxdoc.path = src.class
114
+ saxdoc.path = src.class
101
115
  end
102
116
  parser.parse xml
117
+ @total_count += saxdoc.total_count
103
118
  end
104
119
  end
105
120
 
106
- def total_count
107
- @saxdoc.total_count
108
- end
109
-
110
-
111
- def Saxony.fork(procs,*paths,&logic)
112
- puts
121
+ def Saxony.fork(paths, opts={}, &logic)
122
+ opts = {
123
+ :procs => 2,
124
+ :batch => 1000
125
+ }.merge! opts
113
126
  paths.flatten!
114
- if procs > 1
115
- path_chunks = paths.chunk(procs)
116
- procs.times do |idx|
127
+ sax = Saxony.new opts
128
+ if opts[:procs] > 1
129
+ path_chunks = paths.chunk(opts[:procs])
130
+ opts[:procs].times do |idx|
117
131
  proc_paths = path_chunks[idx]
118
132
  pid = Kernel.fork do
119
- logic.call(proc_paths,idx)
133
+ sax.parse *proc_paths, &logic
120
134
  end
121
- puts "PID #{pid} (#{idx+1}/#{procs}): #{proc_paths.join(', ')}"
135
+ puts "PID #{pid} (#{idx+1}/#{opts[:procs]}): #{proc_paths.join(', ')}"
122
136
  end
123
137
  else
124
- logic.call paths, 1
138
+ sax.parse *paths, &logic
125
139
  end
126
140
 
127
141
  end
@@ -1,7 +1,7 @@
1
1
  @spec = Gem::Specification.new do |s|
2
2
  s.name = "saxony"
3
3
  s.rubyforge_project = 'bone'
4
- s.version = "0.2.0"
4
+ s.version = "0.3.0"
5
5
  s.summary = "Parse gigantic XML files with pleasure and a without running out of memory."
6
6
  s.description = s.summary
7
7
  s.author = "Delano Mandelbaum"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: saxony
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Delano Mandelbaum
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-02 00:00:00 -05:00
12
+ date: 2010-02-03 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency