saxony 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. data/CHANGES.txt +9 -0
  2. data/README.md +15 -2
  3. data/lib/saxony.rb +40 -26
  4. data/saxony.gemspec +1 -1
  5. metadata +2 -2
@@ -1,5 +1,14 @@
1
1
  SAXONY, CHANGES
2
2
 
3
+ #### 0.3.1 (2010-02-03) ####
4
+
5
+ * FIXED: Oops! Bad paste :[
6
+
7
+ #### 0.3.0 (2010-02-03) ####
8
+
9
+ * CHANGE: Cleaner Saxony.fork syntax
10
+ * ADDED: Built-in support for one output file per input file.
11
+
3
12
  #### 0.2.0 (2010-02-02) ####
4
13
 
5
14
  * ADDED: Saxony.fork
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- ## Saxony - 0.2 ##
1
+ ## Saxony - 0.3 ##
2
2
 
3
3
  **Parse gigantic XML files with pleasure and without running out of memory.**
4
4
 
@@ -11,8 +11,21 @@
11
11
  total_count # => Total number of SomeObjects processed
12
12
  elapsed_time # => time processing current batch
13
13
  path # => Current file being processed
14
+ fh # => Output file handle
14
15
  end
15
-
16
+
17
+ # Process multiple files in parallel using Kernel.fork.
18
+ # By default, two processes are used (:procs => 2).
19
+ Saxony.fork ['path/2/huge.xml', 'path/2/huger.xml'] do
20
+ # Inside the block, everything is the
21
+ # same as calling sax.parse above.
22
+ doc.xpath('//Listing').each do |listing|
23
+ type = listing.xpath("Type").first.text
24
+ fh.puts listing if type == 'some_criteria'
25
+ end
26
+ end
27
+
28
+
16
29
  ## Credits
17
30
 
18
31
  * Delano Mandelbaum (http://solutious.com)
@@ -3,19 +3,26 @@ require 'stringio'
3
3
 
4
4
 
5
5
  class Saxony
6
- VERSION = "0.2.0".freeze unless defined?(Saxony::VERSION)
6
+ VERSION = "0.3.0".freeze unless defined?(Saxony::VERSION)
7
7
 
8
8
  class Document < Nokogiri::XML::SAX::Document
9
9
  attr_accessor :path
10
- attr_reader :total_count, :granularity
11
- def initialize(element, granularity, &processor)
10
+ attr_reader :total_count, :granularity, :suffix
11
+ def initialize(element, granularity, suffix=nil, idx=nil, &processor)
12
12
  @root_element = nil
13
+ @suffix = suffix || '-saxony'
13
14
  @start_time = Time.now
14
- @element, @processor = element, processor
15
+ @element, @processor, @idx = element, processor, idx
15
16
  @granularity, @total_count = granularity, 0
16
17
  reset
17
18
  end
18
-
19
+ def idx
20
+ @idx ||= Thread.current.object_id
21
+ end
22
+ def fh
23
+ @path.split
24
+ @fh ||= File.open([path, @suffix].join('-'), 'w')
25
+ end
19
26
  def elapsed_time
20
27
  Time.now - @start_time
21
28
  end
@@ -25,7 +32,6 @@ class Saxony
25
32
  def doc
26
33
  @doc ||= Nokogiri::XML(xml)
27
34
  end
28
-
29
35
  def start_element(element, attributes)
30
36
  if element == @element.to_s
31
37
  @count += 1 and @total_count += 1
@@ -52,10 +58,17 @@ class Saxony
52
58
  end
53
59
  def end_document
54
60
  process_objects unless @buffer.pos <= 0
61
+ fh.puts $/, "</#{@root_element}>"
62
+ fh.close
55
63
  end
56
64
 
57
65
  private
58
66
  def process_objects
67
+ unless @started
68
+ puts "#{idx}: #{fh.path}"
69
+ fh.puts "<#{@root_element}>"
70
+ @started = true
71
+ end
59
72
  self.instance_eval &@processor
60
73
  reset
61
74
  end
@@ -81,47 +94,48 @@ class Saxony
81
94
  end
82
95
  end
83
96
 
84
- attr_reader :granularity, :element
85
- def initialize(element, granularity=1000)
86
- @element, @granularity = element, granularity
97
+ attr_reader :opts, :total_count
98
+ def initialize(opts={})
99
+ @opts = opts
100
+ @total_count = 0
87
101
  end
88
102
 
89
103
  # * sources can be a list of file paths, IO objects, or XML strings
90
104
  def parse *sources, &blk
91
105
  sources.flatten!
92
- @saxdoc = Saxony::Document.new @element, @granularity, &blk
93
106
  sources.each do |src|
94
- parser = Nokogiri::XML::SAX::Parser.new(@saxdoc)
107
+ saxdoc = Saxony::Document.new @opts[:element], @opts[:batch], @opts[:suffix], &blk
108
+ parser = Nokogiri::XML::SAX::Parser.new(saxdoc)
95
109
  if (String === src && File.exists?(src))
96
110
  xml = File.open(src)
97
- @saxdoc.path = src
111
+ saxdoc.path = src
98
112
  else
99
113
  xml = src
100
- @saxdoc.path = src.class
114
+ saxdoc.path = src.class
101
115
  end
102
116
  parser.parse xml
117
+ @total_count += saxdoc.total_count
103
118
  end
104
119
  end
105
120
 
106
- def total_count
107
- @saxdoc.total_count
108
- end
109
-
110
-
111
- def Saxony.fork(procs,*paths,&logic)
112
- puts
121
+ def Saxony.fork(paths, opts={}, &logic)
122
+ opts = {
123
+ :procs => 2,
124
+ :batch => 1000
125
+ }.merge! opts
113
126
  paths.flatten!
114
- if procs > 1
115
- path_chunks = paths.chunk(procs)
116
- procs.times do |idx|
127
+ sax = Saxony.new opts
128
+ if opts[:procs] > 1
129
+ path_chunks = paths.chunk(opts[:procs])
130
+ opts[:procs].times do |idx|
117
131
  proc_paths = path_chunks[idx]
118
132
  pid = Kernel.fork do
119
- logic.call(proc_paths,idx)
133
+ sax.parse *proc_paths, &logic
120
134
  end
121
- puts "PID #{pid} (#{idx+1}/#{procs}): #{proc_paths.join(', ')}"
135
+ puts "PID #{pid} (#{idx+1}/#{opts[:procs]}): #{proc_paths.join(', ')}"
122
136
  end
123
137
  else
124
- logic.call paths, 1
138
+ sax.parse *paths, &logic
125
139
  end
126
140
 
127
141
  end
@@ -1,7 +1,7 @@
1
1
  @spec = Gem::Specification.new do |s|
2
2
  s.name = "saxony"
3
3
  s.rubyforge_project = 'bone'
4
- s.version = "0.2.0"
4
+ s.version = "0.3.0"
5
5
  s.summary = "Parse gigantic XML files with pleasure and a without running out of memory."
6
6
  s.description = s.summary
7
7
  s.author = "Delano Mandelbaum"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: saxony
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Delano Mandelbaum
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-02 00:00:00 -05:00
12
+ date: 2010-02-03 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency