data_tools 0.6.0 → 0.6.4

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -3,3 +3,4 @@
3
3
  pkg
4
4
  todo.txt
5
5
  .DS_Store
6
+ tmp
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- data_tools (0.6.0)
4
+ data_tools (0.6.2)
5
5
  awesome_print
6
6
  facets
7
7
 
data/lib/data_tools.rb CHANGED
@@ -8,7 +8,7 @@ module DataTools
8
8
  $".grep(/data_tools/).each {|f| load(f)}
9
9
  end
10
10
 
11
- def DataTools.scour(s, opts)
11
+ def DataTools.scour(s, opts = {})
12
12
  case s
13
13
  when nil
14
14
  nil
@@ -18,7 +18,7 @@ module DataTools
18
18
  # looks numeric
19
19
  s2 = s2.to_i.to_s
20
20
  end
21
- (s2.empty? || opts[:junkwords].include?(s2)) ? nil : s2
21
+ (s2.empty? || (opts[:junkwords]||[]).include?(s2)) ? nil : s2
22
22
  when Numeric
23
23
  s.to_s
24
24
  else
@@ -37,6 +37,4 @@ module DataTools::Array
37
37
  File.unlink(filename) if File.exists?(filename)
38
38
  File.open(filename, "w") {|f| f << Marshal.dump(self)}
39
39
  end
40
-
41
-
42
40
  end
@@ -123,18 +123,18 @@ module DataTools::ArrayOfHashes
123
123
  File.open(filename, "w") {|f| f << Marshal.dump(self)}
124
124
  end
125
125
 
126
- # attempt to dump out contents of this array-of-hashes as CSV to named file
127
- # fields is list of attribute names to write out
128
- # options headers is public names for the fields
129
- def csvme(filename, fields, headers = fields)
130
- CSV.open(filename, "wb") do |csv|
131
- csv << headers unless headers.nil?
132
- pluck(fields).each do |ary|
133
- csv << ary
134
- end
135
- end
136
- true
137
- end
126
+ # # attempt to dump out contents of this array-of-hashes as CSV to named file
127
+ # # fields is list of attribute names to write out
128
+ # # options headers is public names for the fields
129
+ # def csvme(filename, fields, headers = fields)
130
+ # CSV.open(filename, "wb") do |csv|
131
+ # csv << headers unless headers.nil?
132
+ # pluck(fields).each do |ary|
133
+ # csv << ary
134
+ # end
135
+ # end
136
+ # true
137
+ # end
138
138
 
139
139
  def tsvme(filename, fields, headers = fields)
140
140
  File.open(target) do |output|
@@ -4,7 +4,26 @@ module DataTools::Enumerator
4
4
  each do |hash|
5
5
  outputstream.puts hash.pluck(fields).to_csv
6
6
  end
7
+ outputstream.flush # otherwise missing rows might not get pushed out
7
8
  outputstream
9
+ rescue Errno::EPIPE
10
+ # output was closed, that's fine
11
+ end
12
+
13
+ def lazy_select(&block)
14
+ Enumerator.new do |yielder|
15
+ self.each do |val|
16
+ yielder.yield(val) if block.call(val)
17
+ end
18
+ end
19
+ end
20
+
21
+ def lazy_map(&block)
22
+ Enumerator.new do |yielder|
23
+ self.each do |value|
24
+ yielder.yield(block.call(value))
25
+ end
26
+ end
8
27
  end
9
28
  end
10
29
 
@@ -180,44 +180,23 @@ module DataTools::Hash
180
180
  end
181
181
  end
182
182
 
183
- # # HASH OF ARRAYS
184
- # def coalesce!(args)
185
- # rules = args[:per]
186
- # rules.each do |from, to|
187
- # if self[to].nil?
188
- # raise "cannot merge #{from} into #{to}, destination does not exist"
189
- # end
190
- # if self[from].nil?
191
- # $stderr.puts "cannot merge #{from} into #{to}, source does not exist, ignoring"
192
- # next
193
- # end
194
- # self[to] += self[from]
195
- # self.delete(from)
196
- # end
197
- # self
198
- # end
199
-
200
183
  def cleanse(options = {})
201
184
  each_with_object({}) do |(k,v), out|
202
185
  out[k] = DataTools.scour(v, options)
203
186
  if dateformat = options[:datefields][k]
204
187
  begin
205
- out[k] = DateTime.strptime(v, dateformat).to_time
206
- rescue
207
- warn "invalid #{k} (expected #{dateformat}): #{rec}"
188
+ out[k] = v && DateTime.strptime(v, dateformat).to_date
189
+ rescue ArgumentError
190
+ warn "expected '#{dateformat}' in #{k} = '#{v}' at [#{options[:line]}]: #{self}"
191
+ out[k] = nil
208
192
  end
209
193
  end
210
194
  end
211
195
  end
212
196
 
213
- def subset(keys)
214
- map do |h|
215
- h.select {|k,v| keys.include? k}
216
- end
217
- end
218
-
219
- def pluck(*keys)
220
- keys.flatten.map {|k| self[k]}
197
+ def pluck(keys)
198
+ keys.map {|k| self[k]}
199
+ # keys.flatten.map {|k| self[k]}
221
200
  end
222
201
  end
223
202
 
data/lib/data_tools/io.rb CHANGED
@@ -1,27 +1,32 @@
1
1
  require "csv"
2
2
 
3
3
  module DataTools::IO
4
+ attr_reader :headers, :import_options
5
+
4
6
  def unmarshal
5
7
  Marshal.load(self)
6
8
  end
7
9
 
8
- def headers
9
- @import_headers ||= @import_options[:headers] || behead
10
- end
11
-
12
10
  def split(line)
13
- case import_options[:format]
14
- when :tsv
11
+ fields = case import_options[:format]
12
+ when :tsv # tab-delimited
15
13
  line.split("\t")
16
- when :qcq
14
+ when :wsv # whitespace-delimited
15
+ line.split
16
+ when :qcq # quote-comma-quote (*not* the same as CSV)
17
17
  line.split('","')
18
- else # default is CSV
18
+ else # default is :csv
19
19
  line.parse_csv
20
20
  end
21
+
22
+ fields.map {|f| DataTools.scour(f)}
21
23
  end
22
24
 
23
25
  def parseline(line)
24
- split(line.chomp)
26
+ @linenumber += 1
27
+ # remove leading and trailing line endings (CR or LF)
28
+ # but NOT whitespace, because e.g. there could be leading or trailing blank fields delimited by tabs
29
+ split(line.gsub(/^[\n\r]*|[\n\r]*$/, ''))
25
30
  end
26
31
 
27
32
  def import_options
@@ -35,15 +40,20 @@ module DataTools::IO
35
40
  import_options.merge!(options)
36
41
  end
37
42
 
43
+ def line_to_record(line)
44
+ Hash[headers.zip(parseline(line)).select {|k,v| !v.nil?}]
45
+ end
46
+
38
47
  def import(opts = {}) # expects a block
39
48
  configure_import(opts)
40
- headers = opts[:headers] || parseline(readline)
41
- # warn "HEADERS ARE #{headers}"
49
+ @linenumber = 0
50
+ @headers = opts[:headers] || parseline(readline(opts[:rowsep] || $/))
42
51
  Enumerator.new do |yielder|
43
- self.each do |line|
44
- rec = Hash[headers.zip(parseline(line))]
52
+ self.each(opts[:rowsep] || $/) do |line|
53
+ rec = line_to_record(line)
54
+ next if rec.empty? # silently ignore blank records
45
55
  rec.extend DataTools::Hash
46
- yielder.yield rec.cleanse(import_options)
56
+ yielder.yield rec.cleanse(import_options.merge(:line => @linenumber))
47
57
  end
48
58
  # need to emit anything to trigger a file-completed action? (such as pushing a batch to storage)
49
59
  end
@@ -53,3 +63,5 @@ end
53
63
  class IO
54
64
  include DataTools::IO
55
65
  end
66
+
67
+ ARGF.extend DataTools::IO
@@ -1,3 +1,3 @@
1
1
  module DataTools
2
- VERSION = "0.6.0"
2
+ VERSION = "0.6.4"
3
3
  end
data/spec/import_spec.rb CHANGED
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
3
3
  require "json"
4
4
 
5
5
  describe "File Import" do
6
- it "imports" do
7
- f = File.open(File.dirname(__FILE__) + "/../hrhead.csv")
6
+ it "imports CSV" do
7
+ f = File.open(File.dirname(__FILE__) + "/../tmp/hrhead.csv")
8
8
  # sio = f.import.csvme(StringIO.new, ['Person Phone GUID', 'Person Address GUID'])
9
9
  # puts sio.string
10
10
 
@@ -12,4 +12,11 @@ describe "File Import" do
12
12
  puts slice.extend(DataTools::ArrayOfHashes).pluck('Person Phone GUID', 'Person Address GUID').to_json
13
13
  end
14
14
  end
15
+
16
+ it "import WSV" do
17
+ f = File.open(File.dirname(__FILE__) + "/../tmp/visits.txt")
18
+ recs = f.import(:format => :wsv, :datefields => {'admit_arrive_date' => '%Y-%m-%d'}).to_a
19
+ recs.count.should == 99
20
+ puts recs.sample
21
+ end
15
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-02-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: awesome_print
@@ -118,3 +118,4 @@ signing_key:
118
118
  specification_version: 3
119
119
  summary: Miscellaneous data-munging utilities.
120
120
  test_files: []
121
+ has_rdoc: