data_tools 0.6.0 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -3,3 +3,4 @@
3
3
  pkg
4
4
  todo.txt
5
5
  .DS_Store
6
+ tmp
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- data_tools (0.6.0)
4
+ data_tools (0.6.2)
5
5
  awesome_print
6
6
  facets
7
7
 
data/lib/data_tools.rb CHANGED
@@ -8,7 +8,7 @@ module DataTools
8
8
  $".grep(/data_tools/).each {|f| load(f)}
9
9
  end
10
10
 
11
- def DataTools.scour(s, opts)
11
+ def DataTools.scour(s, opts = {})
12
12
  case s
13
13
  when nil
14
14
  nil
@@ -18,7 +18,7 @@ module DataTools
18
18
  # looks numeric
19
19
  s2 = s2.to_i.to_s
20
20
  end
21
- (s2.empty? || opts[:junkwords].include?(s2)) ? nil : s2
21
+ (s2.empty? || (opts[:junkwords]||[]).include?(s2)) ? nil : s2
22
22
  when Numeric
23
23
  s.to_s
24
24
  else
@@ -37,6 +37,4 @@ module DataTools::Array
37
37
  File.unlink(filename) if File.exists?(filename)
38
38
  File.open(filename, "w") {|f| f << Marshal.dump(self)}
39
39
  end
40
-
41
-
42
40
  end
@@ -123,18 +123,18 @@ module DataTools::ArrayOfHashes
123
123
  File.open(filename, "w") {|f| f << Marshal.dump(self)}
124
124
  end
125
125
 
126
- # attempt to dump out contents of this array-of-hashes as CSV to named file
127
- # fields is list of attribute names to write out
128
- # options headers is public names for the fields
129
- def csvme(filename, fields, headers = fields)
130
- CSV.open(filename, "wb") do |csv|
131
- csv << headers unless headers.nil?
132
- pluck(fields).each do |ary|
133
- csv << ary
134
- end
135
- end
136
- true
137
- end
126
+ # # attempt to dump out contents of this array-of-hashes as CSV to named file
127
+ # # fields is list of attribute names to write out
128
+ # # options headers is public names for the fields
129
+ # def csvme(filename, fields, headers = fields)
130
+ # CSV.open(filename, "wb") do |csv|
131
+ # csv << headers unless headers.nil?
132
+ # pluck(fields).each do |ary|
133
+ # csv << ary
134
+ # end
135
+ # end
136
+ # true
137
+ # end
138
138
 
139
139
  def tsvme(filename, fields, headers = fields)
140
140
  File.open(target) do |output|
@@ -4,7 +4,26 @@ module DataTools::Enumerator
4
4
  each do |hash|
5
5
  outputstream.puts hash.pluck(fields).to_csv
6
6
  end
7
+ outputstream.flush # otherwise missing rows might not get pushed out
7
8
  outputstream
9
+ rescue Errno::EPIPE
10
+ # output was closed, that's fine
11
+ end
12
+
13
+ def lazy_select(&block)
14
+ Enumerator.new do |yielder|
15
+ self.each do |val|
16
+ yielder.yield(val) if block.call(val)
17
+ end
18
+ end
19
+ end
20
+
21
+ def lazy_map(&block)
22
+ Enumerator.new do |yielder|
23
+ self.each do |value|
24
+ yielder.yield(block.call(value))
25
+ end
26
+ end
8
27
  end
9
28
  end
10
29
 
@@ -180,44 +180,23 @@ module DataTools::Hash
180
180
  end
181
181
  end
182
182
 
183
- # # HASH OF ARRAYS
184
- # def coalesce!(args)
185
- # rules = args[:per]
186
- # rules.each do |from, to|
187
- # if self[to].nil?
188
- # raise "cannot merge #{from} into #{to}, destination does not exist"
189
- # end
190
- # if self[from].nil?
191
- # $stderr.puts "cannot merge #{from} into #{to}, source does not exist, ignoring"
192
- # next
193
- # end
194
- # self[to] += self[from]
195
- # self.delete(from)
196
- # end
197
- # self
198
- # end
199
-
200
183
  def cleanse(options = {})
201
184
  each_with_object({}) do |(k,v), out|
202
185
  out[k] = DataTools.scour(v, options)
203
186
  if dateformat = options[:datefields][k]
204
187
  begin
205
- out[k] = DateTime.strptime(v, dateformat).to_time
206
- rescue
207
- warn "invalid #{k} (expected #{dateformat}): #{rec}"
188
+ out[k] = v && DateTime.strptime(v, dateformat).to_date
189
+ rescue ArgumentError
190
+ warn "expected '#{dateformat}' in #{k} = '#{v}' at [#{options[:line]}]: #{self}"
191
+ out[k] = nil
208
192
  end
209
193
  end
210
194
  end
211
195
  end
212
196
 
213
- def subset(keys)
214
- map do |h|
215
- h.select {|k,v| keys.include? k}
216
- end
217
- end
218
-
219
- def pluck(*keys)
220
- keys.flatten.map {|k| self[k]}
197
+ def pluck(keys)
198
+ keys.map {|k| self[k]}
199
+ # keys.flatten.map {|k| self[k]}
221
200
  end
222
201
  end
223
202
 
data/lib/data_tools/io.rb CHANGED
@@ -1,27 +1,32 @@
1
1
  require "csv"
2
2
 
3
3
  module DataTools::IO
4
+ attr_reader :headers, :import_options
5
+
4
6
  def unmarshal
5
7
  Marshal.load(self)
6
8
  end
7
9
 
8
- def headers
9
- @import_headers ||= @import_options[:headers] || behead
10
- end
11
-
12
10
  def split(line)
13
- case import_options[:format]
14
- when :tsv
11
+ fields = case import_options[:format]
12
+ when :tsv # tab-delimited
15
13
  line.split("\t")
16
- when :qcq
14
+ when :wsv # whitespace-delimited
15
+ line.split
16
+ when :qcq # quote-comma-quote (*not* the same as CSV)
17
17
  line.split('","')
18
- else # default is CSV
18
+ else # default is :csv
19
19
  line.parse_csv
20
20
  end
21
+
22
+ fields.map {|f| DataTools.scour(f)}
21
23
  end
22
24
 
23
25
  def parseline(line)
24
- split(line.chomp)
26
+ @linenumber += 1
27
+ # remove leading and trailing line endings (CR or LF)
28
+ # but NOT whitespace, because e.g. there could be leading or trailing blank fields delimited by tabs
29
+ split(line.gsub(/^[\n\r]*|[\n\r]*$/, ''))
25
30
  end
26
31
 
27
32
  def import_options
@@ -35,15 +40,20 @@ module DataTools::IO
35
40
  import_options.merge!(options)
36
41
  end
37
42
 
43
+ def line_to_record(line)
44
+ Hash[headers.zip(parseline(line)).select {|k,v| !v.nil?}]
45
+ end
46
+
38
47
  def import(opts = {}) # expects a block
39
48
  configure_import(opts)
40
- headers = opts[:headers] || parseline(readline)
41
- # warn "HEADERS ARE #{headers}"
49
+ @linenumber = 0
50
+ @headers = opts[:headers] || parseline(readline(opts[:rowsep] || $/))
42
51
  Enumerator.new do |yielder|
43
- self.each do |line|
44
- rec = Hash[headers.zip(parseline(line))]
52
+ self.each(opts[:rowsep] || $/) do |line|
53
+ rec = line_to_record(line)
54
+ next if rec.empty? # silently ignore blank records
45
55
  rec.extend DataTools::Hash
46
- yielder.yield rec.cleanse(import_options)
56
+ yielder.yield rec.cleanse(import_options.merge(:line => @linenumber))
47
57
  end
48
58
  # need to emit anything to trigger a file-completed action? (such as pushing a batch to storage)
49
59
  end
@@ -53,3 +63,5 @@ end
53
63
  class IO
54
64
  include DataTools::IO
55
65
  end
66
+
67
+ ARGF.extend DataTools::IO
@@ -1,3 +1,3 @@
1
1
  module DataTools
2
- VERSION = "0.6.0"
2
+ VERSION = "0.6.4"
3
3
  end
data/spec/import_spec.rb CHANGED
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
3
3
  require "json"
4
4
 
5
5
  describe "File Import" do
6
- it "imports" do
7
- f = File.open(File.dirname(__FILE__) + "/../hrhead.csv")
6
+ it "imports CSV" do
7
+ f = File.open(File.dirname(__FILE__) + "/../tmp/hrhead.csv")
8
8
  # sio = f.import.csvme(StringIO.new, ['Person Phone GUID', 'Person Address GUID'])
9
9
  # puts sio.string
10
10
 
@@ -12,4 +12,11 @@ describe "File Import" do
12
12
  puts slice.extend(DataTools::ArrayOfHashes).pluck('Person Phone GUID', 'Person Address GUID').to_json
13
13
  end
14
14
  end
15
+
16
+ it "import WSV" do
17
+ f = File.open(File.dirname(__FILE__) + "/../tmp/visits.txt")
18
+ recs = f.import(:format => :wsv, :datefields => {'admit_arrive_date' => '%Y-%m-%d'}).to_a
19
+ recs.count.should == 99
20
+ puts recs.sample
21
+ end
15
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-02-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: awesome_print
@@ -118,3 +118,4 @@ signing_key:
118
118
  specification_version: 3
119
119
  summary: Miscellaneous data-munging utilities.
120
120
  test_files: []
121
+ has_rdoc: