data_tools 0.6.0 → 0.6.4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/lib/data_tools.rb +2 -2
- data/lib/data_tools/array.rb +0 -2
- data/lib/data_tools/array_of_hashes.rb +12 -12
- data/lib/data_tools/enumerator.rb +19 -0
- data/lib/data_tools/hash.rb +7 -28
- data/lib/data_tools/io.rb +26 -14
- data/lib/data_tools/version.rb +1 -1
- data/spec/import_spec.rb +9 -2
- metadata +3 -2
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/lib/data_tools.rb
CHANGED
@@ -8,7 +8,7 @@ module DataTools
|
|
8
8
|
$".grep(/data_tools/).each {|f| load(f)}
|
9
9
|
end
|
10
10
|
|
11
|
-
def DataTools.scour(s, opts)
|
11
|
+
def DataTools.scour(s, opts = {})
|
12
12
|
case s
|
13
13
|
when nil
|
14
14
|
nil
|
@@ -18,7 +18,7 @@ module DataTools
|
|
18
18
|
# looks numeric
|
19
19
|
s2 = s2.to_i.to_s
|
20
20
|
end
|
21
|
-
(s2.empty? || opts[:junkwords].include?(s2)) ? nil : s2
|
21
|
+
(s2.empty? || (opts[:junkwords]||[]).include?(s2)) ? nil : s2
|
22
22
|
when Numeric
|
23
23
|
s.to_s
|
24
24
|
else
|
data/lib/data_tools/array.rb
CHANGED
data/lib/data_tools/array_of_hashes.rb
CHANGED
@@ -123,18 +123,18 @@ module DataTools::ArrayOfHashes
|
|
123
123
|
File.open(filename, "w") {|f| f << Marshal.dump(self)}
|
124
124
|
end
|
125
125
|
|
126
|
-
# attempt to dump out contents of this array-of-hashes as CSV to named file
|
127
|
-
# fields is list of attribute names to write out
|
128
|
-
# options headers is public names for the fields
|
129
|
-
def csvme(filename, fields, headers = fields)
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
end
|
126
|
+
# # attempt to dump out contents of this array-of-hashes as CSV to named file
|
127
|
+
# # fields is list of attribute names to write out
|
128
|
+
# # options headers is public names for the fields
|
129
|
+
# def csvme(filename, fields, headers = fields)
|
130
|
+
# CSV.open(filename, "wb") do |csv|
|
131
|
+
# csv << headers unless headers.nil?
|
132
|
+
# pluck(fields).each do |ary|
|
133
|
+
# csv << ary
|
134
|
+
# end
|
135
|
+
# end
|
136
|
+
# true
|
137
|
+
# end
|
138
138
|
|
139
139
|
def tsvme(filename, fields, headers = fields)
|
140
140
|
File.open(target) do |output|
|
data/lib/data_tools/enumerator.rb
CHANGED
@@ -4,7 +4,26 @@ module DataTools::Enumerator
|
|
4
4
|
each do |hash|
|
5
5
|
outputstream.puts hash.pluck(fields).to_csv
|
6
6
|
end
|
7
|
+
outputstream.flush # otherwise missing rows might not get pushed out
|
7
8
|
outputstream
|
9
|
+
rescue Errno::EPIPE
|
10
|
+
# output was closed, that's fine
|
11
|
+
end
|
12
|
+
|
13
|
+
def lazy_select(&block)
|
14
|
+
Enumerator.new do |yielder|
|
15
|
+
self.each do |val|
|
16
|
+
yielder.yield(val) if block.call(val)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def lazy_map(&block)
|
22
|
+
Enumerator.new do |yielder|
|
23
|
+
self.each do |value|
|
24
|
+
yielder.yield(block.call(value))
|
25
|
+
end
|
26
|
+
end
|
8
27
|
end
|
9
28
|
end
|
10
29
|
|
data/lib/data_tools/hash.rb
CHANGED
@@ -180,44 +180,23 @@ module DataTools::Hash
|
|
180
180
|
end
|
181
181
|
end
|
182
182
|
|
183
|
-
# # HASH OF ARRAYS
|
184
|
-
# def coalesce!(args)
|
185
|
-
# rules = args[:per]
|
186
|
-
# rules.each do |from, to|
|
187
|
-
# if self[to].nil?
|
188
|
-
# raise "cannot merge #{from} into #{to}, destination does not exist"
|
189
|
-
# end
|
190
|
-
# if self[from].nil?
|
191
|
-
# $stderr.puts "cannot merge #{from} into #{to}, source does not exist, ignoring"
|
192
|
-
# next
|
193
|
-
# end
|
194
|
-
# self[to] += self[from]
|
195
|
-
# self.delete(from)
|
196
|
-
# end
|
197
|
-
# self
|
198
|
-
# end
|
199
|
-
|
200
183
|
def cleanse(options = {})
|
201
184
|
each_with_object({}) do |(k,v), out|
|
202
185
|
out[k] = DataTools.scour(v, options)
|
203
186
|
if dateformat = options[:datefields][k]
|
204
187
|
begin
|
205
|
-
out[k] = DateTime.strptime(v, dateformat).
|
206
|
-
rescue
|
207
|
-
warn "
|
188
|
+
out[k] = v && DateTime.strptime(v, dateformat).to_date
|
189
|
+
rescue ArgumentError
|
190
|
+
warn "expected '#{dateformat}' in #{k} = '#{v}' at [#{options[:line]}]: #{self}"
|
191
|
+
out[k] = nil
|
208
192
|
end
|
209
193
|
end
|
210
194
|
end
|
211
195
|
end
|
212
196
|
|
213
|
-
def
|
214
|
-
map
|
215
|
-
|
216
|
-
end
|
217
|
-
end
|
218
|
-
|
219
|
-
def pluck(*keys)
|
220
|
-
keys.flatten.map {|k| self[k]}
|
197
|
+
def pluck(keys)
|
198
|
+
keys.map {|k| self[k]}
|
199
|
+
# keys.flatten.map {|k| self[k]}
|
221
200
|
end
|
222
201
|
end
|
223
202
|
|
data/lib/data_tools/io.rb
CHANGED
@@ -1,27 +1,32 @@
|
|
1
1
|
require "csv"
|
2
2
|
|
3
3
|
module DataTools::IO
|
4
|
+
attr_reader :headers, :import_options
|
5
|
+
|
4
6
|
def unmarshal
|
5
7
|
Marshal.load(self)
|
6
8
|
end
|
7
9
|
|
8
|
-
def headers
|
9
|
-
@import_headers ||= @import_options[:headers] || behead
|
10
|
-
end
|
11
|
-
|
12
10
|
def split(line)
|
13
|
-
case import_options[:format]
|
14
|
-
when :tsv
|
11
|
+
fields = case import_options[:format]
|
12
|
+
when :tsv # tab-delimited
|
15
13
|
line.split("\t")
|
16
|
-
when :
|
14
|
+
when :wsv # whitespace-delimited
|
15
|
+
line.split
|
16
|
+
when :qcq # quote-comma-quote (*not* the same as CSV)
|
17
17
|
line.split('","')
|
18
|
-
else # default is
|
18
|
+
else # default is :csv
|
19
19
|
line.parse_csv
|
20
20
|
end
|
21
|
+
|
22
|
+
fields.map {|f| DataTools.scour(f)}
|
21
23
|
end
|
22
24
|
|
23
25
|
def parseline(line)
|
24
|
-
|
26
|
+
@linenumber += 1
|
27
|
+
# remove leading and trailing line endings (CR or LF)
|
28
|
+
# but NOT whitespace, because e.g. there could be leading or trailing blank fields delimited by tabs
|
29
|
+
split(line.gsub(/^[\n\r]*|[\n\r]*$/, ''))
|
25
30
|
end
|
26
31
|
|
27
32
|
def import_options
|
@@ -35,15 +40,20 @@ module DataTools::IO
|
|
35
40
|
import_options.merge!(options)
|
36
41
|
end
|
37
42
|
|
43
|
+
def line_to_record(line)
|
44
|
+
Hash[headers.zip(parseline(line)).select {|k,v| !v.nil?}]
|
45
|
+
end
|
46
|
+
|
38
47
|
def import(opts = {}) # expects a block
|
39
48
|
configure_import(opts)
|
40
|
-
|
41
|
-
|
49
|
+
@linenumber = 0
|
50
|
+
@headers = opts[:headers] || parseline(readline(opts[:rowsep] || $/))
|
42
51
|
Enumerator.new do |yielder|
|
43
|
-
self.each do |line|
|
44
|
-
rec =
|
52
|
+
self.each(opts[:rowsep] || $/) do |line|
|
53
|
+
rec = line_to_record(line)
|
54
|
+
next if rec.empty? # silently ignore blank records
|
45
55
|
rec.extend DataTools::Hash
|
46
|
-
yielder.yield rec.cleanse(import_options)
|
56
|
+
yielder.yield rec.cleanse(import_options.merge(:line => @linenumber))
|
47
57
|
end
|
48
58
|
# need to emit anything to trigger a file-completed action? (such as pushing a batch to storage)
|
49
59
|
end
|
@@ -53,3 +63,5 @@ end
|
|
53
63
|
class IO
|
54
64
|
include DataTools::IO
|
55
65
|
end
|
66
|
+
|
67
|
+
ARGF.extend DataTools::IO
|
data/lib/data_tools/version.rb
CHANGED
data/spec/import_spec.rb
CHANGED
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
3
3
|
require "json"
|
4
4
|
|
5
5
|
describe "File Import" do
|
6
|
-
it "imports" do
|
7
|
-
f = File.open(File.dirname(__FILE__) + "/../hrhead.csv")
|
6
|
+
it "imports CSV" do
|
7
|
+
f = File.open(File.dirname(__FILE__) + "/../tmp/hrhead.csv")
|
8
8
|
# sio = f.import.csvme(StringIO.new, ['Person Phone GUID', 'Person Address GUID'])
|
9
9
|
# puts sio.string
|
10
10
|
|
@@ -12,4 +12,11 @@ describe "File Import" do
|
|
12
12
|
puts slice.extend(DataTools::ArrayOfHashes).pluck('Person Phone GUID', 'Person Address GUID').to_json
|
13
13
|
end
|
14
14
|
end
|
15
|
+
|
16
|
+
it "import WSV" do
|
17
|
+
f = File.open(File.dirname(__FILE__) + "/../tmp/visits.txt")
|
18
|
+
recs = f.import(:format => :wsv, :datefields => {'admit_arrive_date' => '%Y-%m-%d'}).to_a
|
19
|
+
recs.count.should == 99
|
20
|
+
puts recs.sample
|
21
|
+
end
|
15
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.6.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: awesome_print
|
@@ -118,3 +118,4 @@ signing_key:
|
|
118
118
|
specification_version: 3
|
119
119
|
summary: Miscellaneous data-munging utilities.
|
120
120
|
test_files: []
|
121
|
+
has_rdoc:
|