data_tools 0.6.0 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/lib/data_tools.rb +2 -2
- data/lib/data_tools/array.rb +0 -2
- data/lib/data_tools/array_of_hashes.rb +12 -12
- data/lib/data_tools/enumerator.rb +19 -0
- data/lib/data_tools/hash.rb +7 -28
- data/lib/data_tools/io.rb +26 -14
- data/lib/data_tools/version.rb +1 -1
- data/spec/import_spec.rb +9 -2
- metadata +3 -2
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/lib/data_tools.rb
CHANGED
@@ -8,7 +8,7 @@ module DataTools
|
|
8
8
|
$".grep(/data_tools/).each {|f| load(f)}
|
9
9
|
end
|
10
10
|
|
11
|
-
def DataTools.scour(s, opts)
|
11
|
+
def DataTools.scour(s, opts = {})
|
12
12
|
case s
|
13
13
|
when nil
|
14
14
|
nil
|
@@ -18,7 +18,7 @@ module DataTools
|
|
18
18
|
# looks numeric
|
19
19
|
s2 = s2.to_i.to_s
|
20
20
|
end
|
21
|
-
(s2.empty? || opts[:junkwords].include?(s2)) ? nil : s2
|
21
|
+
(s2.empty? || (opts[:junkwords]||[]).include?(s2)) ? nil : s2
|
22
22
|
when Numeric
|
23
23
|
s.to_s
|
24
24
|
else
|
data/lib/data_tools/array.rb
CHANGED
@@ -123,18 +123,18 @@ module DataTools::ArrayOfHashes
|
|
123
123
|
File.open(filename, "w") {|f| f << Marshal.dump(self)}
|
124
124
|
end
|
125
125
|
|
126
|
-
# attempt to dump out contents of this array-of-hashes as CSV to named file
|
127
|
-
# fields is list of attribute names to write out
|
128
|
-
# options headers is public names for the fields
|
129
|
-
def csvme(filename, fields, headers = fields)
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
end
|
126
|
+
# # attempt to dump out contents of this array-of-hashes as CSV to named file
|
127
|
+
# # fields is list of attribute names to write out
|
128
|
+
# # options headers is public names for the fields
|
129
|
+
# def csvme(filename, fields, headers = fields)
|
130
|
+
# CSV.open(filename, "wb") do |csv|
|
131
|
+
# csv << headers unless headers.nil?
|
132
|
+
# pluck(fields).each do |ary|
|
133
|
+
# csv << ary
|
134
|
+
# end
|
135
|
+
# end
|
136
|
+
# true
|
137
|
+
# end
|
138
138
|
|
139
139
|
def tsvme(filename, fields, headers = fields)
|
140
140
|
File.open(target) do |output|
|
@@ -4,7 +4,26 @@ module DataTools::Enumerator
|
|
4
4
|
each do |hash|
|
5
5
|
outputstream.puts hash.pluck(fields).to_csv
|
6
6
|
end
|
7
|
+
outputstream.flush # otherwise missing rows might not get pushed out
|
7
8
|
outputstream
|
9
|
+
rescue Errno::EPIPE
|
10
|
+
# output was closed, that's fine
|
11
|
+
end
|
12
|
+
|
13
|
+
def lazy_select(&block)
|
14
|
+
Enumerator.new do |yielder|
|
15
|
+
self.each do |val|
|
16
|
+
yielder.yield(val) if block.call(val)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def lazy_map(&block)
|
22
|
+
Enumerator.new do |yielder|
|
23
|
+
self.each do |value|
|
24
|
+
yielder.yield(block.call(value))
|
25
|
+
end
|
26
|
+
end
|
8
27
|
end
|
9
28
|
end
|
10
29
|
|
data/lib/data_tools/hash.rb
CHANGED
@@ -180,44 +180,23 @@ module DataTools::Hash
|
|
180
180
|
end
|
181
181
|
end
|
182
182
|
|
183
|
-
# # HASH OF ARRAYS
|
184
|
-
# def coalesce!(args)
|
185
|
-
# rules = args[:per]
|
186
|
-
# rules.each do |from, to|
|
187
|
-
# if self[to].nil?
|
188
|
-
# raise "cannot merge #{from} into #{to}, destination does not exist"
|
189
|
-
# end
|
190
|
-
# if self[from].nil?
|
191
|
-
# $stderr.puts "cannot merge #{from} into #{to}, source does not exist, ignoring"
|
192
|
-
# next
|
193
|
-
# end
|
194
|
-
# self[to] += self[from]
|
195
|
-
# self.delete(from)
|
196
|
-
# end
|
197
|
-
# self
|
198
|
-
# end
|
199
|
-
|
200
183
|
def cleanse(options = {})
|
201
184
|
each_with_object({}) do |(k,v), out|
|
202
185
|
out[k] = DataTools.scour(v, options)
|
203
186
|
if dateformat = options[:datefields][k]
|
204
187
|
begin
|
205
|
-
out[k] = DateTime.strptime(v, dateformat).
|
206
|
-
rescue
|
207
|
-
warn "
|
188
|
+
out[k] = v && DateTime.strptime(v, dateformat).to_date
|
189
|
+
rescue ArgumentError
|
190
|
+
warn "expected '#{dateformat}' in #{k} = '#{v}' at [#{options[:line]}]: #{self}"
|
191
|
+
out[k] = nil
|
208
192
|
end
|
209
193
|
end
|
210
194
|
end
|
211
195
|
end
|
212
196
|
|
213
|
-
def
|
214
|
-
map
|
215
|
-
|
216
|
-
end
|
217
|
-
end
|
218
|
-
|
219
|
-
def pluck(*keys)
|
220
|
-
keys.flatten.map {|k| self[k]}
|
197
|
+
def pluck(keys)
|
198
|
+
keys.map {|k| self[k]}
|
199
|
+
# keys.flatten.map {|k| self[k]}
|
221
200
|
end
|
222
201
|
end
|
223
202
|
|
data/lib/data_tools/io.rb
CHANGED
@@ -1,27 +1,32 @@
|
|
1
1
|
require "csv"
|
2
2
|
|
3
3
|
module DataTools::IO
|
4
|
+
attr_reader :headers, :import_options
|
5
|
+
|
4
6
|
def unmarshal
|
5
7
|
Marshal.load(self)
|
6
8
|
end
|
7
9
|
|
8
|
-
def headers
|
9
|
-
@import_headers ||= @import_options[:headers] || behead
|
10
|
-
end
|
11
|
-
|
12
10
|
def split(line)
|
13
|
-
case import_options[:format]
|
14
|
-
when :tsv
|
11
|
+
fields = case import_options[:format]
|
12
|
+
when :tsv # tab-delimited
|
15
13
|
line.split("\t")
|
16
|
-
when :
|
14
|
+
when :wsv # whitespace-delimited
|
15
|
+
line.split
|
16
|
+
when :qcq # quote-comma-quote (*not* the same as CSV)
|
17
17
|
line.split('","')
|
18
|
-
else # default is
|
18
|
+
else # default is :csv
|
19
19
|
line.parse_csv
|
20
20
|
end
|
21
|
+
|
22
|
+
fields.map {|f| DataTools.scour(f)}
|
21
23
|
end
|
22
24
|
|
23
25
|
def parseline(line)
|
24
|
-
|
26
|
+
@linenumber += 1
|
27
|
+
# remove leading and trailing line endings (CR or LF)
|
28
|
+
# but NOT whitespace, because e.g. there could be leading or trailing blank fields delimited by tabs
|
29
|
+
split(line.gsub(/^[\n\r]*|[\n\r]*$/, ''))
|
25
30
|
end
|
26
31
|
|
27
32
|
def import_options
|
@@ -35,15 +40,20 @@ module DataTools::IO
|
|
35
40
|
import_options.merge!(options)
|
36
41
|
end
|
37
42
|
|
43
|
+
def line_to_record(line)
|
44
|
+
Hash[headers.zip(parseline(line)).select {|k,v| !v.nil?}]
|
45
|
+
end
|
46
|
+
|
38
47
|
def import(opts = {}) # expects a block
|
39
48
|
configure_import(opts)
|
40
|
-
|
41
|
-
|
49
|
+
@linenumber = 0
|
50
|
+
@headers = opts[:headers] || parseline(readline(opts[:rowsep] || $/))
|
42
51
|
Enumerator.new do |yielder|
|
43
|
-
self.each do |line|
|
44
|
-
rec =
|
52
|
+
self.each(opts[:rowsep] || $/) do |line|
|
53
|
+
rec = line_to_record(line)
|
54
|
+
next if rec.empty? # silently ignore blank records
|
45
55
|
rec.extend DataTools::Hash
|
46
|
-
yielder.yield rec.cleanse(import_options)
|
56
|
+
yielder.yield rec.cleanse(import_options.merge(:line => @linenumber))
|
47
57
|
end
|
48
58
|
# need to emit anything to trigger a file-completed action? (such as pushing a batch to storage)
|
49
59
|
end
|
@@ -53,3 +63,5 @@ end
|
|
53
63
|
class IO
|
54
64
|
include DataTools::IO
|
55
65
|
end
|
66
|
+
|
67
|
+
ARGF.extend DataTools::IO
|
data/lib/data_tools/version.rb
CHANGED
data/spec/import_spec.rb
CHANGED
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
3
3
|
require "json"
|
4
4
|
|
5
5
|
describe "File Import" do
|
6
|
-
it "imports" do
|
7
|
-
f = File.open(File.dirname(__FILE__) + "/../hrhead.csv")
|
6
|
+
it "imports CSV" do
|
7
|
+
f = File.open(File.dirname(__FILE__) + "/../tmp/hrhead.csv")
|
8
8
|
# sio = f.import.csvme(StringIO.new, ['Person Phone GUID', 'Person Address GUID'])
|
9
9
|
# puts sio.string
|
10
10
|
|
@@ -12,4 +12,11 @@ describe "File Import" do
|
|
12
12
|
puts slice.extend(DataTools::ArrayOfHashes).pluck('Person Phone GUID', 'Person Address GUID').to_json
|
13
13
|
end
|
14
14
|
end
|
15
|
+
|
16
|
+
it "import WSV" do
|
17
|
+
f = File.open(File.dirname(__FILE__) + "/../tmp/visits.txt")
|
18
|
+
recs = f.import(:format => :wsv, :datefields => {'admit_arrive_date' => '%Y-%m-%d'}).to_a
|
19
|
+
recs.count.should == 99
|
20
|
+
puts recs.sample
|
21
|
+
end
|
15
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.6.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: awesome_print
|
@@ -118,3 +118,4 @@ signing_key:
|
|
118
118
|
specification_version: 3
|
119
119
|
summary: Miscellaneous data-munging utilities.
|
120
120
|
test_files: []
|
121
|
+
has_rdoc:
|