masticate 0.2.3 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/masticate/base.rb +10 -2
- data/lib/masticate/cook.rb +0 -1
- data/lib/masticate/exclude.rb +13 -1
- data/lib/masticate/mender.rb +67 -3
- data/lib/masticate/myoptparse.rb +3 -1
- data/lib/masticate/version.rb +1 -1
- data/spec/data/cooking_mend_result.csv +4 -0
- data/spec/data/datify_input.csv +11 -0
- data/spec/data/datify_result.csv +11 -0
- data/spec/data/recipe_mend.txt +3 -0
- data/spec/lib/cook_spec.rb +11 -0
- data/spec/lib/datify_spec.rb +7 -8
- data/spec/lib/exclude_spec.rb +10 -0
- data/spec/lib/mender_spec.rb +1 -1
- metadata +16 -10
- data/spec/data/datify_input.txt +0 -20
data/lib/masticate/base.rb
CHANGED
@@ -60,7 +60,14 @@ class Masticate::Base
|
|
60
60
|
@output_count = 0
|
61
61
|
with_input do |input|
|
62
62
|
while line = get
|
63
|
-
row = CSV.parse_line(line, csv_options)
|
63
|
+
row = CSV.parse_line(line, csv_options) #.map {|s| s && s.strip}
|
64
|
+
if row
|
65
|
+
row = row.map {|s| s && s.strip}
|
66
|
+
end
|
67
|
+
# row2 = row.map {|s| s && s.strip}
|
68
|
+
# if row2.nil?
|
69
|
+
# puts "**** ROW IS [#{row.inspect}]"
|
70
|
+
# end
|
64
71
|
output = crunch(row)
|
65
72
|
emit(output) if output
|
66
73
|
end
|
@@ -70,7 +77,8 @@ class Masticate::Base
|
|
70
77
|
|
71
78
|
{
|
72
79
|
:input_count => input_count,
|
73
|
-
:output_count => @output_count
|
80
|
+
:output_count => @output_count,
|
81
|
+
:headers => @headers
|
74
82
|
}
|
75
83
|
end
|
76
84
|
end
|
data/lib/masticate/cook.rb
CHANGED
data/lib/masticate/exclude.rb
CHANGED
@@ -19,7 +19,19 @@ class Masticate::Exclude < Masticate::Base
|
|
19
19
|
def crunch(row)
|
20
20
|
if !@headers
|
21
21
|
@headers = row
|
22
|
-
|
22
|
+
f = @field
|
23
|
+
@index =
|
24
|
+
case f
|
25
|
+
when Fixnum, /^\d+$/
|
26
|
+
f = f.to_i
|
27
|
+
if f > row.count
|
28
|
+
raise "Cannot pluck column #{f}, there are only #{row.count} fields"
|
29
|
+
else
|
30
|
+
f-1
|
31
|
+
end
|
32
|
+
else
|
33
|
+
row.index(f) or raise "Unable to find column '#{f}' in headers"
|
34
|
+
end
|
23
35
|
row
|
24
36
|
elsif row
|
25
37
|
if row[@index] == @value
|
data/lib/masticate/mender.rb
CHANGED
@@ -4,13 +4,77 @@
|
|
4
4
|
# (due to a newline embedded in a field). Glue those two lines into a single line in the output.
|
5
5
|
|
6
6
|
class Masticate::Mender < Masticate::Base
|
7
|
-
|
7
|
+
def configure(opts)
|
8
|
+
standard_options(opts)
|
8
9
|
|
9
|
-
|
10
|
-
@
|
10
|
+
@inlined = opts[:inlined]
|
11
|
+
@snip = opts[:snip]
|
12
|
+
@dejunk = opts[:dejunk]
|
13
|
+
|
14
|
+
@expected_field_count = nil
|
15
|
+
@holding = []
|
11
16
|
end
|
12
17
|
|
18
|
+
# attr_reader :col_sep
|
19
|
+
|
20
|
+
# def initialize(filename)
|
21
|
+
# @filename = filename
|
22
|
+
# end
|
23
|
+
|
13
24
|
def mend(opts)
|
25
|
+
execute(opts)
|
26
|
+
end
|
27
|
+
|
28
|
+
def crunch(row)
|
29
|
+
if @inlined
|
30
|
+
if row
|
31
|
+
ncells = row.count/2-1
|
32
|
+
if !@headers
|
33
|
+
@headers = row[0..ncells]
|
34
|
+
@expected_field_count = @headers.count
|
35
|
+
emit(@headers)
|
36
|
+
else
|
37
|
+
if row[0..ncells] != @headers
|
38
|
+
raise "Header mismatch on line #{@input_count}\n Expected: #{@headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
row = row[ncells+1, @expected_field_count]
|
42
|
+
end
|
43
|
+
elsif !@headers
|
44
|
+
# trust the first row
|
45
|
+
@headers = row
|
46
|
+
case @snip
|
47
|
+
when Fixnum
|
48
|
+
@headers.shift(@snip)
|
49
|
+
when String
|
50
|
+
raise "TODO: snip named header. Multiple?"
|
51
|
+
when nil
|
52
|
+
# do nothing
|
53
|
+
else
|
54
|
+
raise "Do not understand snip instruction [#{@snip.inspect}]"
|
55
|
+
end
|
56
|
+
@expected_field_count = @headers.count
|
57
|
+
row = @headers
|
58
|
+
elsif row
|
59
|
+
@holding += row
|
60
|
+
if @holding.count < @expected_field_count
|
61
|
+
# incomplete row; do not emit anything
|
62
|
+
row = nil
|
63
|
+
else
|
64
|
+
row = @holding
|
65
|
+
@holding = []
|
66
|
+
end
|
67
|
+
|
68
|
+
if @dejunk && row && row.select {|s| s && !s.strip.empty?}.count <= 2
|
69
|
+
# junky row, suppress output
|
70
|
+
nil
|
71
|
+
else
|
72
|
+
row
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def old_mend(opts)
|
14
78
|
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
15
79
|
@col_sep = opts[:col_sep] || ','
|
16
80
|
@quote_char = opts[:quote_char] || "\0"
|
data/lib/masticate/myoptparse.rb
CHANGED
@@ -91,7 +91,8 @@ class Masticate::MyOptionParser
|
|
91
91
|
'maxrows' => Masticate::MaxRows,
|
92
92
|
'relabel' => Masticate::Relabel,
|
93
93
|
'pluck' => Masticate::Plucker,
|
94
|
-
'exclude' => Masticate::Exclude
|
94
|
+
'exclude' => Masticate::Exclude,
|
95
|
+
'mend' => Masticate::Mender
|
95
96
|
}
|
96
97
|
|
97
98
|
klass = klasses[command]
|
@@ -165,6 +166,7 @@ EOT
|
|
165
166
|
* masticate #{command} (#{options.keys.join(', ')})
|
166
167
|
Lines in input: #{results[:input_count]}
|
167
168
|
Lines in output: #{results[:output_count]}
|
169
|
+
Headers: #{results[:headers].inspect}
|
168
170
|
EOT
|
169
171
|
if results[:field_counts]
|
170
172
|
$stderr.puts " Field counts: #{results[:field_counts].inspect}"
|
data/lib/masticate/version.rb
CHANGED
data/spec/data/datify_input.csv
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,timestamp,Term Date,Status,R_NAME,SEX,BIRTHDATE
|
2
|
+
WASHINGTON,GEORGE,D,824,9556,09/10/2005 4:23:16PM,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
|
+
JEFFERSON,TOM,,621,8052,07/23/2001 7:23:11AM,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
|
+
ADAMS,JOHN,,655,8834,09/22/2003 01:23:45PM,,WA,6 East,F,08/07/1978
|
5
|
+
ADAMS,JOHN QUINCY,A,209,8637,02/24/2003 02:34:00AM,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
|
6
|
+
HAMILTON,ANDREW,,278,10065,01/09/2007 02:34:00AM,11/16/2007,TM,Information Technology,M,09/16/1968
|
7
|
+
MADISON,JAMES,F,672,10720,01/05/2009 02:34:00AM,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
|
8
|
+
FRANKLIN,BENJAMIN,R,674,8340,05/01/2002 02:34:00AM,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973
|
9
|
+
LINCOLN,ABRAHAM,M,634,11340,05/02/2011 02:34:00AM,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
|
10
|
+
MONROE,JAMES,L,614,10757,02/16/2009 02:34:00AM,,RF,Labor & Delivery,F,11/06/1983
|
11
|
+
REVERE,PAUL,B,424,8568,11/18/2002 02:34:00AM,06/27/2006,TM,Laundry & Linen,M,12/31/1976
|
data/spec/data/datify_result.csv
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,timestamp,Term Date,Status,R_NAME,SEX,BIRTHDATE
|
2
|
+
WASHINGTON,GEORGE,D,824,9556,1126369396,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
|
+
JEFFERSON,TOM,,621,8052,995872991,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
|
+
ADAMS,JOHN,,655,8834,1064237025,,WA,6 East,F,08/07/1978
|
5
|
+
ADAMS,JOHN QUINCY,A,209,8637,1046054040,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
|
6
|
+
HAMILTON,ANDREW,,278,10065,1168310040,11/16/2007,TM,Information Technology,M,09/16/1968
|
7
|
+
MADISON,JAMES,F,672,10720,1231122840,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
|
8
|
+
FRANKLIN,BENJAMIN,R,674,8340,1020220440,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973
|
9
|
+
LINCOLN,ABRAHAM,M,634,11340,1304303640,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
|
10
|
+
MONROE,JAMES,L,614,10757,1234751640,,RF,Labor & Delivery,F,11/06/1983
|
11
|
+
REVERE,PAUL,B,424,8568,1037586840,06/27/2006,TM,Laundry & Linen,M,12/31/1976
|
data/spec/lib/cook_spec.rb
CHANGED
@@ -13,4 +13,15 @@ describe "cooking up a recipe" do
|
|
13
13
|
|
14
14
|
output.should == correct_output
|
15
15
|
end
|
16
|
+
|
17
|
+
it "should allow mend in recipe" do
|
18
|
+
input = File.dirname(__FILE__) + "/../data/broken_psv.txt"
|
19
|
+
recipe = File.dirname(__FILE__) + "/../data/recipe_mend.txt"
|
20
|
+
tmp = Tempfile.new('cooked')
|
21
|
+
results = Masticate.cook(input, :col_sep => '|', :output => tmp, :recipe => recipe)
|
22
|
+
output = File.read(tmp)
|
23
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/cooking_mend_result.csv")
|
24
|
+
|
25
|
+
output.should == correct_output
|
26
|
+
end
|
16
27
|
end
|
data/spec/lib/datify_spec.rb
CHANGED
@@ -2,15 +2,14 @@
|
|
2
2
|
|
3
3
|
require "spec_helper"
|
4
4
|
|
5
|
-
describe "
|
6
|
-
it "should
|
7
|
-
filename = File.dirname(__FILE__) + "/../data/
|
8
|
-
tmp = Tempfile.new('
|
9
|
-
results = Masticate.
|
5
|
+
describe "datification" do
|
6
|
+
it "should transform dates" do
|
7
|
+
filename = File.dirname(__FILE__) + "/../data/datify_input.csv"
|
8
|
+
tmp = Tempfile.new('datify')
|
9
|
+
results = Masticate.datify(filename, :output => tmp, :field => 'timestamp', :format => '%m/%d/%Y %H:%M:%S%p')
|
10
10
|
output = File.read(tmp)
|
11
11
|
tmp.unlink
|
12
|
-
|
13
|
-
|
14
|
-
results[:output_count].should == 5
|
12
|
+
|
13
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/datify_result.csv")
|
15
14
|
end
|
16
15
|
end
|
data/spec/lib/exclude_spec.rb
CHANGED
@@ -12,4 +12,14 @@ describe "exclude" do
|
|
12
12
|
|
13
13
|
output.should == correct_output
|
14
14
|
end
|
15
|
+
|
16
|
+
it "should be able to exclude rows by number as well as name" do
|
17
|
+
filename = File.dirname(__FILE__) + "/../data/exclude_input.csv"
|
18
|
+
tmp = Tempfile.new('exclude')
|
19
|
+
results = Masticate.exclude(filename, :output => tmp, :field => 1, :value => '')
|
20
|
+
output = File.read(tmp)
|
21
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/exclude_results.csv")
|
22
|
+
|
23
|
+
output.should == correct_output
|
24
|
+
end
|
15
25
|
end
|
data/spec/lib/mender_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: '0.3'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-05-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152079220 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152079220
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152076120 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152076120
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152074480 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152074480
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|
@@ -79,8 +79,10 @@ files:
|
|
79
79
|
- spec/data/badnums_fixed.csv
|
80
80
|
- spec/data/broken_psv.txt
|
81
81
|
- spec/data/concat_result.txt
|
82
|
+
- spec/data/cooking_mend_result.csv
|
82
83
|
- spec/data/cooking_result.csv
|
83
|
-
- spec/data/datify_input.txt
|
84
|
+
- spec/data/datify_input.csv
|
85
|
+
- spec/data/datify_result.csv
|
84
86
|
- spec/data/events.csv
|
85
87
|
- spec/data/events_reduced.csv
|
86
88
|
- spec/data/exclude_input.csv
|
@@ -94,6 +96,7 @@ files:
|
|
94
96
|
- spec/data/pipe_data.txt
|
95
97
|
- spec/data/quoted_csv_data.txt
|
96
98
|
- spec/data/recipe.txt
|
99
|
+
- spec/data/recipe_mend.txt
|
97
100
|
- spec/data/relabel_result.csv
|
98
101
|
- spec/data/tabbed_data.txt
|
99
102
|
- spec/data/tilde_data.txt
|
@@ -138,8 +141,10 @@ test_files:
|
|
138
141
|
- spec/data/badnums_fixed.csv
|
139
142
|
- spec/data/broken_psv.txt
|
140
143
|
- spec/data/concat_result.txt
|
144
|
+
- spec/data/cooking_mend_result.csv
|
141
145
|
- spec/data/cooking_result.csv
|
142
|
-
- spec/data/datify_input.txt
|
146
|
+
- spec/data/datify_input.csv
|
147
|
+
- spec/data/datify_result.csv
|
143
148
|
- spec/data/events.csv
|
144
149
|
- spec/data/events_reduced.csv
|
145
150
|
- spec/data/exclude_input.csv
|
@@ -153,6 +158,7 @@ test_files:
|
|
153
158
|
- spec/data/pipe_data.txt
|
154
159
|
- spec/data/quoted_csv_data.txt
|
155
160
|
- spec/data/recipe.txt
|
161
|
+
- spec/data/recipe_mend.txt
|
156
162
|
- spec/data/relabel_result.csv
|
157
163
|
- spec/data/tabbed_data.txt
|
158
164
|
- spec/data/tilde_data.txt
|
data/spec/data/datify_input.txt
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
ChangeDTTM|AuditEventID|AuditEventCode|AuditEventTypeID|AuditEventName|AuditEventDesc|AuditByID
|
2
|
-
12/31/2011 3:41:54PM|57,314,552.00|LOGGEDOUT|30|Logged out|Logged out|4,426.00
|
3
|
-
1/8/2012 9:21:23AM|57,486,988.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
4
|
-
1/8/2012 9:21:24AM|57,486,989.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
5
|
-
2/19/2012 6:08:59PM|58,597,521.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
6
|
-
2/19/2012 7:02:50PM|58,597,816.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
7
|
-
2/19/2012 7:02:50PM|58,597,817.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
8
|
-
2/4/2012 9:29:20AM|58,201,259.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
9
|
-
2/4/2012 12:19:30AM|58,196,973.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
10
|
-
2/4/2012 12:20:53AM|58,196,975.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
11
|
-
2/3/2012 10:44:25PM|58,196,507.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
12
|
-
1/20/2012 8:01:07PM|57,833,496.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
13
|
-
1/22/2012 4:20:48AM|57,845,095.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
14
|
-
1/22/2012 4:20:49AM|57,845,096.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
15
|
-
1/14/2012 7:50:40AM|57,661,613.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
16
|
-
2/26/2012 7:48:07AM|58,776,243.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
17
|
-
2/23/2012 7:47:39AM|58,699,535.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
18
|
-
2/23/2012 7:39:56AM|58,699,387.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
19
|
-
2/23/2012 7:39:56AM|58,699,388.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
20
|
-
2/15/2012 3:15:06PM|58,502,037.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|