masticate 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/masticate +12 -1
- data/lib/masticate/mender.rb +8 -2
- data/lib/masticate/version.rb +1 -1
- data/spec/data/inlined_headers.csv +1 -1
- data/spec/data/inlined_headers.csv.output +10 -10
- data/spec/data/junk_header.csv +1 -1
- data/spec/lib/mender_spec.rb +2 -1
- metadata +7 -7
data/bin/masticate
CHANGED
@@ -13,11 +13,15 @@ OptionParser.new do |opts|
|
|
13
13
|
options[:format] = v
|
14
14
|
end
|
15
15
|
|
16
|
-
opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB)") do |v|
|
16
|
+
opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB; default is ',')") do |v|
|
17
17
|
options[:col_sep] = v
|
18
18
|
options[:col_sep] = "\t" if options[:col_sep] == "TAB"
|
19
19
|
end
|
20
20
|
|
21
|
+
opts.on("--quote QUOTE-CHAR", "Specify character used for quoting fields (optional; default is no quoting)") do |char|
|
22
|
+
options[:quote_char] = char
|
23
|
+
end
|
24
|
+
|
21
25
|
opts.on("--fields LIST", Array, "Specify fields to select") do |list|
|
22
26
|
options[:fields] = list
|
23
27
|
end
|
@@ -41,6 +45,10 @@ OptionParser.new do |opts|
|
|
41
45
|
opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |v|
|
42
46
|
options[:inlined] = v
|
43
47
|
end
|
48
|
+
|
49
|
+
opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
|
50
|
+
options[:dejunk] = v
|
51
|
+
end
|
44
52
|
end.parse!
|
45
53
|
|
46
54
|
filename = ARGV.shift # use stdin if no filename provided
|
@@ -51,6 +59,9 @@ def logmessage(command, options, results)
|
|
51
59
|
Lines in input: #{results[:input_count]}
|
52
60
|
Lines in output: #{results[:output_count]}
|
53
61
|
EOT
|
62
|
+
if results[:field_counts]
|
63
|
+
$stderr.puts " Field counts: #{results[:field_counts].inspect}"
|
64
|
+
end
|
54
65
|
end
|
55
66
|
|
56
67
|
case command
|
data/lib/masticate/mender.rb
CHANGED
@@ -18,6 +18,7 @@ class Masticate::Mender < Masticate::Base
|
|
18
18
|
expected_field_count = nil
|
19
19
|
headers = nil
|
20
20
|
@output_count = 0
|
21
|
+
fieldcounts = Hash.new(0)
|
21
22
|
with_input do |input|
|
22
23
|
while (line = get) do
|
23
24
|
unless line =~ /^\s*$/
|
@@ -27,13 +28,15 @@ class Masticate::Mender < Masticate::Base
|
|
27
28
|
if !expected_field_count
|
28
29
|
headers = row[0..ncells]
|
29
30
|
expected_field_count = headers.count
|
31
|
+
fieldcounts[headers.count] += 1
|
30
32
|
emit(headers.to_csv(:col_sep => @col_sep))
|
31
33
|
else
|
32
34
|
if row[0..ncells] != headers
|
33
35
|
raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
|
34
36
|
end
|
35
37
|
end
|
36
|
-
row = row[ncells+1
|
38
|
+
row = row[ncells+1, expected_field_count]
|
39
|
+
fieldcounts[row.count] += 1
|
37
40
|
emit(row.to_csv(:col_sep => @col_sep))
|
38
41
|
elsif !expected_field_count
|
39
42
|
# trust the first row
|
@@ -49,6 +52,7 @@ class Masticate::Mender < Masticate::Base
|
|
49
52
|
raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
|
50
53
|
end
|
51
54
|
expected_field_count = headers.count
|
55
|
+
fieldcounts[headers.count] += 1
|
52
56
|
emit(headers.to_csv(:col_sep => @col_sep))
|
53
57
|
else
|
54
58
|
running_count = fieldcount(line)
|
@@ -61,6 +65,7 @@ class Masticate::Mender < Masticate::Base
|
|
61
65
|
end
|
62
66
|
|
63
67
|
unless opts[:dejunk] && junky?(line)
|
68
|
+
fieldcounts[fieldcount(line)] += 1
|
64
69
|
emit(line)
|
65
70
|
end
|
66
71
|
end
|
@@ -72,6 +77,7 @@ class Masticate::Mender < Masticate::Base
|
|
72
77
|
{
|
73
78
|
:input_count => @input_count,
|
74
79
|
:output_count => @output_count,
|
80
|
+
:field_counts => fieldcounts,
|
75
81
|
:headers => headers
|
76
82
|
}
|
77
83
|
end
|
@@ -81,7 +87,7 @@ class Masticate::Mender < Masticate::Base
|
|
81
87
|
end
|
82
88
|
|
83
89
|
def explode(line)
|
84
|
-
CSV.parse_line(line, :col_sep => col_sep, :quote_char => @quote_char)
|
90
|
+
CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
|
85
91
|
end
|
86
92
|
|
87
93
|
# a line is "junky" if it has 2 or fewer fields with any content
|
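data/lib/masticate/version.rb
CHANGED
[hunk for version.rb (+1 -1) was not captured in this rendering]
data/spec/data/inlined_headers.csv
CHANGED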
@@ -4,7 +4,7 @@ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NA
|
|
4
4
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
5
5
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
6
6
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
7
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs
|
7
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
|
8
8
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
9
9
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
10
10
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
data/spec/data/inlined_headers.csv.output
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
|
2
|
-
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
|
-
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
|
-
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
|
5
|
-
ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
|
6
|
-
HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968
|
7
|
-
MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
|
8
|
-
FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs
|
9
|
-
LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
|
10
|
-
MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983
|
11
|
-
REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976
|
2
|
+
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
|
+
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
|
+
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
|
5
|
+
ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
|
6
|
+
HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968
|
7
|
+
MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
|
8
|
+
FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973
|
9
|
+
LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
|
10
|
+
MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983
|
11
|
+
REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976
|
data/spec/data/junk_header.csv
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
|
2
2
|
15267,407,201201060140,407,201201060140,0,201201060309,L
|
3
|
-
15267,381,201201060222,381,201201060222,
|
3
|
+
15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
|
4
4
|
15267,407,201201060311,407,201201060311,0,201201060339,L
|
5
5
|
15267,407,201201060514,108,201201060515,108,201201060515,SEC
|
data/spec/lib/mender_spec.rb
CHANGED
@@ -30,12 +30,13 @@ describe "mending" do
|
|
30
30
|
it "should unfold inlined headers" do
|
31
31
|
filename = File.dirname(__FILE__) + "/../data/inlined_headers.csv"
|
32
32
|
tmp = Tempfile.new('mending')
|
33
|
-
results = Masticate.mend(filename, :inlined => true, :output => tmp)
|
33
|
+
results = Masticate.mend(filename, :col_sep => ',', :quote_char => '"', :inlined => true, :output => tmp)
|
34
34
|
output = File.read(tmp)
|
35
35
|
correct_output = File.read(File.dirname(__FILE__) + "/../data/inlined_headers.csv.output")
|
36
36
|
|
37
37
|
results[:input_count].should == 11
|
38
38
|
results[:output_count].should == 11
|
39
|
+
results[:field_counts].should == {11 => 11}
|
39
40
|
output.should == correct_output
|
40
41
|
end
|
41
42
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-04-06 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152447240 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152447240
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152446740 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152446740
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152446280 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152446280
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|