masticate 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/masticate +12 -1
- data/lib/masticate/mender.rb +8 -2
- data/lib/masticate/version.rb +1 -1
- data/spec/data/inlined_headers.csv +1 -1
- data/spec/data/inlined_headers.csv.output +10 -10
- data/spec/data/junk_header.csv +1 -1
- data/spec/lib/mender_spec.rb +2 -1
- metadata +7 -7
data/bin/masticate
CHANGED
@@ -13,11 +13,15 @@ OptionParser.new do |opts|
|
|
13
13
|
options[:format] = v
|
14
14
|
end
|
15
15
|
|
16
|
-
opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB)") do |v|
|
16
|
+
opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB; default is ',')") do |v|
|
17
17
|
options[:col_sep] = v
|
18
18
|
options[:col_sep] = "\t" if options[:col_sep] == "TAB"
|
19
19
|
end
|
20
20
|
|
21
|
+
opts.on("--quote QUOTE-CHAR", "Specify character used for quoting fields (optional; default is no quoting)") do |char|
|
22
|
+
options[:quote_char] = char
|
23
|
+
end
|
24
|
+
|
21
25
|
opts.on("--fields LIST", Array, "Specify fields to select") do |list|
|
22
26
|
options[:fields] = list
|
23
27
|
end
|
@@ -41,6 +45,10 @@ OptionParser.new do |opts|
|
|
41
45
|
opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |v|
|
42
46
|
options[:inlined] = v
|
43
47
|
end
|
48
|
+
|
49
|
+
opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
|
50
|
+
options[:dejunk] = v
|
51
|
+
end
|
44
52
|
end.parse!
|
45
53
|
|
46
54
|
filename = ARGV.shift # use stdin if no filename provided
|
@@ -51,6 +59,9 @@ def logmessage(command, options, results)
|
|
51
59
|
Lines in input: #{results[:input_count]}
|
52
60
|
Lines in output: #{results[:output_count]}
|
53
61
|
EOT
|
62
|
+
if results[:field_counts]
|
63
|
+
$stderr.puts " Field counts: #{results[:field_counts].inspect}"
|
64
|
+
end
|
54
65
|
end
|
55
66
|
|
56
67
|
case command
|
data/lib/masticate/mender.rb
CHANGED
@@ -18,6 +18,7 @@ class Masticate::Mender < Masticate::Base
|
|
18
18
|
expected_field_count = nil
|
19
19
|
headers = nil
|
20
20
|
@output_count = 0
|
21
|
+
fieldcounts = Hash.new(0)
|
21
22
|
with_input do |input|
|
22
23
|
while (line = get) do
|
23
24
|
unless line =~ /^\s*$/
|
@@ -27,13 +28,15 @@ class Masticate::Mender < Masticate::Base
|
|
27
28
|
if !expected_field_count
|
28
29
|
headers = row[0..ncells]
|
29
30
|
expected_field_count = headers.count
|
31
|
+
fieldcounts[headers.count] += 1
|
30
32
|
emit(headers.to_csv(:col_sep => @col_sep))
|
31
33
|
else
|
32
34
|
if row[0..ncells] != headers
|
33
35
|
raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
|
34
36
|
end
|
35
37
|
end
|
36
|
-
row = row[ncells+1..-1]
|
38
|
+
row = row[ncells+1, expected_field_count]
|
39
|
+
fieldcounts[row.count] += 1
|
37
40
|
emit(row.to_csv(:col_sep => @col_sep))
|
38
41
|
elsif !expected_field_count
|
39
42
|
# trust the first row
|
@@ -49,6 +52,7 @@ class Masticate::Mender < Masticate::Base
|
|
49
52
|
raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
|
50
53
|
end
|
51
54
|
expected_field_count = headers.count
|
55
|
+
fieldcounts[headers.count] += 1
|
52
56
|
emit(headers.to_csv(:col_sep => @col_sep))
|
53
57
|
else
|
54
58
|
running_count = fieldcount(line)
|
@@ -61,6 +65,7 @@ class Masticate::Mender < Masticate::Base
|
|
61
65
|
end
|
62
66
|
|
63
67
|
unless opts[:dejunk] && junky?(line)
|
68
|
+
fieldcounts[fieldcount(line)] += 1
|
64
69
|
emit(line)
|
65
70
|
end
|
66
71
|
end
|
@@ -72,6 +77,7 @@ class Masticate::Mender < Masticate::Base
|
|
72
77
|
{
|
73
78
|
:input_count => @input_count,
|
74
79
|
:output_count => @output_count,
|
80
|
+
:field_counts => fieldcounts,
|
75
81
|
:headers => headers
|
76
82
|
}
|
77
83
|
end
|
@@ -81,7 +87,7 @@ class Masticate::Mender < Masticate::Base
|
|
81
87
|
end
|
82
88
|
|
83
89
|
def explode(line)
|
84
|
-
CSV.parse_line(line, :col_sep => col_sep, :quote_char => @quote_char)
|
90
|
+
CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
|
85
91
|
end
|
86
92
|
|
87
93
|
# a line is "junky" if it has 2 or fewer fields with any content
|
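data/lib/masticate/version.rb
CHANGED
[The +1 -1 hunk for this file (version bump 0.1.1 → 0.1.3) is missing from this capture.]
data/spec/data/inlined_headers.csv
CHANGED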
@@ -4,7 +4,7 @@ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NA
|
|
4
4
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
5
5
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
6
6
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
7
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs
|
7
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
|
8
8
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
9
9
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
10
10
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
data/spec/data/inlined_headers.csv.output
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
|
2
|
-
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
|
-
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
|
-
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
|
5
|
-
ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
|
6
|
-
HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968
|
7
|
-
MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
|
8
|
-
FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs
|
9
|
-
LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
|
10
|
-
MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983
|
11
|
-
REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976
|
2
|
+
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
|
+
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
|
+
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
|
5
|
+
ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
|
6
|
+
HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968
|
7
|
+
MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
|
8
|
+
FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973
|
9
|
+
LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
|
10
|
+
MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983
|
11
|
+
REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976
|
data/spec/data/junk_header.csv
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
|
2
2
|
15267,407,201201060140,407,201201060140,0,201201060309,L
|
3
|
-
15267,381,201201060222,381,201201060222,
|
3
|
+
15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
|
4
4
|
15267,407,201201060311,407,201201060311,0,201201060339,L
|
5
5
|
15267,407,201201060514,108,201201060515,108,201201060515,SEC
|
data/spec/lib/mender_spec.rb
CHANGED
@@ -30,12 +30,13 @@ describe "mending" do
|
|
30
30
|
it "should unfold inlined headers" do
|
31
31
|
filename = File.dirname(__FILE__) + "/../data/inlined_headers.csv"
|
32
32
|
tmp = Tempfile.new('mending')
|
33
|
-
results = Masticate.mend(filename, :inlined => true, :output => tmp)
|
33
|
+
results = Masticate.mend(filename, :col_sep => ',', :quote_char => '"', :inlined => true, :output => tmp)
|
34
34
|
output = File.read(tmp)
|
35
35
|
correct_output = File.read(File.dirname(__FILE__) + "/../data/inlined_headers.csv.output")
|
36
36
|
|
37
37
|
results[:input_count].should == 11
|
38
38
|
results[:output_count].should == 11
|
39
|
+
results[:field_counts].should == {11 => 11}
|
39
40
|
output.should == correct_output
|
40
41
|
end
|
41
42
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-04-06 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152447240 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152447240
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152446740 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152446740
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152446280 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152446280
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|