masticate 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/masticate +12 -0
- data/lib/masticate/mender.rb +1 -1
- data/lib/masticate/plucker.rb +15 -2
- data/lib/masticate/sniffer.rb +2 -2
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +2 -2
- data/spec/data/inlined_headers.csv +5 -5
- data/spec/data/inlined_headers.csv.output +1 -1
- data/spec/data/junk_header.csv +1 -1
- data/spec/lib/plucker_spec.rb +12 -0
- data/spec/lib/sniffer_spec.rb +10 -3
- metadata +8 -8
data/bin/masticate
CHANGED
@@ -49,6 +49,14 @@ OptionParser.new do |opts|
|
|
49
49
|
opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
|
50
50
|
options[:dejunk] = v
|
51
51
|
end
|
52
|
+
|
53
|
+
opts.on("--by FIELD", "(for *maxrows* only) Field to group by") do |f|
|
54
|
+
options[:by] = f
|
55
|
+
end
|
56
|
+
|
57
|
+
opts.on("--max FIELD", "(for *maxrows* only) Field to find max value for") do |f|
|
58
|
+
options[:max] = f
|
59
|
+
end
|
52
60
|
end.parse!
|
53
61
|
|
54
62
|
filename = ARGV.shift # use stdin if no filename provided
|
@@ -98,6 +106,10 @@ when 'gsub'
|
|
98
106
|
results = Masticate.gsub(filename, options)
|
99
107
|
logmessage(command, options, results)
|
100
108
|
|
109
|
+
when 'maxrows'
|
110
|
+
results = Masticate.maxrows(filename, options)
|
111
|
+
logmessage(command, options, results)
|
112
|
+
|
101
113
|
else
|
102
114
|
raise "unknown command #{command}"
|
103
115
|
end
|
data/lib/masticate/mender.rb
CHANGED
@@ -87,7 +87,7 @@ class Masticate::Mender < Masticate::Base
|
|
87
87
|
end
|
88
88
|
|
89
89
|
def explode(line)
|
90
|
-
CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
|
90
|
+
CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char).map {|s| s && s.strip}
|
91
91
|
end
|
92
92
|
|
93
93
|
# a line is "junky" if it has 2 or fewer fields with any content
|
data/lib/masticate/plucker.rb
CHANGED
@@ -14,8 +14,21 @@ class Masticate::Plucker < Masticate::Base
|
|
14
14
|
row = CSV.parse_line(line, csv_options)
|
15
15
|
if !headers
|
16
16
|
headers = row
|
17
|
-
indexes = fields.map
|
18
|
-
|
17
|
+
indexes = fields.map do |f|
|
18
|
+
case f
|
19
|
+
when String
|
20
|
+
headers.index(f) or raise "Unable to find column '#{f}'"
|
21
|
+
when Fixnum
|
22
|
+
if f > headers.count
|
23
|
+
raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
|
24
|
+
else
|
25
|
+
f-1
|
26
|
+
end
|
27
|
+
else
|
28
|
+
raise "Invalid field descriptor '#{f}'"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
emit(indexes.map {|i| headers[i]}.to_csv)
|
19
32
|
else
|
20
33
|
emit(indexes.map {|i| row[i]}.to_csv) if row
|
21
34
|
end
|
data/lib/masticate/sniffer.rb
CHANGED
@@ -10,10 +10,10 @@ class Masticate::Sniffer < Masticate::Base
|
|
10
10
|
@filename = filename
|
11
11
|
end
|
12
12
|
|
13
|
-
def sniff
|
13
|
+
def sniff(opts)
|
14
14
|
@col_sep = find_col_sep
|
15
15
|
@quote_char = delimstats[@col_sep][:quote_char]
|
16
|
-
@stats = stats
|
16
|
+
@stats = stats if opts[:stats]
|
17
17
|
{
|
18
18
|
:col_sep => @col_sep,
|
19
19
|
:quote_char => @quote_char,
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -11,8 +11,8 @@ require_relative "masticate/gsubber"
|
|
11
11
|
require_relative "masticate/max_rows"
|
12
12
|
|
13
13
|
module Masticate
|
14
|
-
def self.sniff(filename)
|
15
|
-
Sniffer.new(filename).sniff
|
14
|
+
def self.sniff(filename, opts = {})
|
15
|
+
Sniffer.new(filename).sniff(opts)
|
16
16
|
end
|
17
17
|
|
18
18
|
def self.mend(filename, opts)
|
data/spec/data/inlined_headers.csv
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT
|
2
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT # ,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
|
2
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM ,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
3
3
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
|
4
4
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
5
5
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
6
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
6
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON ,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
7
7
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
|
8
8
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
9
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl
|
10
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
9
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
10
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen ,M,12/31/1976,3/10/2012
|
data/spec/data/inlined_headers.csv.output
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date,Status,R_NAME,SEX,BIRTHDATE
|
2
2
|
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
3
|
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
4
|
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
|
data/spec/data/junk_header.csv
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
|
1
|
+
3/7/2012,hospid,usrorder ,dteorder,usrsend,dtesend,usrdone,dtedone,department
|
2
2
|
15267,407,201201060140,407,201201060140,0,201201060309,L
|
3
3
|
15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
|
4
4
|
15267,407,201201060311,407,201201060311,0,201201060339,L
|
data/spec/lib/plucker_spec.rb
CHANGED
@@ -15,4 +15,16 @@ describe "plucker" do
|
|
15
15
|
results[:input_count].should == 5
|
16
16
|
output.should == correct_output
|
17
17
|
end
|
18
|
+
|
19
|
+
it "should pull numbered columns starting at 1" do
|
20
|
+
filename = File.dirname(__FILE__) + "/../data/namedcols.csv"
|
21
|
+
tmp = Tempfile.new('plucker')
|
22
|
+
results = Masticate.pluck(filename, :output => tmp, :fields => [3,5])
|
23
|
+
output = File.read(tmp)
|
24
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/namedcols.csv.output")
|
25
|
+
tmp.unlink
|
26
|
+
|
27
|
+
results[:input_count].should == 5
|
28
|
+
output.should == correct_output
|
29
|
+
end
|
18
30
|
end
|
data/spec/lib/sniffer_spec.rb
CHANGED
@@ -3,23 +3,30 @@
|
|
3
3
|
require "spec_helper"
|
4
4
|
|
5
5
|
describe "delimiter sniffing" do
|
6
|
-
it "should find tab delimiter" do
|
6
|
+
it "stats collection should default off" do
|
7
7
|
filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
|
8
8
|
results = Masticate.sniff(filename)
|
9
9
|
results[:col_sep].should == "\t"
|
10
|
+
results[:field_counts].should be_nil
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should find tab delimiter" do
|
14
|
+
filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
|
15
|
+
results = Masticate.sniff(filename, :stats => true)
|
16
|
+
results[:col_sep].should == "\t"
|
10
17
|
results[:field_counts].should == {6 => 5}
|
11
18
|
end
|
12
19
|
|
13
20
|
it "should find pipe delimiter" do
|
14
21
|
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
15
|
-
results = Masticate.sniff(filename)
|
22
|
+
results = Masticate.sniff(filename, :stats => true)
|
16
23
|
results[:col_sep].should == '|'
|
17
24
|
results[:field_counts].should == {6 => 5}
|
18
25
|
end
|
19
26
|
|
20
27
|
it "should recognize quotes in CSV sources" do
|
21
28
|
filename = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
|
22
|
-
results = Masticate.sniff(filename)
|
29
|
+
results = Masticate.sniff(filename, :stats => true)
|
23
30
|
results[:col_sep].should == ','
|
24
31
|
results[:quote_char].should == '"'
|
25
32
|
results[:field_counts].should == {14 => 100}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152726520 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152726520
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152749240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152749240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152748720 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152748720
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|