masticate 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/masticate +12 -0
- data/lib/masticate/mender.rb +1 -1
- data/lib/masticate/plucker.rb +15 -2
- data/lib/masticate/sniffer.rb +2 -2
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +2 -2
- data/spec/data/inlined_headers.csv +5 -5
- data/spec/data/inlined_headers.csv.output +1 -1
- data/spec/data/junk_header.csv +1 -1
- data/spec/lib/plucker_spec.rb +12 -0
- data/spec/lib/sniffer_spec.rb +10 -3
- metadata +8 -8
data/bin/masticate
CHANGED
@@ -49,6 +49,14 @@ OptionParser.new do |opts|
|
|
49
49
|
opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
|
50
50
|
options[:dejunk] = v
|
51
51
|
end
|
52
|
+
|
53
|
+
opts.on("--by FIELD", "(for *maxrows* only) Field to group by") do |f|
|
54
|
+
options[:by] = f
|
55
|
+
end
|
56
|
+
|
57
|
+
opts.on("--max FIELD", "(for *maxrows* only) Field to find max value for") do |f|
|
58
|
+
options[:max] = f
|
59
|
+
end
|
52
60
|
end.parse!
|
53
61
|
|
54
62
|
filename = ARGV.shift # use stdin if no filename provided
|
@@ -98,6 +106,10 @@ when 'gsub'
|
|
98
106
|
results = Masticate.gsub(filename, options)
|
99
107
|
logmessage(command, options, results)
|
100
108
|
|
109
|
+
when 'maxrows'
|
110
|
+
results = Masticate.maxrows(filename, options)
|
111
|
+
logmessage(command, options, results)
|
112
|
+
|
101
113
|
else
|
102
114
|
raise "unknown command #{command}"
|
103
115
|
end
|
data/lib/masticate/mender.rb
CHANGED
@@ -87,7 +87,7 @@ class Masticate::Mender < Masticate::Base
|
|
87
87
|
end
|
88
88
|
|
89
89
|
def explode(line)
|
90
|
-
CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
|
90
|
+
CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char).map {|s| s && s.strip}
|
91
91
|
end
|
92
92
|
|
93
93
|
# a line is "junky" if it has 2 or fewer fields with any content
|
data/lib/masticate/plucker.rb
CHANGED
@@ -14,8 +14,21 @@ class Masticate::Plucker < Masticate::Base
|
|
14
14
|
row = CSV.parse_line(line, csv_options)
|
15
15
|
if !headers
|
16
16
|
headers = row
|
17
|
-
indexes = fields.map
|
18
|
-
|
17
|
+
indexes = fields.map do |f|
|
18
|
+
case f
|
19
|
+
when String
|
20
|
+
headers.index(f) or raise "Unable to find column '#{f}'"
|
21
|
+
when Fixnum
|
22
|
+
if f > headers.count
|
23
|
+
raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
|
24
|
+
else
|
25
|
+
f-1
|
26
|
+
end
|
27
|
+
else
|
28
|
+
raise "Invalid field descriptor '#{f}'"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
emit(indexes.map {|i| headers[i]}.to_csv)
|
19
32
|
else
|
20
33
|
emit(indexes.map {|i| row[i]}.to_csv) if row
|
21
34
|
end
|
data/lib/masticate/sniffer.rb
CHANGED
@@ -10,10 +10,10 @@ class Masticate::Sniffer < Masticate::Base
|
|
10
10
|
@filename = filename
|
11
11
|
end
|
12
12
|
|
13
|
-
def sniff
|
13
|
+
def sniff(opts)
|
14
14
|
@col_sep = find_col_sep
|
15
15
|
@quote_char = delimstats[@col_sep][:quote_char]
|
16
|
-
@stats = stats
|
16
|
+
@stats = stats if opts[:stats]
|
17
17
|
{
|
18
18
|
:col_sep => @col_sep,
|
19
19
|
:quote_char => @quote_char,
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -11,8 +11,8 @@ require_relative "masticate/gsubber"
|
|
11
11
|
require_relative "masticate/max_rows"
|
12
12
|
|
13
13
|
module Masticate
|
14
|
-
def self.sniff(filename)
|
15
|
-
Sniffer.new(filename).sniff
|
14
|
+
def self.sniff(filename, opts = {})
|
15
|
+
Sniffer.new(filename).sniff(opts)
|
16
16
|
end
|
17
17
|
|
18
18
|
def self.mend(filename, opts)
|
data/spec/data/inlined_headers.csv
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT
|
2
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT # ,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
|
2
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM ,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
3
3
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
|
4
4
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
5
5
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
6
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
6
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON ,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
7
7
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
|
8
8
|
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
9
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl
|
10
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
9
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
10
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen ,M,12/31/1976,3/10/2012
|
data/spec/data/inlined_headers.csv.output
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date,Status,R_NAME,SEX,BIRTHDATE
|
2
2
|
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
|
3
3
|
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
|
4
4
|
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
|
data/spec/data/junk_header.csv
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
|
1
|
+
3/7/2012,hospid,usrorder ,dteorder,usrsend,dtesend,usrdone,dtedone,department
|
2
2
|
15267,407,201201060140,407,201201060140,0,201201060309,L
|
3
3
|
15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
|
4
4
|
15267,407,201201060311,407,201201060311,0,201201060339,L
|
data/spec/lib/plucker_spec.rb
CHANGED
@@ -15,4 +15,16 @@ describe "plucker" do
|
|
15
15
|
results[:input_count].should == 5
|
16
16
|
output.should == correct_output
|
17
17
|
end
|
18
|
+
|
19
|
+
it "should pull numbered columns starting at 1" do
|
20
|
+
filename = File.dirname(__FILE__) + "/../data/namedcols.csv"
|
21
|
+
tmp = Tempfile.new('plucker')
|
22
|
+
results = Masticate.pluck(filename, :output => tmp, :fields => [3,5])
|
23
|
+
output = File.read(tmp)
|
24
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/namedcols.csv.output")
|
25
|
+
tmp.unlink
|
26
|
+
|
27
|
+
results[:input_count].should == 5
|
28
|
+
output.should == correct_output
|
29
|
+
end
|
18
30
|
end
|
data/spec/lib/sniffer_spec.rb
CHANGED
@@ -3,23 +3,30 @@
|
|
3
3
|
require "spec_helper"
|
4
4
|
|
5
5
|
describe "delimiter sniffing" do
|
6
|
-
it "should find tab delimiter" do
|
6
|
+
it "stats collection should default off" do
|
7
7
|
filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
|
8
8
|
results = Masticate.sniff(filename)
|
9
9
|
results[:col_sep].should == "\t"
|
10
|
+
results[:field_counts].should be_nil
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should find tab delimiter" do
|
14
|
+
filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
|
15
|
+
results = Masticate.sniff(filename, :stats => true)
|
16
|
+
results[:col_sep].should == "\t"
|
10
17
|
results[:field_counts].should == {6 => 5}
|
11
18
|
end
|
12
19
|
|
13
20
|
it "should find pipe delimiter" do
|
14
21
|
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
15
|
-
results = Masticate.sniff(filename)
|
22
|
+
results = Masticate.sniff(filename, :stats => true)
|
16
23
|
results[:col_sep].should == '|'
|
17
24
|
results[:field_counts].should == {6 => 5}
|
18
25
|
end
|
19
26
|
|
20
27
|
it "should recognize quotes in CSV sources" do
|
21
28
|
filename = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
|
22
|
-
results = Masticate.sniff(filename)
|
29
|
+
results = Masticate.sniff(filename, :stats => true)
|
23
30
|
results[:col_sep].should == ','
|
24
31
|
results[:quote_char].should == '"'
|
25
32
|
results[:field_counts].should == {14 => 100}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152726520 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152726520
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152749240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152749240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152748720 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152748720
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|