masticate 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/bin/masticate CHANGED
@@ -49,6 +49,14 @@ OptionParser.new do |opts|
49
49
  opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
50
50
  options[:dejunk] = v
51
51
  end
52
+
53
+ opts.on("--by FIELD", "(for *maxrows* only) Field to group by") do |f|
54
+ options[:by] = f
55
+ end
56
+
57
+ opts.on("--max FIELD", "(for *maxrows* only) Field to find max value for") do |f|
58
+ options[:max] = f
59
+ end
52
60
  end.parse!
53
61
 
54
62
  filename = ARGV.shift # use stdin if no filename provided
@@ -98,6 +106,10 @@ when 'gsub'
98
106
  results = Masticate.gsub(filename, options)
99
107
  logmessage(command, options, results)
100
108
 
109
+ when 'maxrows'
110
+ results = Masticate.maxrows(filename, options)
111
+ logmessage(command, options, results)
112
+
101
113
  else
102
114
  raise "unknown command #{command}"
103
115
  end
@@ -87,7 +87,7 @@ class Masticate::Mender < Masticate::Base
87
87
  end
88
88
 
89
89
  def explode(line)
90
- CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
90
+ CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char).map {|s| s && s.strip}
91
91
  end
92
92
 
93
93
  # a line is "junky" if it has 2 or fewer fields with any content
@@ -14,8 +14,21 @@ class Masticate::Plucker < Masticate::Base
14
14
  row = CSV.parse_line(line, csv_options)
15
15
  if !headers
16
16
  headers = row
17
- indexes = fields.map {|f| headers.index(f) or raise "Unable to find column '#{f}'"}
18
- emit(fields.to_csv)
17
+ indexes = fields.map do |f|
18
+ case f
19
+ when String
20
+ headers.index(f) or raise "Unable to find column '#{f}'"
21
+ when Fixnum
22
+ if f > headers.count
23
+ raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
24
+ else
25
+ f-1
26
+ end
27
+ else
28
+ raise "Invalid field descriptor '#{f}'"
29
+ end
30
+ end
31
+ emit(indexes.map {|i| headers[i]}.to_csv)
19
32
  else
20
33
  emit(indexes.map {|i| row[i]}.to_csv) if row
21
34
  end
@@ -10,10 +10,10 @@ class Masticate::Sniffer < Masticate::Base
10
10
  @filename = filename
11
11
  end
12
12
 
13
- def sniff
13
+ def sniff(opts)
14
14
  @col_sep = find_col_sep
15
15
  @quote_char = delimstats[@col_sep][:quote_char]
16
- @stats = stats
16
+ @stats = stats if opts[:stats]
17
17
  {
18
18
  :col_sep => @col_sep,
19
19
  :quote_char => @quote_char,
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -11,8 +11,8 @@ require_relative "masticate/gsubber"
11
11
  require_relative "masticate/max_rows"
12
12
 
13
13
  module Masticate
14
- def self.sniff(filename)
15
- Sniffer.new(filename).sniff
14
+ def self.sniff(filename, opts = {})
15
+ Sniffer.new(filename).sniff(opts)
16
16
  end
17
17
 
18
18
  def self.mend(filename, opts)
@@ -1,10 +1,10 @@
1
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
2
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
1
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT # ,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
2
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM ,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
3
3
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
4
4
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
5
5
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
6
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
6
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON ,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
7
7
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
8
8
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
9
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
10
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
9
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
10
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen ,M,12/31/1976,3/10/2012
@@ -1,4 +1,4 @@
1
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
1
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date,Status,R_NAME,SEX,BIRTHDATE
2
2
  WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
3
3
  JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
4
4
  ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
@@ -1,4 +1,4 @@
1
- 3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
1
+ 3/7/2012,hospid,usrorder ,dteorder,usrsend,dtesend,usrdone,dtedone,department
2
2
  15267,407,201201060140,407,201201060140,0,201201060309,L
3
3
  15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
4
4
  15267,407,201201060311,407,201201060311,0,201201060339,L
@@ -15,4 +15,16 @@ describe "plucker" do
15
15
  results[:input_count].should == 5
16
16
  output.should == correct_output
17
17
  end
18
+
19
+ it "should pull numbered columns starting at 1" do
20
+ filename = File.dirname(__FILE__) + "/../data/namedcols.csv"
21
+ tmp = Tempfile.new('plucker')
22
+ results = Masticate.pluck(filename, :output => tmp, :fields => [3,5])
23
+ output = File.read(tmp)
24
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/namedcols.csv.output")
25
+ tmp.unlink
26
+
27
+ results[:input_count].should == 5
28
+ output.should == correct_output
29
+ end
18
30
  end
@@ -3,23 +3,30 @@
3
3
  require "spec_helper"
4
4
 
5
5
  describe "delimiter sniffing" do
6
- it "should find tab delimiter" do
6
+ it "stats collection should default off" do
7
7
  filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
8
8
  results = Masticate.sniff(filename)
9
9
  results[:col_sep].should == "\t"
10
+ results[:field_counts].should be_nil
11
+ end
12
+
13
+ it "should find tab delimiter" do
14
+ filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
15
+ results = Masticate.sniff(filename, :stats => true)
16
+ results[:col_sep].should == "\t"
10
17
  results[:field_counts].should == {6 => 5}
11
18
  end
12
19
 
13
20
  it "should find pipe delimiter" do
14
21
  filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
15
- results = Masticate.sniff(filename)
22
+ results = Masticate.sniff(filename, :stats => true)
16
23
  results[:col_sep].should == '|'
17
24
  results[:field_counts].should == {6 => 5}
18
25
  end
19
26
 
20
27
  it "should recognize quotes in CSV sources" do
21
28
  filename = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
22
- results = Masticate.sniff(filename)
29
+ results = Masticate.sniff(filename, :stats => true)
23
30
  results[:col_sep].should == ','
24
31
  results[:quote_char].should == '"'
25
32
  results[:field_counts].should == {14 => 100}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-06 00:00:00.000000000 Z
12
+ date: 2012-04-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2152447240 !ruby/object:Gem::Requirement
16
+ requirement: &2152726520 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2152447240
24
+ version_requirements: *2152726520
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2152446740 !ruby/object:Gem::Requirement
27
+ requirement: &2152749240 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2152446740
35
+ version_requirements: *2152749240
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2152446280 !ruby/object:Gem::Requirement
38
+ requirement: &2152748720 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2152446280
46
+ version_requirements: *2152748720
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com