masticate 0.1.3 → 0.1.4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/bin/masticate CHANGED
@@ -49,6 +49,14 @@ OptionParser.new do |opts|
49
49
  opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
50
50
  options[:dejunk] = v
51
51
  end
52
+
53
+ opts.on("--by FIELD", "(for *maxrows* only) Field to group by") do |f|
54
+ options[:by] = f
55
+ end
56
+
57
+ opts.on("--max FIELD", "(for *maxrows* only) Field to find max value for") do |f|
58
+ options[:max] = f
59
+ end
52
60
  end.parse!
53
61
 
54
62
  filename = ARGV.shift # use stdin if no filename provided
@@ -98,6 +106,10 @@ when 'gsub'
98
106
  results = Masticate.gsub(filename, options)
99
107
  logmessage(command, options, results)
100
108
 
109
+ when 'maxrows'
110
+ results = Masticate.maxrows(filename, options)
111
+ logmessage(command, options, results)
112
+
101
113
  else
102
114
  raise "unknown command #{command}"
103
115
  end
@@ -87,7 +87,7 @@ class Masticate::Mender < Masticate::Base
87
87
  end
88
88
 
89
89
  def explode(line)
90
- CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
90
+ CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char).map {|s| s && s.strip}
91
91
  end
92
92
 
93
93
  # a line is "junky" if it has 2 or fewer fields with any content
@@ -14,8 +14,21 @@ class Masticate::Plucker < Masticate::Base
14
14
  row = CSV.parse_line(line, csv_options)
15
15
  if !headers
16
16
  headers = row
17
- indexes = fields.map {|f| headers.index(f) or raise "Unable to find column '#{f}'"}
18
- emit(fields.to_csv)
17
+ indexes = fields.map do |f|
18
+ case f
19
+ when String
20
+ headers.index(f) or raise "Unable to find column '#{f}'"
21
+ when Fixnum
22
+ if f > headers.count
23
+ raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
24
+ else
25
+ f-1
26
+ end
27
+ else
28
+ raise "Invalid field descriptor '#{f}'"
29
+ end
30
+ end
31
+ emit(indexes.map {|i| headers[i]}.to_csv)
19
32
  else
20
33
  emit(indexes.map {|i| row[i]}.to_csv) if row
21
34
  end
@@ -10,10 +10,10 @@ class Masticate::Sniffer < Masticate::Base
10
10
  @filename = filename
11
11
  end
12
12
 
13
- def sniff
13
+ def sniff(opts)
14
14
  @col_sep = find_col_sep
15
15
  @quote_char = delimstats[@col_sep][:quote_char]
16
- @stats = stats
16
+ @stats = stats if opts[:stats]
17
17
  {
18
18
  :col_sep => @col_sep,
19
19
  :quote_char => @quote_char,
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -11,8 +11,8 @@ require_relative "masticate/gsubber"
11
11
  require_relative "masticate/max_rows"
12
12
 
13
13
  module Masticate
14
- def self.sniff(filename)
15
- Sniffer.new(filename).sniff
14
+ def self.sniff(filename, opts = {})
15
+ Sniffer.new(filename).sniff(opts)
16
16
  end
17
17
 
18
18
  def self.mend(filename, opts)
@@ -1,10 +1,10 @@
1
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
2
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
1
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT # ,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
2
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM ,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
3
3
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
4
4
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
5
5
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
6
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
6
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON ,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
7
7
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
8
8
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
9
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
10
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
9
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl # ,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
10
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen ,M,12/31/1976,3/10/2012
@@ -1,4 +1,4 @@
1
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
1
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date,Status,R_NAME,SEX,BIRTHDATE
2
2
  WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
3
3
  JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
4
4
  ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
@@ -1,4 +1,4 @@
1
- 3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
1
+ 3/7/2012,hospid,usrorder ,dteorder,usrsend,dtesend,usrdone,dtedone,department
2
2
  15267,407,201201060140,407,201201060140,0,201201060309,L
3
3
  15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
4
4
  15267,407,201201060311,407,201201060311,0,201201060339,L
@@ -15,4 +15,16 @@ describe "plucker" do
15
15
  results[:input_count].should == 5
16
16
  output.should == correct_output
17
17
  end
18
+
19
+ it "should pull numbered columns starting at 1" do
20
+ filename = File.dirname(__FILE__) + "/../data/namedcols.csv"
21
+ tmp = Tempfile.new('plucker')
22
+ results = Masticate.pluck(filename, :output => tmp, :fields => [3,5])
23
+ output = File.read(tmp)
24
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/namedcols.csv.output")
25
+ tmp.unlink
26
+
27
+ results[:input_count].should == 5
28
+ output.should == correct_output
29
+ end
18
30
  end
@@ -3,23 +3,30 @@
3
3
  require "spec_helper"
4
4
 
5
5
  describe "delimiter sniffing" do
6
- it "should find tab delimiter" do
6
+ it "stats collection should default off" do
7
7
  filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
8
8
  results = Masticate.sniff(filename)
9
9
  results[:col_sep].should == "\t"
10
+ results[:field_counts].should be_nil
11
+ end
12
+
13
+ it "should find tab delimiter" do
14
+ filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
15
+ results = Masticate.sniff(filename, :stats => true)
16
+ results[:col_sep].should == "\t"
10
17
  results[:field_counts].should == {6 => 5}
11
18
  end
12
19
 
13
20
  it "should find pipe delimiter" do
14
21
  filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
15
- results = Masticate.sniff(filename)
22
+ results = Masticate.sniff(filename, :stats => true)
16
23
  results[:col_sep].should == '|'
17
24
  results[:field_counts].should == {6 => 5}
18
25
  end
19
26
 
20
27
  it "should recognize quotes in CSV sources" do
21
28
  filename = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
22
- results = Masticate.sniff(filename)
29
+ results = Masticate.sniff(filename, :stats => true)
23
30
  results[:col_sep].should == ','
24
31
  results[:quote_char].should == '"'
25
32
  results[:field_counts].should == {14 => 100}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-06 00:00:00.000000000 Z
12
+ date: 2012-04-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2152447240 !ruby/object:Gem::Requirement
16
+ requirement: &2152726520 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2152447240
24
+ version_requirements: *2152726520
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2152446740 !ruby/object:Gem::Requirement
27
+ requirement: &2152749240 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2152446740
35
+ version_requirements: *2152749240
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2152446280 !ruby/object:Gem::Requirement
38
+ requirement: &2152748720 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2152446280
46
+ version_requirements: *2152748720
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com