masticate 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/bin/masticate CHANGED
@@ -13,11 +13,15 @@ OptionParser.new do |opts|
13
13
  options[:format] = v
14
14
  end
15
15
 
16
- opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB)") do |v|
16
+ opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB; default is ',')") do |v|
17
17
  options[:col_sep] = v
18
18
  options[:col_sep] = "\t" if options[:col_sep] == "TAB"
19
19
  end
20
20
 
21
+ opts.on("--quote QUOTE-CHAR", "Specify character used for quoting fields (optional; default is no quoting)") do |char|
22
+ options[:quote_char] = char
23
+ end
24
+
21
25
  opts.on("--fields LIST", Array, "Specify fields to select") do |list|
22
26
  options[:fields] = list
23
27
  end
@@ -41,6 +45,10 @@ OptionParser.new do |opts|
41
45
  opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |v|
42
46
  options[:inlined] = v
43
47
  end
48
+
49
+ opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
50
+ options[:dejunk] = v
51
+ end
44
52
  end.parse!
45
53
 
46
54
  filename = ARGV.shift # use stdin if no filename provided
@@ -51,6 +59,9 @@ def logmessage(command, options, results)
51
59
  Lines in input: #{results[:input_count]}
52
60
  Lines in output: #{results[:output_count]}
53
61
  EOT
62
+ if results[:field_counts]
63
+ $stderr.puts " Field counts: #{results[:field_counts].inspect}"
64
+ end
54
65
  end
55
66
 
56
67
  case command
@@ -18,6 +18,7 @@ class Masticate::Mender < Masticate::Base
18
18
  expected_field_count = nil
19
19
  headers = nil
20
20
  @output_count = 0
21
+ fieldcounts = Hash.new(0)
21
22
  with_input do |input|
22
23
  while (line = get) do
23
24
  unless line =~ /^\s*$/
@@ -27,13 +28,15 @@ class Masticate::Mender < Masticate::Base
27
28
  if !expected_field_count
28
29
  headers = row[0..ncells]
29
30
  expected_field_count = headers.count
31
+ fieldcounts[headers.count] += 1
30
32
  emit(headers.to_csv(:col_sep => @col_sep))
31
33
  else
32
34
  if row[0..ncells] != headers
33
35
  raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
34
36
  end
35
37
  end
36
- row = row[ncells+1..-1]
38
+ row = row[ncells+1, expected_field_count]
39
+ fieldcounts[row.count] += 1
37
40
  emit(row.to_csv(:col_sep => @col_sep))
38
41
  elsif !expected_field_count
39
42
  # trust the first row
@@ -49,6 +52,7 @@ class Masticate::Mender < Masticate::Base
49
52
  raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
50
53
  end
51
54
  expected_field_count = headers.count
55
+ fieldcounts[headers.count] += 1
52
56
  emit(headers.to_csv(:col_sep => @col_sep))
53
57
  else
54
58
  running_count = fieldcount(line)
@@ -61,6 +65,7 @@ class Masticate::Mender < Masticate::Base
61
65
  end
62
66
 
63
67
  unless opts[:dejunk] && junky?(line)
68
+ fieldcounts[fieldcount(line)] += 1
64
69
  emit(line)
65
70
  end
66
71
  end
@@ -72,6 +77,7 @@ class Masticate::Mender < Masticate::Base
72
77
  {
73
78
  :input_count => @input_count,
74
79
  :output_count => @output_count,
80
+ :field_counts => fieldcounts,
75
81
  :headers => headers
76
82
  }
77
83
  end
@@ -81,7 +87,7 @@ class Masticate::Mender < Masticate::Base
81
87
  end
82
88
 
83
89
  def explode(line)
84
- CSV.parse_line(line, :col_sep => col_sep, :quote_char => @quote_char)
90
+ CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
85
91
  end
86
92
 
87
93
  # a line is "junky" if it has 2 or fewer fields with any content
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -4,7 +4,7 @@ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NA
4
4
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
5
5
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
6
6
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
7
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
7
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
8
8
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
9
9
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
10
10
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
@@ -1,11 +1,11 @@
1
1
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
2
- WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
3
- JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
4
- ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
5
- ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
6
- HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
7
- MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
8
- FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
9
- LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
10
- MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
11
- REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
2
+ WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
3
+ JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
4
+ ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
5
+ ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
6
+ HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968
7
+ MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
8
+ FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973
9
+ LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
10
+ MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983
11
+ REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976
@@ -1,5 +1,5 @@
1
1
  3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
2
2
  15267,407,201201060140,407,201201060140,0,201201060309,L
3
- 15267,381,201201060222,381,201201060222,0,201201060647,X
3
+ 15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
4
4
  15267,407,201201060311,407,201201060311,0,201201060339,L
5
5
  15267,407,201201060514,108,201201060515,108,201201060515,SEC
@@ -30,12 +30,13 @@ describe "mending" do
30
30
  it "should unfold inlined headers" do
31
31
  filename = File.dirname(__FILE__) + "/../data/inlined_headers.csv"
32
32
  tmp = Tempfile.new('mending')
33
- results = Masticate.mend(filename, :inlined => true, :output => tmp)
33
+ results = Masticate.mend(filename, :col_sep => ',', :quote_char => '"', :inlined => true, :output => tmp)
34
34
  output = File.read(tmp)
35
35
  correct_output = File.read(File.dirname(__FILE__) + "/../data/inlined_headers.csv.output")
36
36
 
37
37
  results[:input_count].should == 11
38
38
  results[:output_count].should == 11
39
+ results[:field_counts].should == {11 => 11}
39
40
  output.should == correct_output
40
41
  end
41
42
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-04-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2152293880 !ruby/object:Gem::Requirement
16
+ requirement: &2152447240 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2152293880
24
+ version_requirements: *2152447240
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2152293360 !ruby/object:Gem::Requirement
27
+ requirement: &2152446740 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2152293360
35
+ version_requirements: *2152446740
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2152292900 !ruby/object:Gem::Requirement
38
+ requirement: &2152446280 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2152292900
46
+ version_requirements: *2152446280
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com