masticate 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/masticate CHANGED
@@ -13,11 +13,15 @@ OptionParser.new do |opts|
13
13
  options[:format] = v
14
14
  end
15
15
 
16
- opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB)") do |v|
16
+ opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB; default is ',')") do |v|
17
17
  options[:col_sep] = v
18
18
  options[:col_sep] = "\t" if options[:col_sep] == "TAB"
19
19
  end
20
20
 
21
+ opts.on("--quote QUOTE-CHAR", "Specify character used for quoting fields (optional; default is no quoting)") do |char|
22
+ options[:quote_char] = char
23
+ end
24
+
21
25
  opts.on("--fields LIST", Array, "Specify fields to select") do |list|
22
26
  options[:fields] = list
23
27
  end
@@ -41,6 +45,10 @@ OptionParser.new do |opts|
41
45
  opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |v|
42
46
  options[:inlined] = v
43
47
  end
48
+
49
+ opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |v|
50
+ options[:dejunk] = v
51
+ end
44
52
  end.parse!
45
53
 
46
54
  filename = ARGV.shift # use stdin if no filename provided
@@ -51,6 +59,9 @@ def logmessage(command, options, results)
51
59
  Lines in input: #{results[:input_count]}
52
60
  Lines in output: #{results[:output_count]}
53
61
  EOT
62
+ if results[:field_counts]
63
+ $stderr.puts " Field counts: #{results[:field_counts].inspect}"
64
+ end
54
65
  end
55
66
 
56
67
  case command
@@ -18,6 +18,7 @@ class Masticate::Mender < Masticate::Base
18
18
  expected_field_count = nil
19
19
  headers = nil
20
20
  @output_count = 0
21
+ fieldcounts = Hash.new(0)
21
22
  with_input do |input|
22
23
  while (line = get) do
23
24
  unless line =~ /^\s*$/
@@ -27,13 +28,15 @@ class Masticate::Mender < Masticate::Base
27
28
  if !expected_field_count
28
29
  headers = row[0..ncells]
29
30
  expected_field_count = headers.count
31
+ fieldcounts[headers.count] += 1
30
32
  emit(headers.to_csv(:col_sep => @col_sep))
31
33
  else
32
34
  if row[0..ncells] != headers
33
35
  raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
34
36
  end
35
37
  end
36
- row = row[ncells+1..-1]
38
+ row = row[ncells+1, expected_field_count]
39
+ fieldcounts[row.count] += 1
37
40
  emit(row.to_csv(:col_sep => @col_sep))
38
41
  elsif !expected_field_count
39
42
  # trust the first row
@@ -49,6 +52,7 @@ class Masticate::Mender < Masticate::Base
49
52
  raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
50
53
  end
51
54
  expected_field_count = headers.count
55
+ fieldcounts[headers.count] += 1
52
56
  emit(headers.to_csv(:col_sep => @col_sep))
53
57
  else
54
58
  running_count = fieldcount(line)
@@ -61,6 +65,7 @@ class Masticate::Mender < Masticate::Base
61
65
  end
62
66
 
63
67
  unless opts[:dejunk] && junky?(line)
68
+ fieldcounts[fieldcount(line)] += 1
64
69
  emit(line)
65
70
  end
66
71
  end
@@ -72,6 +77,7 @@ class Masticate::Mender < Masticate::Base
72
77
  {
73
78
  :input_count => @input_count,
74
79
  :output_count => @output_count,
80
+ :field_counts => fieldcounts,
75
81
  :headers => headers
76
82
  }
77
83
  end
@@ -81,7 +87,7 @@ class Masticate::Mender < Masticate::Base
81
87
  end
82
88
 
83
89
  def explode(line)
84
- CSV.parse_line(line, :col_sep => col_sep, :quote_char => @quote_char)
90
+ CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char)
85
91
  end
86
92
 
87
93
  # a line is "junky" if it has 2 or fewer fields with any content
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -4,7 +4,7 @@ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NA
4
4
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
5
5
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
6
6
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
7
- LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
7
+ LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973,3/10/2012
8
8
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
9
9
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
10
10
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
@@ -1,11 +1,11 @@
1
1
  LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
2
- WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
3
- JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
4
- ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
5
- ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
6
- HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
7
- MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
8
- FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
9
- LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
10
- MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
11
- REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
2
+ WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975
3
+ JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976
4
+ ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978
5
+ ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966
6
+ HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968
7
+ MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985
8
+ FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,"Rehab Svcs, xyz",F,03/15/1973
9
+ LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960
10
+ MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983
11
+ REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976
@@ -1,5 +1,5 @@
1
1
  3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
2
2
  15267,407,201201060140,407,201201060140,0,201201060309,L
3
- 15267,381,201201060222,381,201201060222,0,201201060647,X
3
+ 15267,381,201201060222,381,201201060222,"abc,def",201201060647,X
4
4
  15267,407,201201060311,407,201201060311,0,201201060339,L
5
5
  15267,407,201201060514,108,201201060515,108,201201060515,SEC
@@ -30,12 +30,13 @@ describe "mending" do
30
30
  it "should unfold inlined headers" do
31
31
  filename = File.dirname(__FILE__) + "/../data/inlined_headers.csv"
32
32
  tmp = Tempfile.new('mending')
33
- results = Masticate.mend(filename, :inlined => true, :output => tmp)
33
+ results = Masticate.mend(filename, :col_sep => ',', :quote_char => '"', :inlined => true, :output => tmp)
34
34
  output = File.read(tmp)
35
35
  correct_output = File.read(File.dirname(__FILE__) + "/../data/inlined_headers.csv.output")
36
36
 
37
37
  results[:input_count].should == 11
38
38
  results[:output_count].should == 11
39
+ results[:field_counts].should == {11 => 11}
39
40
  output.should == correct_output
40
41
  end
41
42
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-04-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2152293880 !ruby/object:Gem::Requirement
16
+ requirement: &2152447240 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2152293880
24
+ version_requirements: *2152447240
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2152293360 !ruby/object:Gem::Requirement
27
+ requirement: &2152446740 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2152293360
35
+ version_requirements: *2152446740
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2152292900 !ruby/object:Gem::Requirement
38
+ requirement: &2152446280 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2152292900
46
+ version_requirements: *2152446280
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com