masticate 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Guardfile +1 -1
- data/bin/masticate +19 -25
- data/lib/masticate/base.rb +10 -0
- data/lib/masticate/csvify.rb +1 -4
- data/lib/masticate/datify.rb +2 -5
- data/lib/masticate/gsubber.rb +2 -5
- data/lib/masticate/max_rows.rb +51 -0
- data/lib/masticate/mender.rb +27 -13
- data/lib/masticate/plucker.rb +1 -4
- data/lib/masticate/sniffer.rb +2 -7
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +5 -0
- data/spec/data/events.csv +8 -0
- data/spec/data/events_reduced.csv +5 -0
- data/spec/data/inlined_headers.csv +10 -0
- data/spec/data/inlined_headers.csv.output +11 -0
- data/spec/data/junk_trailer.txt +1 -0
- data/spec/lib/maxrow_spec.rb +21 -0
- data/spec/lib/{mend_spec.rb → mender_spec.rb} +15 -3
- metadata +21 -10
data/Guardfile
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
guard 'rspec', :version => 2 do
|
5
5
|
watch(%r{^spec/.+_spec\.rb$})
|
6
|
-
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
|
6
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1].gsub(/masticate./, '')}_spec.rb" }
|
7
7
|
watch('spec/spec_helper.rb') { "spec" }
|
8
8
|
|
9
9
|
watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
|
data/bin/masticate
CHANGED
@@ -37,61 +37,55 @@ OptionParser.new do |opts|
|
|
37
37
|
opts.on("--to STRING", "Result string for gsub conversion") do |s|
|
38
38
|
options[:to] = s
|
39
39
|
end
|
40
|
+
|
41
|
+
opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |v|
|
42
|
+
options[:inlined] = v
|
43
|
+
end
|
40
44
|
end.parse!
|
41
45
|
|
42
46
|
filename = ARGV.shift # use stdin if no filename provided
|
43
47
|
|
48
|
+
def logmessage(command, options, results)
|
49
|
+
$stderr.puts <<-EOT
|
50
|
+
* masticate #{command} (#{options.keys.join(', ')})
|
51
|
+
Lines in input: #{results[:input_count]}
|
52
|
+
Lines in output: #{results[:output_count]}
|
53
|
+
EOT
|
54
|
+
end
|
55
|
+
|
44
56
|
case command
|
45
57
|
when 'sniff'
|
46
58
|
results = Masticate.sniff(filename)
|
47
59
|
col_sep = results[:col_sep]
|
48
60
|
col_sep = "TAB" if col_sep == "\t"
|
61
|
+
quote_char = results[:quote_char] || "NONE"
|
49
62
|
$stderr.puts <<-EOT
|
50
63
|
Processing complete.
|
51
64
|
Input delimiter: #{col_sep}
|
65
|
+
Quote char: #{quote_char}
|
52
66
|
Field counts: #{results[:field_counts].inspect}
|
53
67
|
Headers: #{results[:headers].join(',')}
|
54
68
|
EOT
|
55
69
|
|
56
70
|
when 'mend'
|
57
71
|
results = Masticate.mend(filename, options)
|
58
|
-
|
59
|
-
Processing complete.
|
60
|
-
Lines in input: #{results[:input_count]}
|
61
|
-
Lines in output: #{results[:output_count]}
|
62
|
-
EOT
|
72
|
+
logmessage(command, options, results)
|
63
73
|
|
64
74
|
when 'csvify'
|
65
75
|
results = Masticate.csvify(filename, options)
|
66
|
-
|
67
|
-
Processing complete.
|
68
|
-
Lines in input: #{results[:input_count]}
|
69
|
-
Lines in output: #{results[:output_count]}
|
70
|
-
EOT
|
76
|
+
logmessage(command, options, results)
|
71
77
|
|
72
78
|
when 'pluck'
|
73
79
|
results = Masticate.pluck(filename, options)
|
74
|
-
|
75
|
-
Processing complete.
|
76
|
-
Lines in input: #{results[:input_count]}
|
77
|
-
Lines in output: #{results[:output_count]}
|
78
|
-
EOT
|
80
|
+
logmessage(command, options, results)
|
79
81
|
|
80
82
|
when 'datify'
|
81
83
|
results = Masticate.datify(filename, options)
|
82
|
-
|
83
|
-
Processing complete.
|
84
|
-
Lines in input: #{results[:input_count]}
|
85
|
-
Lines in output: #{results[:output_count]}
|
86
|
-
EOT
|
84
|
+
logmessage(command, options, results)
|
87
85
|
|
88
86
|
when 'gsub'
|
89
87
|
results = Masticate.gsub(filename, options)
|
90
|
-
|
91
|
-
# Processing complete.
|
92
|
-
# Lines in input: #{results[:input_count]}
|
93
|
-
# Lines in output: #{results[:output_count]}
|
94
|
-
# EOT
|
88
|
+
logmessage(command, options, results)
|
95
89
|
|
96
90
|
else
|
97
91
|
raise "unknown command #{command}"
|
data/lib/masticate/base.rb
CHANGED
@@ -2,6 +2,7 @@ class Masticate::Base
|
|
2
2
|
attr_reader :filename
|
3
3
|
attr_reader :input, :output
|
4
4
|
attr_reader :input_count, :output_count
|
5
|
+
attr_reader :csv_options
|
5
6
|
|
6
7
|
def initialize(filename)
|
7
8
|
@filename = filename
|
@@ -30,4 +31,13 @@ class Masticate::Base
|
|
30
31
|
# silently ignore this condition, it's not fatal and doesn't need a warning
|
31
32
|
end
|
32
33
|
end
|
34
|
+
|
35
|
+
def standard_options(opts)
|
36
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
37
|
+
@csv_options = {}
|
38
|
+
@csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
39
|
+
if opts[:col_sep]
|
40
|
+
@csv_options[:quote_char] = opts[:quote_char] || "\0"
|
41
|
+
end
|
42
|
+
end
|
33
43
|
end
|
data/lib/masticate/csvify.rb
CHANGED
@@ -7,10 +7,7 @@ class Masticate::Csvify < Masticate::Base
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def csvify(opts)
|
10
|
-
|
11
|
-
csv_options = {}
|
12
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
13
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
10
|
+
standard_options(opts)
|
14
11
|
|
15
12
|
@output_count = 0
|
16
13
|
with_input do |input|
|
data/lib/masticate/datify.rb
CHANGED
@@ -3,10 +3,7 @@ require "csv"
|
|
3
3
|
|
4
4
|
class Masticate::Datify < Masticate::Base
|
5
5
|
def datify(opts)
|
6
|
-
|
7
|
-
csv_options = {}
|
8
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
6
|
+
standard_options(opts)
|
10
7
|
|
11
8
|
field = opts[:field] or raise "missing field to datify"
|
12
9
|
format = opts[:format] or raise "strptime format required for parsing timestamps"
|
@@ -29,7 +26,7 @@ class Masticate::Datify < Masticate::Base
|
|
29
26
|
@output.close if opts[:output]
|
30
27
|
|
31
28
|
{
|
32
|
-
:input_count => input_count,
|
29
|
+
:input_count => @input_count,
|
33
30
|
:output_count => @output_count
|
34
31
|
}
|
35
32
|
end
|
data/lib/masticate/gsubber.rb
CHANGED
@@ -3,10 +3,7 @@ require "csv"
|
|
3
3
|
|
4
4
|
class Masticate::Gsubber < Masticate::Base
|
5
5
|
def gsub(opts)
|
6
|
-
|
7
|
-
csv_options = {}
|
8
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
6
|
+
standard_options(opts)
|
10
7
|
|
11
8
|
field = opts[:field] or raise "missing field to gsub"
|
12
9
|
from = Regexp.new(opts[:from]) or raise "Invalid regex '#{opts[:from]}' for conversion"
|
@@ -19,7 +16,7 @@ class Masticate::Gsubber < Masticate::Base
|
|
19
16
|
row = CSV.parse_line(line, csv_options)
|
20
17
|
if !headers
|
21
18
|
headers = row
|
22
|
-
index = headers.index(field) or raise "Unable to find column '#{field}'"
|
19
|
+
index = headers.index(field) or raise "Unable to find column '#{field}' in headers"
|
23
20
|
emit(line)
|
24
21
|
else
|
25
22
|
oldval = row[index]
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# extract subset of columns from CSV
|
2
|
+
require "csv"
|
3
|
+
|
4
|
+
class Masticate::MaxRows < Masticate::Base
|
5
|
+
def maxrows(opts)
|
6
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
7
|
+
csv_options = {}
|
8
|
+
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
+
csv_options[:quote_char] = opts[:quote_char] || "\0"
|
10
|
+
|
11
|
+
groupby = opts[:by] or raise "missing field to group by"
|
12
|
+
maxon = opts[:max] or raise "missing field to max on"
|
13
|
+
|
14
|
+
@output_count = 0
|
15
|
+
headers = nil
|
16
|
+
accum = {}
|
17
|
+
with_input do |input|
|
18
|
+
while line = get
|
19
|
+
row = CSV.parse_line(line, csv_options)
|
20
|
+
if !headers
|
21
|
+
headers = row
|
22
|
+
index_by = headers.index(groupby) or raise "Unable to find column '#{groupby}'"
|
23
|
+
index_max = headers.index(maxon) or raise "Unable to find column '#{maxon}'"
|
24
|
+
emit(line)
|
25
|
+
else
|
26
|
+
key = row[index_by]
|
27
|
+
if !accum[key]
|
28
|
+
accum[key] = row
|
29
|
+
else
|
30
|
+
oldscore = accum[key][index_max]
|
31
|
+
newscore = row[index_max]
|
32
|
+
if newscore > oldscore
|
33
|
+
accum[key] = row
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
accum.each do |k,row|
|
41
|
+
emit(row.to_csv)
|
42
|
+
end
|
43
|
+
|
44
|
+
@output.close if opts[:output]
|
45
|
+
|
46
|
+
{
|
47
|
+
:input_count => input_count,
|
48
|
+
:output_count => @output_count
|
49
|
+
}
|
50
|
+
end
|
51
|
+
end
|
data/lib/masticate/mender.rb
CHANGED
@@ -13,6 +13,7 @@ class Masticate::Mender < Masticate::Base
|
|
13
13
|
def mend(opts)
|
14
14
|
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
15
15
|
@col_sep = opts[:col_sep] || ','
|
16
|
+
@quote_char = opts[:quote_char] || "\0"
|
16
17
|
|
17
18
|
expected_field_count = nil
|
18
19
|
headers = nil
|
@@ -20,12 +21,28 @@ class Masticate::Mender < Masticate::Base
|
|
20
21
|
with_input do |input|
|
21
22
|
while (line = get) do
|
22
23
|
unless line =~ /^\s*$/
|
23
|
-
if
|
24
|
+
if opts[:inlined]
|
25
|
+
row = explode(line)
|
26
|
+
ncells = row.count/2-1
|
27
|
+
if !expected_field_count
|
28
|
+
headers = row[0..ncells]
|
29
|
+
expected_field_count = headers.count
|
30
|
+
emit(headers.to_csv(:col_sep => @col_sep))
|
31
|
+
else
|
32
|
+
if row[0..ncells] != headers
|
33
|
+
raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
row = row[ncells+1..-1]
|
37
|
+
emit(row.to_csv(:col_sep => @col_sep))
|
38
|
+
elsif !expected_field_count
|
24
39
|
# trust the first row
|
25
|
-
headers = explode(line)
|
40
|
+
headers = explode(line).map(&:strip)
|
26
41
|
case opts[:snip]
|
27
42
|
when Fixnum
|
28
43
|
headers.shift(opts[:snip])
|
44
|
+
when String
|
45
|
+
raise "TODO: snip named header. Multiple?"
|
29
46
|
when nil
|
30
47
|
# do nothing
|
31
48
|
else
|
@@ -43,7 +60,7 @@ class Masticate::Mender < Masticate::Base
|
|
43
60
|
end
|
44
61
|
end
|
45
62
|
|
46
|
-
|
63
|
+
unless opts[:dejunk] && junky?(line)
|
47
64
|
emit(line)
|
48
65
|
end
|
49
66
|
end
|
@@ -60,18 +77,15 @@ class Masticate::Mender < Masticate::Base
|
|
60
77
|
end
|
61
78
|
|
62
79
|
def fieldcount(line)
|
63
|
-
|
64
|
-
CSV.parse_line(line).count
|
65
|
-
else
|
66
|
-
line.count(col_sep)+1
|
67
|
-
end
|
80
|
+
explode(line).count
|
68
81
|
end
|
69
82
|
|
70
83
|
def explode(line)
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
84
|
+
CSV.parse_line(line, :col_sep => col_sep, :quote_char => @quote_char)
|
85
|
+
end
|
86
|
+
|
87
|
+
# a line is "junky" if it has 2 or fewer fields with any content
|
88
|
+
def junky?(line)
|
89
|
+
explode(line).select {|s| s && !s.strip.empty?}.count <= 2
|
76
90
|
end
|
77
91
|
end
|
data/lib/masticate/plucker.rb
CHANGED
@@ -3,10 +3,7 @@ require "csv"
|
|
3
3
|
|
4
4
|
class Masticate::Plucker < Masticate::Base
|
5
5
|
def pluck(opts)
|
6
|
-
|
7
|
-
csv_options = {}
|
8
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
6
|
+
standard_options(opts)
|
10
7
|
|
11
8
|
fields = opts[:fields] or raise "missing fields to pluck"
|
12
9
|
|
data/lib/masticate/sniffer.rb
CHANGED
@@ -67,13 +67,8 @@ class Masticate::Sniffer < Masticate::Base
|
|
67
67
|
end
|
68
68
|
|
69
69
|
def stats
|
70
|
-
|
71
|
-
|
72
|
-
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :quote_char => quote_char).count] += 1}
|
73
|
-
else
|
74
|
-
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
|
75
|
-
end
|
70
|
+
with_input do |input|
|
71
|
+
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :col_sep => col_sep, :quote_char => quote_char || "\0").count] += 1}
|
76
72
|
end
|
77
|
-
counts
|
78
73
|
end
|
79
74
|
end
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -8,6 +8,7 @@ require_relative "masticate/csvify"
|
|
8
8
|
require_relative "masticate/plucker"
|
9
9
|
require_relative "masticate/datify"
|
10
10
|
require_relative "masticate/gsubber"
|
11
|
+
require_relative "masticate/max_rows"
|
11
12
|
|
12
13
|
module Masticate
|
13
14
|
def self.sniff(filename)
|
@@ -33,4 +34,8 @@ module Masticate
|
|
33
34
|
def self.gsub(filename, opts)
|
34
35
|
Gsubber.new(filename).gsub(opts)
|
35
36
|
end
|
37
|
+
|
38
|
+
def self.maxrows(filename, opts)
|
39
|
+
MaxRows.new(filename).maxrows(opts)
|
40
|
+
end
|
36
41
|
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
|
2
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
3
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
|
4
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
5
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
6
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
7
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
|
8
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
9
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
10
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
@@ -0,0 +1,11 @@
|
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
|
2
|
+
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
|
3
|
+
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
4
|
+
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
|
5
|
+
ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
6
|
+
HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
7
|
+
MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
8
|
+
FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
|
9
|
+
LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
10
|
+
MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
11
|
+
REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
data/spec/data/junk_trailer.txt
CHANGED
@@ -0,0 +1,21 @@
|
|
1
|
+
# spec for picking most-recent or highest-scoring rows
|
2
|
+
#
|
3
|
+
# something like:
|
4
|
+
# select * from rows group by col_a having col_b = max(col_b)
|
5
|
+
#
|
6
|
+
# usage: masticate maxrows --by col_a --max col_b
|
7
|
+
|
8
|
+
require "spec_helper"
|
9
|
+
require "tempfile"
|
10
|
+
|
11
|
+
describe "maxrows" do
|
12
|
+
it "should find " do
|
13
|
+
filename = File.dirname(__FILE__) + "/../data/events.csv"
|
14
|
+
tmp = Tempfile.new('maxrows')
|
15
|
+
results = Masticate.maxrows(filename, :output => tmp, :by => 'uid', :max => 'timestamp')
|
16
|
+
output = File.read(tmp)
|
17
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/events_reduced.csv")
|
18
|
+
|
19
|
+
output.should == correct_output
|
20
|
+
end
|
21
|
+
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# spec for file-sniffing functions
|
2
2
|
|
3
3
|
require "spec_helper"
|
4
|
+
require "tempfile"
|
4
5
|
|
5
6
|
describe "mending" do
|
6
7
|
it "should merge lines when delimiter counts don't match'" do
|
@@ -12,9 +13,8 @@ describe "mending" do
|
|
12
13
|
|
13
14
|
it "should strip trailer records" do
|
14
15
|
filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
|
15
|
-
|
16
|
-
results
|
17
|
-
results[:input_count].should == 9
|
16
|
+
results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null", :dejunk => true)
|
17
|
+
results[:input_count].should == 10
|
18
18
|
results[:output_count].should == 5
|
19
19
|
results[:headers].should == ['COL1', 'COL 2', 'Col 3', 'col-4', 'col5', 'col6']
|
20
20
|
end
|
@@ -26,4 +26,16 @@ describe "mending" do
|
|
26
26
|
results[:output_count].should == 5
|
27
27
|
results[:headers].should == %w(hospid usrorder dteorder usrsend dtesend usrdone dtedone department)
|
28
28
|
end
|
29
|
+
|
30
|
+
it "should unfold inlined headers" do
|
31
|
+
filename = File.dirname(__FILE__) + "/../data/inlined_headers.csv"
|
32
|
+
tmp = Tempfile.new('mending')
|
33
|
+
results = Masticate.mend(filename, :inlined => true, :output => tmp)
|
34
|
+
output = File.read(tmp)
|
35
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/inlined_headers.csv.output")
|
36
|
+
|
37
|
+
results[:input_count].should == 11
|
38
|
+
results[:output_count].should == 11
|
39
|
+
output.should == correct_output
|
40
|
+
end
|
29
41
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152293880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152293880
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152293360 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152293360
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152292900 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152292900
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|
@@ -64,6 +64,7 @@ files:
|
|
64
64
|
- lib/masticate/csvify.rb
|
65
65
|
- lib/masticate/datify.rb
|
66
66
|
- lib/masticate/gsubber.rb
|
67
|
+
- lib/masticate/max_rows.rb
|
67
68
|
- lib/masticate/mender.rb
|
68
69
|
- lib/masticate/plucker.rb
|
69
70
|
- lib/masticate/sniffer.rb
|
@@ -72,6 +73,10 @@ files:
|
|
72
73
|
- spec/data/badnums.csv
|
73
74
|
- spec/data/badnums_fixed.csv
|
74
75
|
- spec/data/broken_psv.txt
|
76
|
+
- spec/data/events.csv
|
77
|
+
- spec/data/events_reduced.csv
|
78
|
+
- spec/data/inlined_headers.csv
|
79
|
+
- spec/data/inlined_headers.csv.output
|
75
80
|
- spec/data/junk_header.csv
|
76
81
|
- spec/data/junk_trailer.txt
|
77
82
|
- spec/data/namedcols.csv
|
@@ -81,7 +86,8 @@ files:
|
|
81
86
|
- spec/data/tabbed_data.txt
|
82
87
|
- spec/lib/csvify_spec.rb
|
83
88
|
- spec/lib/gsub_spec.rb
|
84
|
-
- spec/lib/
|
89
|
+
- spec/lib/maxrow_spec.rb
|
90
|
+
- spec/lib/mender_spec.rb
|
85
91
|
- spec/lib/plucker_spec.rb
|
86
92
|
- spec/lib/sniffer_spec.rb
|
87
93
|
- spec/spec_helper.rb
|
@@ -113,6 +119,10 @@ test_files:
|
|
113
119
|
- spec/data/badnums.csv
|
114
120
|
- spec/data/badnums_fixed.csv
|
115
121
|
- spec/data/broken_psv.txt
|
122
|
+
- spec/data/events.csv
|
123
|
+
- spec/data/events_reduced.csv
|
124
|
+
- spec/data/inlined_headers.csv
|
125
|
+
- spec/data/inlined_headers.csv.output
|
116
126
|
- spec/data/junk_header.csv
|
117
127
|
- spec/data/junk_trailer.txt
|
118
128
|
- spec/data/namedcols.csv
|
@@ -122,7 +132,8 @@ test_files:
|
|
122
132
|
- spec/data/tabbed_data.txt
|
123
133
|
- spec/lib/csvify_spec.rb
|
124
134
|
- spec/lib/gsub_spec.rb
|
125
|
-
- spec/lib/
|
135
|
+
- spec/lib/maxrow_spec.rb
|
136
|
+
- spec/lib/mender_spec.rb
|
126
137
|
- spec/lib/plucker_spec.rb
|
127
138
|
- spec/lib/sniffer_spec.rb
|
128
139
|
- spec/spec_helper.rb
|