masticate 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Guardfile +1 -1
- data/bin/masticate +19 -25
- data/lib/masticate/base.rb +10 -0
- data/lib/masticate/csvify.rb +1 -4
- data/lib/masticate/datify.rb +2 -5
- data/lib/masticate/gsubber.rb +2 -5
- data/lib/masticate/max_rows.rb +51 -0
- data/lib/masticate/mender.rb +27 -13
- data/lib/masticate/plucker.rb +1 -4
- data/lib/masticate/sniffer.rb +2 -7
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +5 -0
- data/spec/data/events.csv +8 -0
- data/spec/data/events_reduced.csv +5 -0
- data/spec/data/inlined_headers.csv +10 -0
- data/spec/data/inlined_headers.csv.output +11 -0
- data/spec/data/junk_trailer.txt +1 -0
- data/spec/lib/maxrow_spec.rb +21 -0
- data/spec/lib/{mend_spec.rb → mender_spec.rb} +15 -3
- metadata +21 -10
data/Guardfile
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
guard 'rspec', :version => 2 do
|
5
5
|
watch(%r{^spec/.+_spec\.rb$})
|
6
|
-
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
|
6
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1].gsub(/masticate./, '')}_spec.rb" }
|
7
7
|
watch('spec/spec_helper.rb') { "spec" }
|
8
8
|
|
9
9
|
watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
|
data/bin/masticate
CHANGED
@@ -37,61 +37,55 @@ OptionParser.new do |opts|
|
|
37
37
|
opts.on("--to STRING", "Result string for gsub conversion") do |s|
|
38
38
|
options[:to] = s
|
39
39
|
end
|
40
|
+
|
41
|
+
opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |v|
|
42
|
+
options[:inlined] = v
|
43
|
+
end
|
40
44
|
end.parse!
|
41
45
|
|
42
46
|
filename = ARGV.shift # use stdin if no filename provided
|
43
47
|
|
48
|
+
def logmessage(command, options, results)
|
49
|
+
$stderr.puts <<-EOT
|
50
|
+
* masticate #{command} (#{options.keys.join(', ')})
|
51
|
+
Lines in input: #{results[:input_count]}
|
52
|
+
Lines in output: #{results[:output_count]}
|
53
|
+
EOT
|
54
|
+
end
|
55
|
+
|
44
56
|
case command
|
45
57
|
when 'sniff'
|
46
58
|
results = Masticate.sniff(filename)
|
47
59
|
col_sep = results[:col_sep]
|
48
60
|
col_sep = "TAB" if col_sep == "\t"
|
61
|
+
quote_char = results[:quote_char] || "NONE"
|
49
62
|
$stderr.puts <<-EOT
|
50
63
|
Processing complete.
|
51
64
|
Input delimiter: #{col_sep}
|
65
|
+
Quote char: #{quote_char}
|
52
66
|
Field counts: #{results[:field_counts].inspect}
|
53
67
|
Headers: #{results[:headers].join(',')}
|
54
68
|
EOT
|
55
69
|
|
56
70
|
when 'mend'
|
57
71
|
results = Masticate.mend(filename, options)
|
58
|
-
$stderr.puts <<-EOT
|
59
|
-
Processing complete.
|
60
|
-
Lines in input: #{results[:input_count]}
|
61
|
-
Lines in output: #{results[:output_count]}
|
62
|
-
EOT
|
72
|
+
logmessage(command, options, results)
|
63
73
|
|
64
74
|
when 'csvify'
|
65
75
|
results = Masticate.csvify(filename, options)
|
66
|
-
$stderr.puts <<-EOT
|
67
|
-
Processing complete.
|
68
|
-
Lines in input: #{results[:input_count]}
|
69
|
-
Lines in output: #{results[:output_count]}
|
70
|
-
EOT
|
76
|
+
logmessage(command, options, results)
|
71
77
|
|
72
78
|
when 'pluck'
|
73
79
|
results = Masticate.pluck(filename, options)
|
74
|
-
$stderr.puts <<-EOT
|
75
|
-
Processing complete.
|
76
|
-
Lines in input: #{results[:input_count]}
|
77
|
-
Lines in output: #{results[:output_count]}
|
78
|
-
EOT
|
80
|
+
logmessage(command, options, results)
|
79
81
|
|
80
82
|
when 'datify'
|
81
83
|
results = Masticate.datify(filename, options)
|
82
|
-
$stderr.puts <<-EOT
|
83
|
-
Processing complete.
|
84
|
-
Lines in input: #{results[:input_count]}
|
85
|
-
Lines in output: #{results[:output_count]}
|
86
|
-
EOT
|
84
|
+
logmessage(command, options, results)
|
87
85
|
|
88
86
|
when 'gsub'
|
89
87
|
results = Masticate.gsub(filename, options)
|
90
|
-
|
91
|
-
# Processing complete.
|
92
|
-
# Lines in input: #{results[:input_count]}
|
93
|
-
# Lines in output: #{results[:output_count]}
|
94
|
-
# EOT
|
88
|
+
logmessage(command, options, results)
|
95
89
|
|
96
90
|
else
|
97
91
|
raise "unknown command #{command}"
|
data/lib/masticate/base.rb
CHANGED
@@ -2,6 +2,7 @@ class Masticate::Base
|
|
2
2
|
attr_reader :filename
|
3
3
|
attr_reader :input, :output
|
4
4
|
attr_reader :input_count, :output_count
|
5
|
+
attr_reader :csv_options
|
5
6
|
|
6
7
|
def initialize(filename)
|
7
8
|
@filename = filename
|
@@ -30,4 +31,13 @@ class Masticate::Base
|
|
30
31
|
# silently ignore this condition, it's not fatal and doesn't need a warning
|
31
32
|
end
|
32
33
|
end
|
34
|
+
|
35
|
+
def standard_options(opts)
|
36
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
37
|
+
@csv_options = {}
|
38
|
+
@csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
39
|
+
if opts[:col_sep]
|
40
|
+
@csv_options[:quote_char] = opts[:quote_char] || "\0"
|
41
|
+
end
|
42
|
+
end
|
33
43
|
end
|
data/lib/masticate/csvify.rb
CHANGED
@@ -7,10 +7,7 @@ class Masticate::Csvify < Masticate::Base
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def csvify(opts)
|
10
|
-
|
11
|
-
csv_options = {}
|
12
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
13
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
10
|
+
standard_options(opts)
|
14
11
|
|
15
12
|
@output_count = 0
|
16
13
|
with_input do |input|
|
data/lib/masticate/datify.rb
CHANGED
@@ -3,10 +3,7 @@ require "csv"
|
|
3
3
|
|
4
4
|
class Masticate::Datify < Masticate::Base
|
5
5
|
def datify(opts)
|
6
|
-
|
7
|
-
csv_options = {}
|
8
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
6
|
+
standard_options(opts)
|
10
7
|
|
11
8
|
field = opts[:field] or raise "missing field to datify"
|
12
9
|
format = opts[:format] or raise "strptime format required for parsing timestamps"
|
@@ -29,7 +26,7 @@ class Masticate::Datify < Masticate::Base
|
|
29
26
|
@output.close if opts[:output]
|
30
27
|
|
31
28
|
{
|
32
|
-
:input_count => input_count,
|
29
|
+
:input_count => @input_count,
|
33
30
|
:output_count => @output_count
|
34
31
|
}
|
35
32
|
end
|
data/lib/masticate/gsubber.rb
CHANGED
@@ -3,10 +3,7 @@ require "csv"
|
|
3
3
|
|
4
4
|
class Masticate::Gsubber < Masticate::Base
|
5
5
|
def gsub(opts)
|
6
|
-
|
7
|
-
csv_options = {}
|
8
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
6
|
+
standard_options(opts)
|
10
7
|
|
11
8
|
field = opts[:field] or raise "missing field to gsub"
|
12
9
|
from = Regexp.new(opts[:from]) or raise "Invalid regex '#{opts[:from]}' for conversion"
|
@@ -19,7 +16,7 @@ class Masticate::Gsubber < Masticate::Base
|
|
19
16
|
row = CSV.parse_line(line, csv_options)
|
20
17
|
if !headers
|
21
18
|
headers = row
|
22
|
-
index = headers.index(field) or raise "Unable to find column '#{field}'"
|
19
|
+
index = headers.index(field) or raise "Unable to find column '#{field}' in headers"
|
23
20
|
emit(line)
|
24
21
|
else
|
25
22
|
oldval = row[index]
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# extract subset of columns from CSV
|
2
|
+
require "csv"
|
3
|
+
|
4
|
+
class Masticate::MaxRows < Masticate::Base
|
5
|
+
def maxrows(opts)
|
6
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
7
|
+
csv_options = {}
|
8
|
+
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
+
csv_options[:quote_char] = opts[:quote_char] || "\0"
|
10
|
+
|
11
|
+
groupby = opts[:by] or raise "missing field to group by"
|
12
|
+
maxon = opts[:max] or raise "missing field to max on"
|
13
|
+
|
14
|
+
@output_count = 0
|
15
|
+
headers = nil
|
16
|
+
accum = {}
|
17
|
+
with_input do |input|
|
18
|
+
while line = get
|
19
|
+
row = CSV.parse_line(line, csv_options)
|
20
|
+
if !headers
|
21
|
+
headers = row
|
22
|
+
index_by = headers.index(groupby) or raise "Unable to find column '#{groupby}'"
|
23
|
+
index_max = headers.index(maxon) or raise "Unable to find column '#{maxon}'"
|
24
|
+
emit(line)
|
25
|
+
else
|
26
|
+
key = row[index_by]
|
27
|
+
if !accum[key]
|
28
|
+
accum[key] = row
|
29
|
+
else
|
30
|
+
oldscore = accum[key][index_max]
|
31
|
+
newscore = row[index_max]
|
32
|
+
if newscore > oldscore
|
33
|
+
accum[key] = row
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
accum.each do |k,row|
|
41
|
+
emit(row.to_csv)
|
42
|
+
end
|
43
|
+
|
44
|
+
@output.close if opts[:output]
|
45
|
+
|
46
|
+
{
|
47
|
+
:input_count => input_count,
|
48
|
+
:output_count => @output_count
|
49
|
+
}
|
50
|
+
end
|
51
|
+
end
|
data/lib/masticate/mender.rb
CHANGED
@@ -13,6 +13,7 @@ class Masticate::Mender < Masticate::Base
|
|
13
13
|
def mend(opts)
|
14
14
|
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
15
15
|
@col_sep = opts[:col_sep] || ','
|
16
|
+
@quote_char = opts[:quote_char] || "\0"
|
16
17
|
|
17
18
|
expected_field_count = nil
|
18
19
|
headers = nil
|
@@ -20,12 +21,28 @@ class Masticate::Mender < Masticate::Base
|
|
20
21
|
with_input do |input|
|
21
22
|
while (line = get) do
|
22
23
|
unless line =~ /^\s*$/
|
23
|
-
if
|
24
|
+
if opts[:inlined]
|
25
|
+
row = explode(line)
|
26
|
+
ncells = row.count/2-1
|
27
|
+
if !expected_field_count
|
28
|
+
headers = row[0..ncells]
|
29
|
+
expected_field_count = headers.count
|
30
|
+
emit(headers.to_csv(:col_sep => @col_sep))
|
31
|
+
else
|
32
|
+
if row[0..ncells] != headers
|
33
|
+
raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
row = row[ncells+1..-1]
|
37
|
+
emit(row.to_csv(:col_sep => @col_sep))
|
38
|
+
elsif !expected_field_count
|
24
39
|
# trust the first row
|
25
|
-
headers = explode(line)
|
40
|
+
headers = explode(line).map(&:strip)
|
26
41
|
case opts[:snip]
|
27
42
|
when Fixnum
|
28
43
|
headers.shift(opts[:snip])
|
44
|
+
when String
|
45
|
+
raise "TODO: snip named header. Multiple?"
|
29
46
|
when nil
|
30
47
|
# do nothing
|
31
48
|
else
|
@@ -43,7 +60,7 @@ class Masticate::Mender < Masticate::Base
|
|
43
60
|
end
|
44
61
|
end
|
45
62
|
|
46
|
-
|
63
|
+
unless opts[:dejunk] && junky?(line)
|
47
64
|
emit(line)
|
48
65
|
end
|
49
66
|
end
|
@@ -60,18 +77,15 @@ class Masticate::Mender < Masticate::Base
|
|
60
77
|
end
|
61
78
|
|
62
79
|
def fieldcount(line)
|
63
|
-
|
64
|
-
CSV.parse_line(line).count
|
65
|
-
else
|
66
|
-
line.count(col_sep)+1
|
67
|
-
end
|
80
|
+
explode(line).count
|
68
81
|
end
|
69
82
|
|
70
83
|
def explode(line)
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
84
|
+
CSV.parse_line(line, :col_sep => col_sep, :quote_char => @quote_char)
|
85
|
+
end
|
86
|
+
|
87
|
+
# a line is "junky" if it has 2 or fewer fields with any content
|
88
|
+
def junky?(line)
|
89
|
+
explode(line).select {|s| s && !s.strip.empty?}.count <= 2
|
76
90
|
end
|
77
91
|
end
|
data/lib/masticate/plucker.rb
CHANGED
@@ -3,10 +3,7 @@ require "csv"
|
|
3
3
|
|
4
4
|
class Masticate::Plucker < Masticate::Base
|
5
5
|
def pluck(opts)
|
6
|
-
|
7
|
-
csv_options = {}
|
8
|
-
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
9
|
-
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
6
|
+
standard_options(opts)
|
10
7
|
|
11
8
|
fields = opts[:fields] or raise "missing fields to pluck"
|
12
9
|
|
data/lib/masticate/sniffer.rb
CHANGED
@@ -67,13 +67,8 @@ class Masticate::Sniffer < Masticate::Base
|
|
67
67
|
end
|
68
68
|
|
69
69
|
def stats
|
70
|
-
|
71
|
-
|
72
|
-
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :quote_char => quote_char).count] += 1}
|
73
|
-
else
|
74
|
-
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
|
75
|
-
end
|
70
|
+
with_input do |input|
|
71
|
+
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :col_sep => col_sep, :quote_char => quote_char || "\0").count] += 1}
|
76
72
|
end
|
77
|
-
counts
|
78
73
|
end
|
79
74
|
end
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -8,6 +8,7 @@ require_relative "masticate/csvify"
|
|
8
8
|
require_relative "masticate/plucker"
|
9
9
|
require_relative "masticate/datify"
|
10
10
|
require_relative "masticate/gsubber"
|
11
|
+
require_relative "masticate/max_rows"
|
11
12
|
|
12
13
|
module Masticate
|
13
14
|
def self.sniff(filename)
|
@@ -33,4 +34,8 @@ module Masticate
|
|
33
34
|
def self.gsub(filename, opts)
|
34
35
|
Gsubber.new(filename).gsub(opts)
|
35
36
|
end
|
37
|
+
|
38
|
+
def self.maxrows(filename, opts)
|
39
|
+
MaxRows.new(filename).maxrows(opts)
|
40
|
+
end
|
36
41
|
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
|
2
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
3
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
|
4
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
5
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
6
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
7
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
|
8
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
9
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
10
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE,REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
@@ -0,0 +1,11 @@
|
|
1
|
+
LAST_NAME,FIRST_NAME,MIDDLE_INIT,DEPT #,Empl #,DATE_HIRED,Term Date ,Status,R_NAME,SEX,BIRTHDATE
|
2
|
+
WASHINGTON,GEORGE,D,824,9556,09/10/2005,07/01/2006,TM,Surgical House Staff,M,09/23/1975,3/10/2012
|
3
|
+
JEFFERSON,TOM,,621,8052,07/23/2001,01/28/2011,TM,Telemetry,F,12/24/1976,3/10/2012
|
4
|
+
ADAMS,JOHN,,655,8834,09/22/2003,,WA,6 East,F,08/07/1978,3/10/2012
|
5
|
+
ADAMS,JOHN QUINCY,A,209,8637,02/24/2003,12/02/2007,TM,Imaging Svcs - MRI,F,11/03/1966,3/10/2012
|
6
|
+
HAMILTON,ANDREW,,278,10065,01/09/2007,11/16/2007,TM,Information Technology,M,09/16/1968,3/10/2012
|
7
|
+
MADISON,JAMES,F,672,10720,01/05/2009,02/16/2010,TM,Rehab Svcs - Outpatients,F,04/15/1985,3/10/2012
|
8
|
+
FRANKLIN,BENJAMIN,R,674,8340,05/01/2002,09/01/2003,TM,Rehab Svcs - PRN,F,03/15/1973,3/10/2012
|
9
|
+
LINCOLN,ABRAHAM,M,634,11340,05/02/2011,,PN,Sibley Ambulatory Surgery Ctr,F,07/11/1960,3/10/2012
|
10
|
+
MONROE,JAMES,L,614,10757,02/16/2009,,RF,Labor & Delivery,F,11/06/1983,3/10/2012
|
11
|
+
REVERE,PAUL,B,424,8568,11/18/2002,06/27/2006,TM,Laundry & Linen,M,12/31/1976,3/10/2012
|
data/spec/data/junk_trailer.txt
CHANGED
@@ -0,0 +1,21 @@
|
|
1
|
+
# spec for picking most-recent or highest-scoring rows
|
2
|
+
#
|
3
|
+
# something like:
|
4
|
+
# select * from rows group by col_a having col_b = max(col_b)
|
5
|
+
#
|
6
|
+
# usage: masticate maxrows --by col_a --max col_b
|
7
|
+
|
8
|
+
require "spec_helper"
|
9
|
+
require "tempfile"
|
10
|
+
|
11
|
+
describe "maxrows" do
|
12
|
+
it "should find " do
|
13
|
+
filename = File.dirname(__FILE__) + "/../data/events.csv"
|
14
|
+
tmp = Tempfile.new('maxrows')
|
15
|
+
results = Masticate.maxrows(filename, :output => tmp, :by => 'uid', :max => 'timestamp')
|
16
|
+
output = File.read(tmp)
|
17
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/events_reduced.csv")
|
18
|
+
|
19
|
+
output.should == correct_output
|
20
|
+
end
|
21
|
+
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# spec for file-sniffing functions
|
2
2
|
|
3
3
|
require "spec_helper"
|
4
|
+
require "tempfile"
|
4
5
|
|
5
6
|
describe "mending" do
|
6
7
|
it "should merge lines when delimiter counts don't match'" do
|
@@ -12,9 +13,8 @@ describe "mending" do
|
|
12
13
|
|
13
14
|
it "should strip trailer records" do
|
14
15
|
filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
|
15
|
-
|
16
|
-
results
|
17
|
-
results[:input_count].should == 9
|
16
|
+
results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null", :dejunk => true)
|
17
|
+
results[:input_count].should == 10
|
18
18
|
results[:output_count].should == 5
|
19
19
|
results[:headers].should == ['COL1', 'COL 2', 'Col 3', 'col-4', 'col5', 'col6']
|
20
20
|
end
|
@@ -26,4 +26,16 @@ describe "mending" do
|
|
26
26
|
results[:output_count].should == 5
|
27
27
|
results[:headers].should == %w(hospid usrorder dteorder usrsend dtesend usrdone dtedone department)
|
28
28
|
end
|
29
|
+
|
30
|
+
it "should unfold inlined headers" do
|
31
|
+
filename = File.dirname(__FILE__) + "/../data/inlined_headers.csv"
|
32
|
+
tmp = Tempfile.new('mending')
|
33
|
+
results = Masticate.mend(filename, :inlined => true, :output => tmp)
|
34
|
+
output = File.read(tmp)
|
35
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/inlined_headers.csv.output")
|
36
|
+
|
37
|
+
results[:input_count].should == 11
|
38
|
+
results[:output_count].should == 11
|
39
|
+
output.should == correct_output
|
40
|
+
end
|
29
41
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152293880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152293880
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152293360 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152293360
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152292900 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152292900
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|
@@ -64,6 +64,7 @@ files:
|
|
64
64
|
- lib/masticate/csvify.rb
|
65
65
|
- lib/masticate/datify.rb
|
66
66
|
- lib/masticate/gsubber.rb
|
67
|
+
- lib/masticate/max_rows.rb
|
67
68
|
- lib/masticate/mender.rb
|
68
69
|
- lib/masticate/plucker.rb
|
69
70
|
- lib/masticate/sniffer.rb
|
@@ -72,6 +73,10 @@ files:
|
|
72
73
|
- spec/data/badnums.csv
|
73
74
|
- spec/data/badnums_fixed.csv
|
74
75
|
- spec/data/broken_psv.txt
|
76
|
+
- spec/data/events.csv
|
77
|
+
- spec/data/events_reduced.csv
|
78
|
+
- spec/data/inlined_headers.csv
|
79
|
+
- spec/data/inlined_headers.csv.output
|
75
80
|
- spec/data/junk_header.csv
|
76
81
|
- spec/data/junk_trailer.txt
|
77
82
|
- spec/data/namedcols.csv
|
@@ -81,7 +86,8 @@ files:
|
|
81
86
|
- spec/data/tabbed_data.txt
|
82
87
|
- spec/lib/csvify_spec.rb
|
83
88
|
- spec/lib/gsub_spec.rb
|
84
|
-
- spec/lib/mend_spec.rb
|
89
|
+
- spec/lib/maxrow_spec.rb
|
90
|
+
- spec/lib/mender_spec.rb
|
85
91
|
- spec/lib/plucker_spec.rb
|
86
92
|
- spec/lib/sniffer_spec.rb
|
87
93
|
- spec/spec_helper.rb
|
@@ -113,6 +119,10 @@ test_files:
|
|
113
119
|
- spec/data/badnums.csv
|
114
120
|
- spec/data/badnums_fixed.csv
|
115
121
|
- spec/data/broken_psv.txt
|
122
|
+
- spec/data/events.csv
|
123
|
+
- spec/data/events_reduced.csv
|
124
|
+
- spec/data/inlined_headers.csv
|
125
|
+
- spec/data/inlined_headers.csv.output
|
116
126
|
- spec/data/junk_header.csv
|
117
127
|
- spec/data/junk_trailer.txt
|
118
128
|
- spec/data/namedcols.csv
|
@@ -122,7 +132,8 @@ test_files:
|
|
122
132
|
- spec/data/tabbed_data.txt
|
123
133
|
- spec/lib/csvify_spec.rb
|
124
134
|
- spec/lib/gsub_spec.rb
|
125
|
-
- spec/lib/mend_spec.rb
|
135
|
+
- spec/lib/maxrow_spec.rb
|
136
|
+
- spec/lib/mender_spec.rb
|
126
137
|
- spec/lib/plucker_spec.rb
|
127
138
|
- spec/lib/sniffer_spec.rb
|
128
139
|
- spec/spec_helper.rb
|