masticate 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/masticate +68 -12
- data/lib/masticate/base.rb +33 -0
- data/lib/masticate/csvify.rb +9 -19
- data/lib/masticate/datify.rb +36 -0
- data/lib/masticate/gsubber.rb +39 -0
- data/lib/masticate/mender.rb +50 -36
- data/lib/masticate/plucker.rb +34 -0
- data/lib/masticate/sniffer.rb +52 -13
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +16 -0
- data/spec/data/badnums.csv +8 -0
- data/spec/data/badnums_fixed.csv +8 -0
- data/spec/data/junk_header.csv +5 -0
- data/spec/data/namedcols.csv +4 -0
- data/spec/data/namedcols.csv.output +4 -0
- data/spec/data/quoted_csv_data.txt +100 -0
- data/spec/lib/csvify_spec.rb +1 -1
- data/spec/lib/gsub_spec.rb +16 -0
- data/spec/lib/mend_spec.rb +13 -4
- data/spec/lib/plucker_spec.rb +18 -0
- data/spec/lib/sniffer_spec.rb +8 -0
- metadata +28 -8
data/bin/masticate
CHANGED
@@ -1,10 +1,47 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require_relative "../lib/masticate"
|
4
|
+
require "optparse"
|
4
5
|
|
5
|
-
command
|
6
|
+
command = ARGV.shift
|
6
7
|
|
7
|
-
|
8
|
+
# Collect command-line switches into a hash that is passed to the selected
# Masticate command. parse! removes the recognized switches from ARGV.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: masticate COMMAND [options] [FILENAME]"

  opts.on("--format FORMAT", "Specify format") do |v|
    options[:format] = v
  end

  opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB)") do |v|
    # the literal word TAB selects a tab character; anything else is used as given
    options[:col_sep] = (v == "TAB") ? "\t" : v
  end

  opts.on("--fields LIST", Array, "Specify fields to select") do |list|
    options[:fields] = list
  end

  opts.on("--field FIELD", "Specify field to convert") do |f|
    options[:field] = f
  end

  opts.on("--snip DIRECTIVE", "Specify header fields to snip: first N, or by name") do |f|
    # NOTE(review): to_i turns any non-numeric directive into 0, so the
    # "by name" form promised by the help text is not honoured here -- confirm intent.
    options[:snip] = f.to_i
  end

  opts.on("--from REGEXP", "Regular expression for gsub conversion") do |s|
    options[:from] = s
  end

  opts.on("--to STRING", "Result string for gsub conversion") do |s|
    options[:to] = s
  end
end.parse!
|
41
|
+
|
42
|
+
filename = ARGV.shift # use stdin if no filename provided
|
43
|
+
|
44
|
+
case command
|
8
45
|
when 'sniff'
|
9
46
|
results = Masticate.sniff(filename)
|
10
47
|
col_sep = results[:col_sep]
|
@@ -13,30 +50,49 @@ when 'sniff'
|
|
13
50
|
Processing complete.
|
14
51
|
Input delimiter: #{col_sep}
|
15
52
|
Field counts: #{results[:field_counts].inspect}
|
53
|
+
Headers: #{results[:headers].join(',')}
|
16
54
|
EOT
|
17
55
|
|
18
56
|
when 'mend'
|
19
|
-
|
20
|
-
col_sep = metadata[:col_sep]
|
21
|
-
col_sep = "TAB" if col_sep == "\t"
|
22
|
-
results = Masticate.mend(filename, metadata)
|
57
|
+
results = Masticate.mend(filename, options)
|
23
58
|
$stderr.puts <<-EOT
|
24
59
|
Processing complete.
|
25
|
-
|
26
|
-
Lines in
|
27
|
-
Lines in output: #{results[:output_records]}
|
60
|
+
Lines in input: #{results[:input_count]}
|
61
|
+
Lines in output: #{results[:output_count]}
|
28
62
|
EOT
|
29
63
|
|
30
64
|
when 'csvify'
|
31
|
-
|
32
|
-
results = Masticate.csvify(filename, metadata)
|
65
|
+
results = Masticate.csvify(filename, options)
|
33
66
|
$stderr.puts <<-EOT
|
34
67
|
Processing complete.
|
35
|
-
Input delimiter: #{metadata[:col_sep]}
|
36
68
|
Lines in input: #{results[:input_count]}
|
37
69
|
Lines in output: #{results[:output_count]}
|
38
70
|
EOT
|
39
71
|
|
72
|
+
when 'pluck'
|
73
|
+
results = Masticate.pluck(filename, options)
|
74
|
+
$stderr.puts <<-EOT
|
75
|
+
Processing complete.
|
76
|
+
Lines in input: #{results[:input_count]}
|
77
|
+
Lines in output: #{results[:output_count]}
|
78
|
+
EOT
|
79
|
+
|
80
|
+
when 'datify'
|
81
|
+
results = Masticate.datify(filename, options)
|
82
|
+
$stderr.puts <<-EOT
|
83
|
+
Processing complete.
|
84
|
+
Lines in input: #{results[:input_count]}
|
85
|
+
Lines in output: #{results[:output_count]}
|
86
|
+
EOT
|
87
|
+
|
88
|
+
when 'gsub'
|
89
|
+
results = Masticate.gsub(filename, options)
|
90
|
+
# $stderr.puts <<-EOT
|
91
|
+
# Processing complete.
|
92
|
+
# Lines in input: #{results[:input_count]}
|
93
|
+
# Lines in output: #{results[:output_count]}
|
94
|
+
# EOT
|
95
|
+
|
40
96
|
else
|
41
97
|
raise "unknown command #{command}"
|
42
98
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# Shared plumbing for the Masticate commands: opens the input source, reads it
# line by line, writes result lines, and keeps running input/output counts.
class Masticate::Base
  attr_reader :filename
  attr_reader :input, :output
  attr_reader :input_count, :output_count

  # filename - name of the input to open; nil (or false) means read from $stdin.
  def initialize(filename)
    @filename = filename
  end

  # Opens the input, resets input_count to 0 and yields the input stream.
  #
  # Returns whatever the block returns. An input opened here is closed even
  # when the block raises; $stdin is never closed.
  def with_input
    # NOTE(review): Kernel#open is kept as-is; masticate.rb requires open-uri,
    # presumably so that URLs can be used as the filename -- confirm.
    @input = @filename ? open(@filename) : $stdin
    @input_count = 0
    begin
      yield @input
    ensure
      @input.close if @filename && !@input.closed?
    end
  end

  # Reads the next line from the current input.
  #
  # Returns the line without its trailing newline, or nil at end of input.
  # input_count is incremented on every call, including the final call that
  # returns nil.
  def get
    line = @input.gets
    @input_count += 1
    line && line.chomp
  end

  # Writes one line to the output and increments output_count.
  # The count starts from 0 if the caller has not initialised it.
  def emit(line)
    @output_count = (@output_count || 0) + 1
    begin
      @output.puts line
    rescue Errno::EPIPE
      # output was closed, e.g. ran piped into `head`
      # silently ignore this condition, it's not fatal and doesn't need a warning
    end
  end
end
|
data/lib/masticate/csvify.rb
CHANGED
@@ -1,11 +1,9 @@
|
|
1
1
|
# convert input to clean standard CSV
|
2
2
|
require "csv"
|
3
3
|
|
4
|
-
class Masticate::Csvify
|
5
|
-
attr_reader :input
|
6
|
-
|
4
|
+
class Masticate::Csvify < Masticate::Base
|
7
5
|
def initialize(filename)
|
8
|
-
@
|
6
|
+
@filename = filename
|
9
7
|
end
|
10
8
|
|
11
9
|
def csvify(opts)
|
@@ -14,26 +12,18 @@ class Masticate::Csvify
|
|
14
12
|
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
15
13
|
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
16
14
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
@output_count = 0
|
16
|
+
with_input do |input|
|
17
|
+
while line = get
|
18
|
+
row = CSV.parse_line(line, csv_options)
|
19
|
+
emit(row.to_csv) if row
|
20
|
+
end
|
21
21
|
end
|
22
22
|
@output.close if opts[:output]
|
23
|
-
|
23
|
+
|
24
24
|
{
|
25
25
|
:input_count => input_count,
|
26
26
|
:output_count => @output_count
|
27
27
|
}
|
28
28
|
end
|
29
|
-
|
30
|
-
def emit(line)
|
31
|
-
@output_count += 1
|
32
|
-
begin
|
33
|
-
@output.puts line
|
34
|
-
rescue Errno::EPIPE
|
35
|
-
# output was closed, e.g. ran piped into `head`
|
36
|
-
# silently ignore this condition, it's not fatal and doesn't need a warning
|
37
|
-
end
|
38
|
-
end
|
39
29
|
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# convert date columns to numerics
|
2
|
+
require "csv"
|
3
|
+
|
4
|
+
class Masticate::Datify < Masticate::Base
  # Rewrites one named column of a delimited file from a formatted timestamp
  # into integer seconds (DateTime -> Time#to_i) and emits every row as CSV.
  #
  # opts - Hash of options:
  #        :field      - header name of the column to convert (required)
  #        :format     - strptime format used to parse the column (required)
  #        :col_sep    - input field delimiter (optional)
  #        :quote_char - input quote character (optional)
  #        :output     - name of the file to write; $stdout when absent
  #
  # Returns a Hash with :input_count and :output_count.
  # Raises RuntimeError when :field or :format is missing, or when the header
  # row does not contain :field.
  def datify(opts)
    # validate before touching the output so a bad call does not truncate the file
    field = opts[:field] or raise "missing field to datify"
    format = opts[:format] or raise "strptime format required for parsing timestamps"

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    headers = nil
    index = nil
    begin
      with_input do
        while line = get
          row = CSV.parse_line(line, csv_options)
          # CSV.parse_line yields nil for an empty line; there is nothing to convert
          next unless row

          if !headers
            # the first parsed row is the header; locate the target column once
            headers = row
            index = headers.index(field) or raise "Unable to find column '#{field}'"
            emit(headers.to_csv)
          else
            # values that are missing or do not match the format become empty fields
            row[index] = begin
              DateTime.strptime(row[index], format).to_time.to_i
            rescue ArgumentError, TypeError, RangeError
              nil
            end
            emit(row.to_csv)
          end
        end
      end
    ensure
      # only close what was opened here; never close $stdout
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# apply a regular-expression substitution (gsub) to one column of a CSV
|
2
|
+
require "csv"
|
3
|
+
|
4
|
+
class Masticate::Gsubber < Masticate::Base
  # Applies String#gsub to one named column of a delimited file and emits the
  # data rows as CSV.
  #
  # opts - Hash of options:
  #        :field      - header name of the column to rewrite (required)
  #        :from       - regular expression source, passed to Regexp.new (required)
  #        :to         - replacement string (required)
  #        :col_sep    - input field delimiter (optional)
  #        :quote_char - input quote character (optional)
  #        :output     - name of the file to write; $stdout when absent
  #
  # Returns a Hash with :input_count and :output_count.
  # Raises RuntimeError when a required option is missing, when :from is not a
  # valid regular expression, or when the header row does not contain :field.
  def gsub(opts)
    # validate before touching the output so a bad call does not truncate the file
    field = opts[:field] or raise "missing field to gsub"
    # Regexp.new never returns nil: a missing pattern raises TypeError and a
    # malformed one raises RegexpError, so translate both into the intended message
    from = begin
      Regexp.new(opts[:from])
    rescue RegexpError, TypeError
      raise "Invalid regex '#{opts[:from]}' for conversion"
    end
    to = opts[:to] or raise "missing 'to' string for gsub"

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    headers = nil
    index = nil
    begin
      with_input do
        while line = get
          row = CSV.parse_line(line, csv_options)
          # CSV.parse_line yields nil for an empty line; there is nothing to rewrite
          next unless row

          if !headers
            headers = row
            index = headers.index(field) or raise "Unable to find column '#{field}'"
            # NOTE(review): the header is passed through verbatim while data rows
            # are re-serialised with to_csv -- confirm this is intended when the
            # input delimiter is not a comma.
            emit(line)
          else
            value = row[index]
            # empty cells parse as nil and are left untouched
            row[index] = value.gsub(from, to) if value
            emit(row.to_csv)
          end
        end
      end
    ensure
      # only close what was opened here; never close $stdout
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end
end
|
data/lib/masticate/mender.rb
CHANGED
@@ -3,61 +3,75 @@
|
|
3
3
|
# A row that contains fewer delimiters than expected has been split across two lines
|
4
4
|
# (due to a newline embedded in a field). Glue those two lines into a single line in the output.
|
5
5
|
|
6
|
-
class Masticate::Mender
|
7
|
-
attr_reader :
|
6
|
+
class Masticate::Mender < Masticate::Base
|
7
|
+
attr_reader :col_sep
|
8
8
|
|
9
9
|
def initialize(filename)
|
10
|
-
@
|
10
|
+
@filename = filename
|
11
11
|
end
|
12
12
|
|
13
13
|
def mend(opts)
|
14
14
|
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
15
|
-
col_sep = opts[:col_sep]
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
15
|
+
@col_sep = opts[:col_sep] || ','
|
16
|
+
|
17
|
+
expected_field_count = nil
|
18
|
+
headers = nil
|
19
|
+
@output_count = 0
|
20
|
+
with_input do |input|
|
21
|
+
while (line = get) do
|
22
|
+
unless line =~ /^\s*$/
|
23
|
+
if !expected_field_count
|
24
|
+
# trust the first row
|
25
|
+
headers = explode(line)
|
26
|
+
case opts[:snip]
|
27
|
+
when Fixnum
|
28
|
+
headers.shift(opts[:snip])
|
29
|
+
when nil
|
30
|
+
# do nothing
|
31
|
+
else
|
32
|
+
raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
|
33
|
+
end
|
34
|
+
expected_field_count = headers.count
|
35
|
+
emit(headers.to_csv(:col_sep => @col_sep))
|
36
|
+
else
|
37
|
+
running_count = fieldcount(line)
|
38
|
+
while !input.eof? && running_count < expected_field_count do
|
39
|
+
nextbit = get
|
40
|
+
if nextbit
|
41
|
+
line = line + ' ' + nextbit
|
42
|
+
running_count = fieldcount(line)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
if line.count(col_sep) > 2
|
47
|
+
emit(line)
|
31
48
|
end
|
32
49
|
end
|
33
50
|
end
|
34
|
-
if line.count(col_sep) > 2
|
35
|
-
emit(line)
|
36
|
-
end
|
37
51
|
end
|
38
52
|
end
|
39
53
|
|
40
|
-
@input.close
|
41
54
|
@output.close if opts[:output]
|
42
55
|
{
|
43
|
-
:
|
44
|
-
:
|
56
|
+
:input_count => @input_count,
|
57
|
+
:output_count => @output_count,
|
58
|
+
:headers => headers
|
45
59
|
}
|
46
60
|
end
|
47
61
|
|
48
|
-
def
|
49
|
-
|
50
|
-
|
51
|
-
|
62
|
+
def fieldcount(line)
|
63
|
+
if col_sep == ','
|
64
|
+
CSV.parse_line(line).count
|
65
|
+
else
|
66
|
+
line.count(col_sep)+1
|
67
|
+
end
|
52
68
|
end
|
53
69
|
|
54
|
-
def
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
# output was closed, e.g. ran piped into `head`
|
60
|
-
# silently ignore this condition, it's not fatal and doesn't need a warning
|
70
|
+
def explode(line)
|
71
|
+
if col_sep == ','
|
72
|
+
CSV.parse_line(line).map(&:strip)
|
73
|
+
else
|
74
|
+
line.split(col_sep).map(&:strip)
|
61
75
|
end
|
62
76
|
end
|
63
77
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# extract subset of columns from CSV
|
2
|
+
require "csv"
|
3
|
+
|
4
|
+
class Masticate::Plucker < Masticate::Base
  # Copies a chosen subset of columns, selected by header name, to the output
  # as CSV. Columns are written in the order given in :fields.
  #
  # opts - Hash of options:
  #        :fields     - Array of header names to keep (required)
  #        :col_sep    - input field delimiter (optional)
  #        :quote_char - input quote character (optional)
  #        :output     - name of the file to write; $stdout when absent
  #
  # Returns a Hash with :input_count and :output_count.
  # Raises RuntimeError when :fields is missing or names a column that is not
  # present in the header row.
  def pluck(opts)
    # validate before touching the output so a bad call does not truncate the file
    fields = opts[:fields] or raise "missing fields to pluck"

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    headers = nil
    indexes = nil
    begin
      with_input do
        while line = get
          row = CSV.parse_line(line, csv_options)
          # CSV.parse_line yields nil for an empty line; skip it, including
          # blank lines ahead of the header row
          next unless row

          if !headers
            headers = row
            # resolve each requested name to its column position once
            indexes = fields.map {|f| headers.index(f) or raise "Unable to find column '#{f}'"}
            emit(fields.to_csv)
          else
            emit(indexes.map {|i| row[i]}.to_csv)
          end
        end
      end
    ensure
      # only close what was opened here; never close $stdout
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end
end
|
data/lib/masticate/sniffer.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require "set"
|
2
|
+
|
3
|
+
class Masticate::Sniffer < Masticate::Base
|
4
|
+
attr_reader :col_sep, :quote_char, :stats
|
5
|
+
attr_reader :delimstats
|
3
6
|
|
4
7
|
CandidateDelimiters = [',', '|', "\t"]
|
5
8
|
|
@@ -9,32 +12,68 @@ class Masticate::Sniffer
|
|
9
12
|
|
10
13
|
def sniff
|
11
14
|
@col_sep = find_col_sep
|
15
|
+
@quote_char = delimstats[@col_sep][:quote_char]
|
12
16
|
@stats = stats
|
13
17
|
{
|
14
18
|
:col_sep => @col_sep,
|
19
|
+
:quote_char => @quote_char,
|
15
20
|
:field_counts => @stats,
|
16
|
-
:
|
21
|
+
:headers => @line1.split(@col_sep).map(&:strip)
|
17
22
|
}
|
18
23
|
end
|
19
24
|
|
20
25
|
def find_col_sep
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
26
|
+
@delimstats = {}
|
27
|
+
with_input do |input|
|
28
|
+
input.lines.take(10).each do |line|
|
29
|
+
@line1 = line unless @line1
|
30
|
+
|
31
|
+
CandidateDelimiters.each do |delim|
|
32
|
+
delimstats[delim] ||= { :counts => Set.new, :quote_char => nil}
|
33
|
+
h = delimstats[delim]
|
34
|
+
fieldcount, quote_char = consider_delim(line, delim)
|
35
|
+
h[:counts] << fieldcount
|
36
|
+
h[:quote_char] ||= quote_char
|
37
|
+
end
|
38
|
+
end
|
25
39
|
end
|
26
|
-
|
27
|
-
delimcounts.sort_by{|h,v| -v}.first.first
|
40
|
+
delimstats.sort_by{|delim,stats| stats[:counts].max || 0}.last.first
|
28
41
|
end
|
29
42
|
|
30
43
|
def consider_delim(line, delim)
|
31
|
-
|
44
|
+
@quote_char = nil
|
45
|
+
n = count_fields(line, delim)
|
46
|
+
[n, @quote_char]
|
47
|
+
end
|
48
|
+
|
49
|
+
def count_fields(line, delim)
|
50
|
+
if delim == ','
|
51
|
+
straight_count = line.count(delim) + 1
|
52
|
+
count_with_quoting = begin
|
53
|
+
CSV.parse_line(line).count
|
54
|
+
rescue CSV::MalformedCSVError
|
55
|
+
# this is not valid CSV, e.g. has incorrectly embedded quotes
|
56
|
+
0
|
57
|
+
end
|
58
|
+
if count_with_quoting < straight_count
|
59
|
+
@quote_char = '"'
|
60
|
+
count_with_quoting
|
61
|
+
else
|
62
|
+
straight_count
|
63
|
+
end
|
64
|
+
else
|
65
|
+
line.count(delim) + 1
|
66
|
+
end
|
32
67
|
end
|
33
68
|
|
34
69
|
def stats
|
35
|
-
|
36
|
-
|
37
|
-
|
70
|
+
counts = with_input do |input|
|
71
|
+
if col_sep == ',' && quote_char
|
72
|
+
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :quote_char => quote_char).count] += 1}
|
73
|
+
else
|
74
|
+
input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
|
75
|
+
end
|
76
|
+
end
|
38
77
|
counts
|
39
78
|
end
|
40
79
|
end
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
1
|
require "open-uri"
|
2
2
|
|
3
3
|
require_relative "masticate/version"
|
4
|
+
require_relative "masticate/base"
|
4
5
|
require_relative "masticate/sniffer"
|
5
6
|
require_relative "masticate/mender"
|
6
7
|
require_relative "masticate/csvify"
|
8
|
+
require_relative "masticate/plucker"
|
9
|
+
require_relative "masticate/datify"
|
10
|
+
require_relative "masticate/gsubber"
|
7
11
|
|
8
12
|
module Masticate
|
9
13
|
def self.sniff(filename)
|
@@ -17,4 +21,16 @@ module Masticate
|
|
17
21
|
def self.csvify(filename, opts)
|
18
22
|
Csvify.new(filename).csvify(opts)
|
19
23
|
end
|
24
|
+
|
25
|
+
def self.pluck(filename, opts)
|
26
|
+
Plucker.new(filename).pluck(opts)
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.datify(filename, opts)
|
30
|
+
Datify.new(filename).datify(opts)
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.gsub(filename, opts)
|
34
|
+
Gsubber.new(filename).gsub(opts)
|
35
|
+
end
|
20
36
|
end
|
@@ -0,0 +1,5 @@
|
|
1
|
+
3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
|
2
|
+
15267,407,201201060140,407,201201060140,0,201201060309,L
|
3
|
+
15267,381,201201060222,381,201201060222,0,201201060647,X
|
4
|
+
15267,407,201201060311,407,201201060311,0,201201060339,L
|
5
|
+
15267,407,201201060514,108,201201060515,108,201201060515,SEC
|
@@ -0,0 +1,100 @@
|
|
1
|
+
site,ibex,unit,face,doctor,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department,order_number
|
2
|
+
1,20120106003230,2044272,L,407,15267,407,201201060140,407,201201060140,0,201201060309,L,"594,756"
|
3
|
+
1,20120106003230,2044277,X,407,15267,381,201201060222,381,201201060222,0,201201060647,X,"594,761"
|
4
|
+
1,20120106003230,2044309,L,407,15267,407,201201060311,407,201201060311,0,201201060339,L,"594,766"
|
5
|
+
1,20120106003230,,Q,407,15267,407,201201060514,108,201201060515,108,201201060515,SEC,"594,787"
|
6
|
+
1,20120106024355,,Q,407,15267,407,201201060309,90,201201060316,90,201201060316,IV,"594,764"
|
7
|
+
1,20120106024355,2044306,L,407,15267,407,201201060309,407,201201060309,0,201201060345,L,"594,763"
|
8
|
+
1,20120106024355,2044308,X,407,15267,407,201201060310,407,201201060310,0,201201060556,X,"594,765"
|
9
|
+
1,20120106024355,2044307,L,407,15267,407,201201060309,407,201201060309,0,201201060333,L,"594,762"
|
10
|
+
1,20120106024355,,Q,407,15267,407,201201060520,108,201201060522,108,201201060522,SEC,"594,789"
|
11
|
+
1,20120106024355,2044579,L,407,15267,68,201201060826,68,201201060826,0,201201071149,L,"594,823"
|
12
|
+
1,20120106032719,2044345,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,775"
|
13
|
+
1,20120106032719,2044344,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,777"
|
14
|
+
1,20120106032719,2044343,L,407,15267,407,201201060348,407,201201060348,0,201201060428,L,"594,773"
|
15
|
+
1,20120106032719,,Q,407,15267,407,201201060348,426,201201060408,426,201201060408,IV,"594,774"
|
16
|
+
1,20120106032719,,Q,407,15267,407,201201060348,426,201201060634,426,201201060634,URINE,"594,776"
|
17
|
+
1,20120106032719,2044386,L,407,15267,407,201201060445,407,201201060445,0,201201060519,L,"594,785"
|
18
|
+
1,20120106032719,2044401,X,407,15267,407,201201060521,407,201201060521,0,201201060646,X,"594,790"
|
19
|
+
1,20120106033235,,Q,407,15267,407,201201060347,74,201201060353,74,201201060353,IV,"594,769"
|
20
|
+
1,20120106033235,2044349,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,771"
|
21
|
+
1,20120106033235,2044350,L,407,15267,407,201201060347,74,201201060353,0,201201060434,URINE,"594,770"
|
22
|
+
1,20120106033235,2044347,L,407,15267,407,201201060347,74,201201060353,0,201201060428,L,"594,768"
|
23
|
+
1,20120106033235,2044348,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,772"
|
24
|
+
1,20120106033235,2044372,X,407,15267,407,201201060429,407,201201060429,0,201201060649,X,"594,780"
|
25
|
+
1,20120106035346,,Q,407,15267,407,201201060446,426,201201060448,426,201201060448,N,"594,786"
|
26
|
+
1,20120106041426,2044383,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,784"
|
27
|
+
1,20120106041426,2044384,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,782"
|
28
|
+
1,20120106041426,2044382,L,407,15267,407,201201060445,407,201201060445,0,201201060522,L,"594,781"
|
29
|
+
1,20120106041426,,Q,407,15267,407,201201060445,381,201201060452,381,201201060452,IV,"594,783"
|
30
|
+
1,20120106043025,2044400,X,407,15267,407,201201060515,407,201201060515,0,201201060554,X,"594,788"
|
31
|
+
1,20120106045326,2044411,R,407,15267,407,201201060535,407,201201060535,0,201201060630,RS,"594,791"
|
32
|
+
1,20120106045326,,Q,407,15267,407,201201060535,108,201201060540,108,201201060540,SEC,"594,794"
|
33
|
+
1,20120106045326,2044412,R,407,15267,407,201201060535,407,201201060535,0,201201060629,RS,"594,795"
|
34
|
+
1,20120106045326,2044413,X,407,15267,407,201201060536,407,201201060536,0,201201060649,X,"594,796"
|
35
|
+
1,20120106045326,,Q,407,15267,407,201201060535,108,201201060541,108,201201060541,SEC,"594,792"
|
36
|
+
1,20120106045326,2044410,R,407,15267,407,201201060535,407,201201060535,0,201201060628,RS,"594,793"
|
37
|
+
1,20120106052714,2044421,L,407,15267,407,201201060544,407,201201060544,0,201201060605,L,"594,797"
|
38
|
+
1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,IV,"594,799"
|
39
|
+
1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,N,"594,800"
|
40
|
+
1,20120106052714,2044422,L,407,15267,407,201201060544,407,201201060544,0,201201060621,L,"594,801"
|
41
|
+
1,20120106052714,2044423,L,407,15267,407,201201060544,407,201201060544,0,201201060727,L,"594,798"
|
42
|
+
1,20120106052714,2044424,L,407,15267,407,201201060551,407,201201060551,0,201201060714,L,"594,802"
|
43
|
+
1,20120106070243,2044439,L,504,15550,504,201201060721,504,201201060721,0,201201060753,L,"594,803"
|
44
|
+
1,20120106070243,2044440,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,807"
|
45
|
+
1,20120106070243,2044441,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,806"
|
46
|
+
1,20120106070243,,Q,504,15550,504,201201060721,155,201201060735,155,201201060735,IV,"594,805"
|
47
|
+
1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,820"
|
48
|
+
1,20120106070243,2044524,L,504,15550,504,201201060806,504,201201060806,0,201201061004,L,"594,816"
|
49
|
+
1,20120106070243,,Q,504,15550,504,201201060807,195,201201060813,195,201201060813,SEC,"594,822"
|
50
|
+
1,20120106070243,2044522,L,504,15550,504,201201060806,504,201201060806,0,201201060959,L,"594,819"
|
51
|
+
1,20120106070243,,Q,504,15550,504,201201060807,195,201201060811,195,201201060811,SEC,"594,821"
|
52
|
+
1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,818"
|
53
|
+
1,20120106070243,,Q,504,15550,504,201201060910,155,201201060916,155,201201060916,N,"594,831"
|
54
|
+
1,20120106070243,2044716,X,504,15550,504,201201060928,504,201201060928,0,201201060953,X,"594,834"
|
55
|
+
1,20120106073142,2044480,X,504,15550,504,201201060757,504,201201060757,0,201201060819,X,"594,815"
|
56
|
+
1,20120106073757,2044475,L,504,15550,504,201201060749,155,201201060755,0,201201060925,URINE,"594,810"
|
57
|
+
1,20120106073757,2044466,L,504,15550,504,201201060749,504,201201060749,0,201201060827,L,"594,808"
|
58
|
+
1,20120106073757,2044470,X,504,15550,504,201201060749,504,201201060749,0,201201060818,X,"594,809"
|
59
|
+
1,20120106073757,2044467,L,504,15550,504,201201060749,504,201201060749,0,201201060826,L,"594,813"
|
60
|
+
1,20120106073757,2044468,L,504,15550,504,201201060749,504,201201060749,0,201201060839,L,"594,811"
|
61
|
+
1,20120106073757,2044469,L,504,15550,504,201201060749,504,201201060749,0,201201060825,L,"594,814"
|
62
|
+
1,20120106073757,,Q,504,15550,504,201201060749,155,201201060755,155,201201060755,IV,"594,812"
|
63
|
+
1,20120106073757,,Q,504,15550,504,201201060911,76,201201060933,76,201201060933,IV,"594,832"
|
64
|
+
1,20120106073757,,Q,504,15550,504,201201060928,34,201201060934,34,201201060934,SEC,"594,833"
|
65
|
+
1,20120106073757,,Q,504,15550,504,201201061022,155,201201061108,155,201201061108,IV,"594,862"
|
66
|
+
1,20120106073757,,Q,504,15550,504,201201061019,155,201201061025,155,201201061025,IV,"594,861"
|
67
|
+
1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,896"
|
68
|
+
1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,895"
|
69
|
+
1,20120106073757,2045028,X,504,15550,504,201201061131,504,201201061131,0,201201061209,X,"594,898"
|
70
|
+
1,20120106073757,2045029,X,504,15550,504,201201061131,504,201201061131,0,201201061345,X,"594,897"
|
71
|
+
1,20120106073757,,Q,504,15550,504,201201061131,155,201201061223,155,201201061223,N,"594,894"
|
72
|
+
1,20120106084347,2044639,X,504,15550,76,201201060850,76,201201060850,0,201201060931,X,"594,828"
|
73
|
+
1,20120106084720,2044670,X,55,4644,55,201201060909,55,201201060909,0,201201060934,X,"594,829"
|
74
|
+
1,20120106084720,,Q,55,4644,55,201201060910,66,201201060914,66,201201060914,N,"594,830"
|
75
|
+
1,20120106085558,2044755,L,55,4644,55,201201060949,55,201201060949,0,201201061018,L,"594,846"
|
76
|
+
1,20120106085558,2044756,L,55,4644,55,201201060949,55,201201060949,0,201201061038,L,"594,851"
|
77
|
+
1,20120106085558,2044793,L,55,4644,55,201201060949,76,201201061003,0,201201061239,URINE,"594,848"
|
78
|
+
1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,850"
|
79
|
+
1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,847"
|
80
|
+
1,20120106085558,2044757,L,55,4644,55,201201060949,55,201201060949,0,201201061040,L,"594,849"
|
81
|
+
1,20120106085558,2044843,L,55,4644,55,201201061033,55,201201061033,0,201201071505,L,"594,864"
|
82
|
+
1,20120106085558,2044841,X,55,4644,55,201201061032,55,201201061032,0,201201061136,X,"594,863"
|
83
|
+
1,20120106085558,2044844,L,55,4644,55,201201061033,55,201201061033,0,201201061119,L,"594,865"
|
84
|
+
1,20120106085558,,Q,55,4644,55,201201061228,195,201201061240,195,201201061240,SEC,"594,961"
|
85
|
+
1,20120106091726,2044741,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,839"
|
86
|
+
1,20120106091726,2044745,X,504,15550,504,201201060942,504,201201060942,0,201201061016,X,"594,835"
|
87
|
+
1,20120106091726,2044746,L,504,15550,504,201201060942,504,201201060942,0,201201061107,L,"594,842"
|
88
|
+
1,20120106091726,2044740,L,504,15550,504,201201060942,504,201201060942,0,201201061017,L,"594,836"
|
89
|
+
1,20120106091726,2044744,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,838"
|
90
|
+
1,20120106091726,2044742,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,841"
|
91
|
+
1,20120106091726,,Q,504,15550,504,201201060942,66,201201060944,66,201201060944,IV,"594,837"
|
92
|
+
1,20120106091726,2044743,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,840"
|
93
|
+
1,20120106095129,2044814,X,55,4644,55,201201061010,55,201201061010,0,201201061037,X,"594,853"
|
94
|
+
1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,IV,"594,857"
|
95
|
+
1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,858"
|
96
|
+
1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,859"
|
97
|
+
1,20120106100014,2044815,L,504,15550,504,201201061011,504,201201061011,0,201201061023,L,"594,854"
|
98
|
+
1,20120106100014,2044817,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,856"
|
99
|
+
1,20120106100014,2044818,X,504,15550,504,201201061011,504,201201061011,0,201201061038,X,"594,855"
|
100
|
+
1,20120106100014,2044816,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,860"
|
data/spec/lib/csvify_spec.rb
CHANGED
@@ -0,0 +1,16 @@
|
|
1
|
+
# spec for field regexp conversion
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
require "tempfile"
|
5
|
+
|
6
|
+
describe "gsubbing" do
  it "should apply conversion to a single column" do
    filename = File.dirname(__FILE__) + "/../data/badnums.csv"
    tmp = Tempfile.new('gsubber')
    # NOTE(review): Gsubber passes :from straight to Regexp.new, so the leading
    # and trailing slashes below are literal characters of the pattern rather
    # than regexp delimiters -- confirm this is the intended expression.
    Masticate.gsub(filename, :output => tmp, :field => 'AuditByID', :from => '/,|(.00$)/', :to => '')
    output = File.read(tmp)
    correct_output = File.read(File.dirname(__FILE__) + "/../data/badnums_fixed.csv")
    # remove the temporary file once its contents have been captured
    tmp.unlink

    output.should == correct_output
  end
end
|
data/spec/lib/mend_spec.rb
CHANGED
@@ -6,15 +6,24 @@ describe "mending" do
|
|
6
6
|
it "should merge lines when delimiter counts don't match'" do
|
7
7
|
filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
|
8
8
|
results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null")
|
9
|
-
results[:
|
10
|
-
results[:
|
9
|
+
results[:input_count].should == 7
|
10
|
+
results[:output_count].should == 5
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should strip trailer records" do
|
14
14
|
filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
|
15
15
|
metadata = Masticate.sniff(filename)
|
16
16
|
results = Masticate.mend(filename, metadata.merge(:output => "/dev/null"))
|
17
|
-
results[:
|
18
|
-
results[:
|
17
|
+
results[:input_count].should == 9
|
18
|
+
results[:output_count].should == 5
|
19
|
+
results[:headers].should == ['COL1', 'COL 2', 'Col 3', 'col-4', 'col5', 'col6']
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should snip head fields" do
|
23
|
+
filename = File.dirname(__FILE__) + "/../data/junk_header.csv"
|
24
|
+
results = Masticate.mend(filename, :col_sep => ',', :snip => 1, :output => "/dev/null")
|
25
|
+
results[:input_count].should == 6
|
26
|
+
results[:output_count].should == 5
|
27
|
+
results[:headers].should == %w(hospid usrorder dteorder usrsend dtesend usrdone dtedone department)
|
19
28
|
end
|
20
29
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# spec for column-plucking functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
require "tempfile"
|
5
|
+
|
6
|
+
describe "plucker" do
  # Plucks two named columns from the fixture and compares the result, byte
  # for byte, with the stored expected output.
  it "should pull named columns" do
    data_dir = File.dirname(__FILE__) + "/../data"
    tmp = Tempfile.new('plucker')

    results = Masticate.pluck("#{data_dir}/namedcols.csv", :output => tmp, :fields => %w(three five))

    output = File.read(tmp)
    correct_output = File.read("#{data_dir}/namedcols.csv.output")
    # the temporary file is no longer needed once its contents are in memory
    tmp.unlink

    results[:input_count].should == 5
    output.should == correct_output
  end
end
|
data/spec/lib/sniffer_spec.rb
CHANGED
@@ -16,4 +16,12 @@ describe "delimiter sniffing" do
|
|
16
16
|
results[:col_sep].should == '|'
|
17
17
|
results[:field_counts].should == {6 => 5}
|
18
18
|
end
|
19
|
+
|
20
|
+
it "should recognize quotes in CSV sources" do
|
21
|
+
filename = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
|
22
|
+
results = Masticate.sniff(filename)
|
23
|
+
results[:col_sep].should == ','
|
24
|
+
results[:quote_char].should == '"'
|
25
|
+
results[:field_counts].should == {14 => 100}
|
26
|
+
end
|
19
27
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153339260 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153339260
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2153338760 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2153338760
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2153338100 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153338100
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|
@@ -60,17 +60,29 @@ files:
|
|
60
60
|
- Rakefile
|
61
61
|
- bin/masticate
|
62
62
|
- lib/masticate.rb
|
63
|
+
- lib/masticate/base.rb
|
63
64
|
- lib/masticate/csvify.rb
|
65
|
+
- lib/masticate/datify.rb
|
66
|
+
- lib/masticate/gsubber.rb
|
64
67
|
- lib/masticate/mender.rb
|
68
|
+
- lib/masticate/plucker.rb
|
65
69
|
- lib/masticate/sniffer.rb
|
66
70
|
- lib/masticate/version.rb
|
67
71
|
- masticate.gemspec
|
72
|
+
- spec/data/badnums.csv
|
73
|
+
- spec/data/badnums_fixed.csv
|
68
74
|
- spec/data/broken_psv.txt
|
75
|
+
- spec/data/junk_header.csv
|
69
76
|
- spec/data/junk_trailer.txt
|
77
|
+
- spec/data/namedcols.csv
|
78
|
+
- spec/data/namedcols.csv.output
|
70
79
|
- spec/data/pipe_data.txt
|
80
|
+
- spec/data/quoted_csv_data.txt
|
71
81
|
- spec/data/tabbed_data.txt
|
72
82
|
- spec/lib/csvify_spec.rb
|
83
|
+
- spec/lib/gsub_spec.rb
|
73
84
|
- spec/lib/mend_spec.rb
|
85
|
+
- spec/lib/plucker_spec.rb
|
74
86
|
- spec/lib/sniffer_spec.rb
|
75
87
|
- spec/spec_helper.rb
|
76
88
|
homepage: ''
|
@@ -98,12 +110,20 @@ signing_key:
|
|
98
110
|
specification_version: 3
|
99
111
|
summary: Utility functions for parsing incoming text data files.
|
100
112
|
test_files:
|
113
|
+
- spec/data/badnums.csv
|
114
|
+
- spec/data/badnums_fixed.csv
|
101
115
|
- spec/data/broken_psv.txt
|
116
|
+
- spec/data/junk_header.csv
|
102
117
|
- spec/data/junk_trailer.txt
|
118
|
+
- spec/data/namedcols.csv
|
119
|
+
- spec/data/namedcols.csv.output
|
103
120
|
- spec/data/pipe_data.txt
|
121
|
+
- spec/data/quoted_csv_data.txt
|
104
122
|
- spec/data/tabbed_data.txt
|
105
123
|
- spec/lib/csvify_spec.rb
|
124
|
+
- spec/lib/gsub_spec.rb
|
106
125
|
- spec/lib/mend_spec.rb
|
126
|
+
- spec/lib/plucker_spec.rb
|
107
127
|
- spec/lib/sniffer_spec.rb
|
108
128
|
- spec/spec_helper.rb
|
109
129
|
has_rdoc:
|