masticate 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/masticate CHANGED
@@ -1,10 +1,47 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative "../lib/masticate"
4
+ require "optparse"
4
5
 
5
- command, filename = ARGV
6
+ command = ARGV.shift
6
7
 
7
- case ARGV.shift
8
+ options = {}
9
+ OptionParser.new do |opts|
10
+ opts.banner = "Usage: example.rb [options]"
11
+
12
+ opts.on("--format FORMAT", "Specify format") do |v|
13
+ options[:format] = v
14
+ end
15
+
16
+ opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB)") do |v|
17
+ options[:col_sep] = v
18
+ options[:col_sep] = "\t" if options[:col_sep] == "TAB"
19
+ end
20
+
21
+ opts.on("--fields LIST", Array, "Specify fields to select") do |list|
22
+ options[:fields] = list
23
+ end
24
+
25
+ opts.on("--field FIELD", "Specify field to convert") do |f|
26
+ options[:field] = f
27
+ end
28
+
29
+ opts.on("--snip DIRECTIVE", "Specify header fields to snip: first N, or by name") do |f|
30
+ options[:snip] = f.to_i
31
+ end
32
+
33
+ opts.on("--from REGEXP", "Regular expression for gsub conversion") do |s|
34
+ options[:from] = s
35
+ end
36
+
37
+ opts.on("--to STRING", "Result string for gsub conversion") do |s|
38
+ options[:to] = s
39
+ end
40
+ end.parse!
41
+
42
+ filename = ARGV.shift # use stdin if no filename provided
43
+
44
+ case command
8
45
  when 'sniff'
9
46
  results = Masticate.sniff(filename)
10
47
  col_sep = results[:col_sep]
@@ -13,30 +50,49 @@ when 'sniff'
13
50
  Processing complete.
14
51
  Input delimiter: #{col_sep}
15
52
  Field counts: #{results[:field_counts].inspect}
53
+ Headers: #{results[:headers].join(',')}
16
54
  EOT
17
55
 
18
56
  when 'mend'
19
- metadata = Masticate.sniff(filename)
20
- col_sep = metadata[:col_sep]
21
- col_sep = "TAB" if col_sep == "\t"
22
- results = Masticate.mend(filename, metadata)
57
+ results = Masticate.mend(filename, options)
23
58
  $stderr.puts <<-EOT
24
59
  Processing complete.
25
- Input delimiter: #{col_sep}
26
- Lines in input: #{results[:input_records]}
27
- Lines in output: #{results[:output_records]}
60
+ Lines in input: #{results[:input_count]}
61
+ Lines in output: #{results[:output_count]}
28
62
  EOT
29
63
 
30
64
  when 'csvify'
31
- metadata = Masticate.sniff(filename)
32
- results = Masticate.csvify(filename, metadata)
65
+ results = Masticate.csvify(filename, options)
33
66
  $stderr.puts <<-EOT
34
67
  Processing complete.
35
- Input delimiter: #{metadata[:col_sep]}
36
68
  Lines in input: #{results[:input_count]}
37
69
  Lines in output: #{results[:output_count]}
38
70
  EOT
39
71
 
72
+ when 'pluck'
73
+ results = Masticate.pluck(filename, options)
74
+ $stderr.puts <<-EOT
75
+ Processing complete.
76
+ Lines in input: #{results[:input_count]}
77
+ Lines in output: #{results[:output_count]}
78
+ EOT
79
+
80
+ when 'datify'
81
+ results = Masticate.datify(filename, options)
82
+ $stderr.puts <<-EOT
83
+ Processing complete.
84
+ Lines in input: #{results[:input_count]}
85
+ Lines in output: #{results[:output_count]}
86
+ EOT
87
+
88
+ when 'gsub'
89
+ results = Masticate.gsub(filename, options)
90
+ # $stderr.puts <<-EOT
91
+ # Processing complete.
92
+ # Lines in input: #{results[:input_count]}
93
+ # Lines in output: #{results[:output_count]}
94
+ # EOT
95
+
40
96
  else
41
97
  raise "unknown command #{command}"
42
98
  end
@@ -0,0 +1,33 @@
1
# Shared plumbing for the Masticate filters: opens the input source, reads it
# one line at a time, writes output lines, and keeps running line counters.
# Subclasses are expected to set @output and @output_count before calling
# #emit; this class does not initialize either of them.
class Masticate::Base
  attr_reader :filename
  attr_reader :input, :output
  attr_reader :input_count, :output_count

  # filename - source to read; when nil (or false) input comes from $stdin.
  def initialize(filename)
    @filename = filename
  end

  # Opens the input, resets @input_count to zero and yields the IO object.
  #
  # Returns whatever the block returns. An input that was opened from
  # @filename is closed afterwards, even when the block raises; $stdin is
  # never closed.
  def with_input
    @input = @filename ? open(@filename) : $stdin
    @input_count = 0
    yield @input
  ensure
    # The closed? guard covers the case where open itself failed and @input
    # still refers to a handle from an earlier call.
    @input.close if @filename && @input && !@input.closed?
  end

  # Reads the next line from the current input.
  #
  # Returns the line without its trailing newline, or nil at end of input.
  # NOTE: @input_count is incremented on every call, including the final call
  # that returns nil, so after reading to EOF it is one more than the number
  # of lines read.
  def get
    line = @input.gets
    @input_count += 1
    line && line.chomp
  end

  # Writes one line to @output and bumps @output_count.
  def emit(line)
    @output_count += 1
    begin
      @output.puts line
    rescue Errno::EPIPE
      # output was closed, e.g. ran piped into `head`
      # silently ignore this condition, it's not fatal and doesn't need a warning
    end
  end
end
@@ -1,11 +1,9 @@
1
1
  # convert input to clean standard CSV
2
2
  require "csv"
3
3
 
4
- class Masticate::Csvify
5
- attr_reader :input
6
-
4
+ class Masticate::Csvify < Masticate::Base
7
5
  def initialize(filename)
8
- @input = File.open(filename)
6
+ @filename = filename
9
7
  end
10
8
 
11
9
  def csvify(opts)
@@ -14,26 +12,18 @@ class Masticate::Csvify
14
12
  csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
15
13
  csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
16
14
 
17
- input_count = @output_count = 0
18
- CSV.foreach(input, csv_options) do |row|
19
- input_count += 1
20
- emit(row.to_csv)
15
+ @output_count = 0
16
+ with_input do |input|
17
+ while line = get
18
+ row = CSV.parse_line(line, csv_options)
19
+ emit(row.to_csv) if row
20
+ end
21
21
  end
22
22
  @output.close if opts[:output]
23
- @input.close
23
+
24
24
  {
25
25
  :input_count => input_count,
26
26
  :output_count => @output_count
27
27
  }
28
28
  end
29
-
30
- def emit(line)
31
- @output_count += 1
32
- begin
33
- @output.puts line
34
- rescue Errno::EPIPE
35
- # output was closed, e.g. ran piped into `head`
36
- # silently ignore this condition, it's not fatal and doesn't need a warning
37
- end
38
- end
39
29
  end
@@ -0,0 +1,36 @@
1
+ # convert date columns to numerics
2
+ require "csv"
3
+
4
# Rewrites one timestamp column of a delimited file as integer epoch seconds.
# The first non-blank row is treated as the header row.
class Masticate::Datify < Masticate::Base
  # Converts the column named opts[:field] using the strptime pattern
  # opts[:format] and writes every row back out via Array#to_csv.
  #
  # opts - Hash of options:
  #        :field      - header name of the column to convert (required)
  #        :format     - strptime format string (required)
  #        :output     - path to write to; $stdout when absent
  #        :col_sep    - input field delimiter passed to CSV.parse_line
  #        :quote_char - input quote character; falls back to :col_sep
  #
  # Returns a Hash with :input_count and :output_count.
  # Raises RuntimeError when :field or :format is missing, or when the header
  # row does not contain :field.
  def datify(opts)
    # Validate before opening the output so a bad invocation does not
    # create/truncate the output file.
    field = opts[:field] or raise "missing field to datify"
    format = opts[:format] or raise "strptime format required for parsing timestamps"

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    headers = nil
    index = nil
    begin
      with_input do
        while line = get
          row = CSV.parse_line(line, csv_options)
          # CSV.parse_line returns nil for an empty line; skip it, as Csvify
          # and Plucker do, rather than failing on nil.
          next unless row

          if headers
            row[index] = epoch_seconds(row[index], format)
            emit(row.to_csv)
          else
            headers = row
            index = headers.index(field) or raise "Unable to find column '#{field}'"
            emit(headers.to_csv)
          end
        end
      end
    ensure
      # only close handles we opened ourselves, never $stdout
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end

  private

  # Parses value with the given strptime format.
  #
  # Returns the timestamp as Integer seconds since the epoch, or nil when the
  # value is missing (TypeError) or does not match the format (ArgumentError).
  def epoch_seconds(value, format)
    DateTime.strptime(value, format).to_time.to_i
  rescue ArgumentError, TypeError
    nil
  end
end
@@ -0,0 +1,39 @@
1
+ # apply a regexp substitution (String#gsub) to one column of a CSV
2
+ require "csv"
3
+
4
# Applies a String#gsub substitution to a single named column of a delimited
# file. The first non-blank row is treated as the header row.
class Masticate::Gsubber < Masticate::Base
  # opts - Hash of options:
  #        :field      - header name of the column to rewrite (required)
  #        :from       - pattern source handed to Regexp.new (required)
  #        :to         - replacement string (required; may be empty)
  #        :output     - path to write to; $stdout when absent
  #        :col_sep    - input field delimiter passed to CSV.parse_line
  #        :quote_char - input quote character; falls back to :col_sep
  #
  # Returns a Hash with :input_count and :output_count.
  # Raises RuntimeError when :field, :from or :to is missing or the header
  # row does not contain :field; raises RegexpError when :from does not
  # compile.
  def gsub(opts)
    # Validate before opening the output so a bad invocation does not
    # create/truncate the output file.
    field = opts[:field] or raise "missing field to gsub"
    pattern = opts[:from] or raise "missing 'from' regexp for gsub"
    # Regexp.new never returns nil, so a bad pattern has to be caught as an
    # exception rather than tested with `or`.
    from = begin
      Regexp.new(pattern)
    rescue RegexpError => e
      raise RegexpError, "Invalid regex '#{pattern}' for conversion: #{e.message}"
    end
    to = opts[:to] or raise "missing 'to' string for gsub"

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    headers = nil
    index = nil
    begin
      with_input do
        while line = get
          row = CSV.parse_line(line, csv_options)
          # CSV.parse_line returns nil for an empty line; nothing to rewrite
          next unless row

          if headers
            # empty cells parse as nil; leave them untouched
            row[index] = row[index].gsub(from, to) if row[index]
            emit(row.to_csv)
          else
            headers = row
            index = headers.index(field) or raise "Unable to find column '#{field}'"
            # Re-serialize the header the same way as the data rows so the
            # output uses one delimiter throughout.
            emit(headers.to_csv)
          end
        end
      end
    ensure
      # only close handles we opened ourselves, never $stdout
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end
end
@@ -3,61 +3,75 @@
3
3
  # A row that contains fewer delimiters than expected has been split across two lines
4
4
  # (due to a newline embedded in a field). Glue those two lines into a single line in the output.
5
5
 
6
# Repairs rows that were split across physical lines: the first non-blank
# line fixes the expected field count, and any later line with fewer fields
# is glued (with a single space) to the following line(s) until it has enough.
class Masticate::Mender < Masticate::Base
  attr_reader :col_sep

  def initialize(filename)
    @filename = filename
  end

  # opts - Hash of options:
  #        :col_sep - field delimiter; defaults to ','
  #        :snip    - Integer number of leading header fields to drop, or nil
  #        :output  - path to write to; $stdout when absent
  #
  # Returns a Hash with :input_count, :output_count and :headers (the header
  # fields after snipping).
  # Raises RuntimeError when :snip is neither an Integer nor nil.
  def mend(opts)
    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @col_sep = opts[:col_sep] || ','

    expected_field_count = nil
    headers = nil
    @output_count = 0
    begin
      with_input do |input|
        while (line = get)
          # whitespace-only lines are dropped entirely
          next if line =~ /^\s*$/

          if expected_field_count
            running_count = fieldcount(line)
            # keep appending physical lines until the row is complete
            while !input.eof? && running_count < expected_field_count
              nextbit = get
              if nextbit
                line = line + ' ' + nextbit
                running_count = fieldcount(line)
              end
            end

            # rows with two or fewer delimiter characters are not emitted
            emit(line) if line.count(col_sep) > 2
          else
            # trust the first row
            headers = snip(explode(line), opts[:snip])
            expected_field_count = headers.count
            emit(headers.to_csv(:col_sep => @col_sep))
          end
        end
      end
    ensure
      # only close handles we opened ourselves, never $stdout
      @output.close if opts[:output]
    end

    {
      :input_count => @input_count,
      :output_count => @output_count,
      :headers => headers
    }
  end

  # Counts the fields on a line. Comma-delimited lines go through the CSV
  # parser; other delimiters are counted as delimiter occurrences + 1.
  def fieldcount(line)
    if col_sep == ','
      begin
        CSV.parse_line(line).count
      rescue CSV::MalformedCSVError
        # A line broken inside a quoted field does not parse on its own.
        # Report zero fields so the caller keeps gluing following lines on.
        0
      end
    else
      line.count(col_sep) + 1
    end
  end

  # Splits a line into stripped field strings. Empty cells, which the CSV
  # parser returns as nil, become empty strings.
  def explode(line)
    fields = col_sep == ',' ? CSV.parse_line(line) : line.split(col_sep)
    fields.map { |f| f.to_s.strip }
  end

  private

  # Applies the :snip directive to the header fields and returns them.
  def snip(headers, directive)
    case directive
    when Integer
      # Integer rather than Fixnum: it matches on old and current Rubies alike
      headers.shift(directive)
    when nil
      # do nothing
    else
      raise "Do not understand snip instruction [#{directive.inspect}]"
    end
    headers
  end
end
@@ -0,0 +1,34 @@
1
+ # extract subset of columns from CSV
2
+ require "csv"
3
+
4
# Copies a chosen subset of columns, selected by header name, from a
# delimited file. The first non-blank row is treated as the header row.
class Masticate::Plucker < Masticate::Base
  # opts - Hash of options:
  #        :fields     - Array of header names to keep, in output order
  #                      (required)
  #        :output     - path to write to; $stdout when absent
  #        :col_sep    - input field delimiter passed to CSV.parse_line
  #        :quote_char - input quote character; falls back to :col_sep
  #
  # Returns a Hash with :input_count and :output_count.
  # Raises RuntimeError when :fields is missing or names a column that is not
  # in the header row.
  def pluck(opts)
    # Validate before opening the output so a bad invocation does not
    # create/truncate the output file.
    fields = opts[:fields] or raise "missing fields to pluck"

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    indexes = nil
    begin
      with_input do
        while line = get
          row = CSV.parse_line(line, csv_options)
          # CSV.parse_line returns nil for an empty line; skip it wherever it
          # occurs, including before the header row.
          next unless row

          if indexes
            emit(indexes.map { |i| row[i] }.to_csv)
          else
            # first real row: resolve each requested name to its position
            indexes = fields.map { |f| row.index(f) or raise "Unable to find column '#{f}'" }
            emit(fields.to_csv)
          end
        end
      end
    ensure
      # only close handles we opened ourselves, never $stdout
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end
end
@@ -1,5 +1,8 @@
1
- class Masticate::Sniffer
2
- attr_reader :col_sep, :stats
1
+ require "set"
2
+
3
+ class Masticate::Sniffer < Masticate::Base
4
+ attr_reader :col_sep, :quote_char, :stats
5
+ attr_reader :delimstats
3
6
 
4
7
  CandidateDelimiters = [',', '|', "\t"]
5
8
 
@@ -9,32 +12,68 @@ class Masticate::Sniffer
9
12
 
10
13
  def sniff
11
14
  @col_sep = find_col_sep
15
+ @quote_char = delimstats[@col_sep][:quote_char]
12
16
  @stats = stats
13
17
  {
14
18
  :col_sep => @col_sep,
19
+ :quote_char => @quote_char,
15
20
  :field_counts => @stats,
16
- :line1 => @line1
21
+ :headers => @line1.split(@col_sep).map(&:strip)
17
22
  }
18
23
  end
19
24
 
20
25
  def find_col_sep
21
- input = open(@filename)
22
- @line1 = input.lines.first
23
- delimcounts = CandidateDelimiters.each_with_object({}) do |delim,h|
24
- h[delim] = consider_delim(@line1, delim)
26
+ @delimstats = {}
27
+ with_input do |input|
28
+ input.lines.take(10).each do |line|
29
+ @line1 = line unless @line1
30
+
31
+ CandidateDelimiters.each do |delim|
32
+ delimstats[delim] ||= { :counts => Set.new, :quote_char => nil}
33
+ h = delimstats[delim]
34
+ fieldcount, quote_char = consider_delim(line, delim)
35
+ h[:counts] << fieldcount
36
+ h[:quote_char] ||= quote_char
37
+ end
38
+ end
25
39
  end
26
- input.close
27
- delimcounts.sort_by{|h,v| -v}.first.first
40
+ delimstats.sort_by{|delim,stats| stats[:counts].max || 0}.last.first
28
41
  end
29
42
 
30
43
  def consider_delim(line, delim)
31
- line.count(delim)
44
+ @quote_char = nil
45
+ n = count_fields(line, delim)
46
+ [n, @quote_char]
47
+ end
48
+
49
+ def count_fields(line, delim)
50
+ if delim == ','
51
+ straight_count = line.count(delim) + 1
52
+ count_with_quoting = begin
53
+ CSV.parse_line(line).count
54
+ rescue CSV::MalformedCSVError
55
+ # this is not valid CSV, e.g. has incorrectly embedded quotes
56
+ 0
57
+ end
58
+ if count_with_quoting < straight_count
59
+ @quote_char = '"'
60
+ count_with_quoting
61
+ else
62
+ straight_count
63
+ end
64
+ else
65
+ line.count(delim) + 1
66
+ end
32
67
  end
33
68
 
34
69
  def stats
35
- input = open(@filename)
36
- counts = input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
37
- input.close
70
+ counts = with_input do |input|
71
+ if col_sep == ',' && quote_char
72
+ input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :quote_char => quote_char).count] += 1}
73
+ else
74
+ input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
75
+ end
76
+ end
38
77
  counts
39
78
  end
40
79
  end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,9 +1,13 @@
1
1
  require "open-uri"
2
2
 
3
3
  require_relative "masticate/version"
4
+ require_relative "masticate/base"
4
5
  require_relative "masticate/sniffer"
5
6
  require_relative "masticate/mender"
6
7
  require_relative "masticate/csvify"
8
+ require_relative "masticate/plucker"
9
+ require_relative "masticate/datify"
10
+ require_relative "masticate/gsubber"
7
11
 
8
12
  module Masticate
9
13
  def self.sniff(filename)
@@ -17,4 +21,16 @@ module Masticate
17
21
  def self.csvify(filename, opts)
18
22
  Csvify.new(filename).csvify(opts)
19
23
  end
24
+
25
+ def self.pluck(filename, opts)
26
+ Plucker.new(filename).pluck(opts)
27
+ end
28
+
29
+ def self.datify(filename, opts)
30
+ Datify.new(filename).datify(opts)
31
+ end
32
+
33
+ def self.gsub(filename, opts)
34
+ Gsubber.new(filename).gsub(opts)
35
+ end
20
36
  end
@@ -0,0 +1,8 @@
1
+ ChangeDTTM,AuditCode,AuditByID
2
+ 1326448188,MERGE,"518,437.00"
3
+ 1327481049,MERGEMATCH,0.00
4
+ 1327405172,MERGEMATCH,0.00
5
+ 1327655234,MERGEMATCH,0.00
6
+ 1327486334,MERGE,"429,073.00"
7
+ 1326447578,MERGEMATCH,0.00
8
+ 1326297465,MERGE,"123,456.00"
@@ -0,0 +1,8 @@
1
+ ChangeDTTM,AuditCode,AuditByID
2
+ 1326448188,MERGE,"518,437.00"
3
+ 1327481049,MERGEMATCH,0.00
4
+ 1327405172,MERGEMATCH,0.00
5
+ 1327655234,MERGEMATCH,0.00
6
+ 1327486334,MERGE,"429,073.00"
7
+ 1326447578,MERGEMATCH,0.00
8
+ 1326297465,MERGE,"123,456.00"
@@ -0,0 +1,5 @@
1
+ 3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
2
+ 15267,407,201201060140,407,201201060140,0,201201060309,L
3
+ 15267,381,201201060222,381,201201060222,0,201201060647,X
4
+ 15267,407,201201060311,407,201201060311,0,201201060339,L
5
+ 15267,407,201201060514,108,201201060515,108,201201060515,SEC
@@ -0,0 +1,4 @@
1
+ one,two,three,four,five
2
+ data1,data2,data3,data4,data5
3
+ 111,22,333,44,555
4
+ 91,92,93,94,95
@@ -0,0 +1,4 @@
1
+ three,five
2
+ data3,data5
3
+ 333,555
4
+ 93,95
@@ -0,0 +1,100 @@
1
+ site,ibex,unit,face,doctor,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department,order_number
2
+ 1,20120106003230,2044272,L,407,15267,407,201201060140,407,201201060140,0,201201060309,L,"594,756"
3
+ 1,20120106003230,2044277,X,407,15267,381,201201060222,381,201201060222,0,201201060647,X,"594,761"
4
+ 1,20120106003230,2044309,L,407,15267,407,201201060311,407,201201060311,0,201201060339,L,"594,766"
5
+ 1,20120106003230,,Q,407,15267,407,201201060514,108,201201060515,108,201201060515,SEC,"594,787"
6
+ 1,20120106024355,,Q,407,15267,407,201201060309,90,201201060316,90,201201060316,IV,"594,764"
7
+ 1,20120106024355,2044306,L,407,15267,407,201201060309,407,201201060309,0,201201060345,L,"594,763"
8
+ 1,20120106024355,2044308,X,407,15267,407,201201060310,407,201201060310,0,201201060556,X,"594,765"
9
+ 1,20120106024355,2044307,L,407,15267,407,201201060309,407,201201060309,0,201201060333,L,"594,762"
10
+ 1,20120106024355,,Q,407,15267,407,201201060520,108,201201060522,108,201201060522,SEC,"594,789"
11
+ 1,20120106024355,2044579,L,407,15267,68,201201060826,68,201201060826,0,201201071149,L,"594,823"
12
+ 1,20120106032719,2044345,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,775"
13
+ 1,20120106032719,2044344,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,777"
14
+ 1,20120106032719,2044343,L,407,15267,407,201201060348,407,201201060348,0,201201060428,L,"594,773"
15
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060408,426,201201060408,IV,"594,774"
16
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060634,426,201201060634,URINE,"594,776"
17
+ 1,20120106032719,2044386,L,407,15267,407,201201060445,407,201201060445,0,201201060519,L,"594,785"
18
+ 1,20120106032719,2044401,X,407,15267,407,201201060521,407,201201060521,0,201201060646,X,"594,790"
19
+ 1,20120106033235,,Q,407,15267,407,201201060347,74,201201060353,74,201201060353,IV,"594,769"
20
+ 1,20120106033235,2044349,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,771"
21
+ 1,20120106033235,2044350,L,407,15267,407,201201060347,74,201201060353,0,201201060434,URINE,"594,770"
22
+ 1,20120106033235,2044347,L,407,15267,407,201201060347,74,201201060353,0,201201060428,L,"594,768"
23
+ 1,20120106033235,2044348,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,772"
24
+ 1,20120106033235,2044372,X,407,15267,407,201201060429,407,201201060429,0,201201060649,X,"594,780"
25
+ 1,20120106035346,,Q,407,15267,407,201201060446,426,201201060448,426,201201060448,N,"594,786"
26
+ 1,20120106041426,2044383,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,784"
27
+ 1,20120106041426,2044384,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,782"
28
+ 1,20120106041426,2044382,L,407,15267,407,201201060445,407,201201060445,0,201201060522,L,"594,781"
29
+ 1,20120106041426,,Q,407,15267,407,201201060445,381,201201060452,381,201201060452,IV,"594,783"
30
+ 1,20120106043025,2044400,X,407,15267,407,201201060515,407,201201060515,0,201201060554,X,"594,788"
31
+ 1,20120106045326,2044411,R,407,15267,407,201201060535,407,201201060535,0,201201060630,RS,"594,791"
32
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060540,108,201201060540,SEC,"594,794"
33
+ 1,20120106045326,2044412,R,407,15267,407,201201060535,407,201201060535,0,201201060629,RS,"594,795"
34
+ 1,20120106045326,2044413,X,407,15267,407,201201060536,407,201201060536,0,201201060649,X,"594,796"
35
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060541,108,201201060541,SEC,"594,792"
36
+ 1,20120106045326,2044410,R,407,15267,407,201201060535,407,201201060535,0,201201060628,RS,"594,793"
37
+ 1,20120106052714,2044421,L,407,15267,407,201201060544,407,201201060544,0,201201060605,L,"594,797"
38
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,IV,"594,799"
39
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,N,"594,800"
40
+ 1,20120106052714,2044422,L,407,15267,407,201201060544,407,201201060544,0,201201060621,L,"594,801"
41
+ 1,20120106052714,2044423,L,407,15267,407,201201060544,407,201201060544,0,201201060727,L,"594,798"
42
+ 1,20120106052714,2044424,L,407,15267,407,201201060551,407,201201060551,0,201201060714,L,"594,802"
43
+ 1,20120106070243,2044439,L,504,15550,504,201201060721,504,201201060721,0,201201060753,L,"594,803"
44
+ 1,20120106070243,2044440,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,807"
45
+ 1,20120106070243,2044441,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,806"
46
+ 1,20120106070243,,Q,504,15550,504,201201060721,155,201201060735,155,201201060735,IV,"594,805"
47
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,820"
48
+ 1,20120106070243,2044524,L,504,15550,504,201201060806,504,201201060806,0,201201061004,L,"594,816"
49
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060813,195,201201060813,SEC,"594,822"
50
+ 1,20120106070243,2044522,L,504,15550,504,201201060806,504,201201060806,0,201201060959,L,"594,819"
51
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060811,195,201201060811,SEC,"594,821"
52
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,818"
53
+ 1,20120106070243,,Q,504,15550,504,201201060910,155,201201060916,155,201201060916,N,"594,831"
54
+ 1,20120106070243,2044716,X,504,15550,504,201201060928,504,201201060928,0,201201060953,X,"594,834"
55
+ 1,20120106073142,2044480,X,504,15550,504,201201060757,504,201201060757,0,201201060819,X,"594,815"
56
+ 1,20120106073757,2044475,L,504,15550,504,201201060749,155,201201060755,0,201201060925,URINE,"594,810"
57
+ 1,20120106073757,2044466,L,504,15550,504,201201060749,504,201201060749,0,201201060827,L,"594,808"
58
+ 1,20120106073757,2044470,X,504,15550,504,201201060749,504,201201060749,0,201201060818,X,"594,809"
59
+ 1,20120106073757,2044467,L,504,15550,504,201201060749,504,201201060749,0,201201060826,L,"594,813"
60
+ 1,20120106073757,2044468,L,504,15550,504,201201060749,504,201201060749,0,201201060839,L,"594,811"
61
+ 1,20120106073757,2044469,L,504,15550,504,201201060749,504,201201060749,0,201201060825,L,"594,814"
62
+ 1,20120106073757,,Q,504,15550,504,201201060749,155,201201060755,155,201201060755,IV,"594,812"
63
+ 1,20120106073757,,Q,504,15550,504,201201060911,76,201201060933,76,201201060933,IV,"594,832"
64
+ 1,20120106073757,,Q,504,15550,504,201201060928,34,201201060934,34,201201060934,SEC,"594,833"
65
+ 1,20120106073757,,Q,504,15550,504,201201061022,155,201201061108,155,201201061108,IV,"594,862"
66
+ 1,20120106073757,,Q,504,15550,504,201201061019,155,201201061025,155,201201061025,IV,"594,861"
67
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,896"
68
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,895"
69
+ 1,20120106073757,2045028,X,504,15550,504,201201061131,504,201201061131,0,201201061209,X,"594,898"
70
+ 1,20120106073757,2045029,X,504,15550,504,201201061131,504,201201061131,0,201201061345,X,"594,897"
71
+ 1,20120106073757,,Q,504,15550,504,201201061131,155,201201061223,155,201201061223,N,"594,894"
72
+ 1,20120106084347,2044639,X,504,15550,76,201201060850,76,201201060850,0,201201060931,X,"594,828"
73
+ 1,20120106084720,2044670,X,55,4644,55,201201060909,55,201201060909,0,201201060934,X,"594,829"
74
+ 1,20120106084720,,Q,55,4644,55,201201060910,66,201201060914,66,201201060914,N,"594,830"
75
+ 1,20120106085558,2044755,L,55,4644,55,201201060949,55,201201060949,0,201201061018,L,"594,846"
76
+ 1,20120106085558,2044756,L,55,4644,55,201201060949,55,201201060949,0,201201061038,L,"594,851"
77
+ 1,20120106085558,2044793,L,55,4644,55,201201060949,76,201201061003,0,201201061239,URINE,"594,848"
78
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,850"
79
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,847"
80
+ 1,20120106085558,2044757,L,55,4644,55,201201060949,55,201201060949,0,201201061040,L,"594,849"
81
+ 1,20120106085558,2044843,L,55,4644,55,201201061033,55,201201061033,0,201201071505,L,"594,864"
82
+ 1,20120106085558,2044841,X,55,4644,55,201201061032,55,201201061032,0,201201061136,X,"594,863"
83
+ 1,20120106085558,2044844,L,55,4644,55,201201061033,55,201201061033,0,201201061119,L,"594,865"
84
+ 1,20120106085558,,Q,55,4644,55,201201061228,195,201201061240,195,201201061240,SEC,"594,961"
85
+ 1,20120106091726,2044741,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,839"
86
+ 1,20120106091726,2044745,X,504,15550,504,201201060942,504,201201060942,0,201201061016,X,"594,835"
87
+ 1,20120106091726,2044746,L,504,15550,504,201201060942,504,201201060942,0,201201061107,L,"594,842"
88
+ 1,20120106091726,2044740,L,504,15550,504,201201060942,504,201201060942,0,201201061017,L,"594,836"
89
+ 1,20120106091726,2044744,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,838"
90
+ 1,20120106091726,2044742,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,841"
91
+ 1,20120106091726,,Q,504,15550,504,201201060942,66,201201060944,66,201201060944,IV,"594,837"
92
+ 1,20120106091726,2044743,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,840"
93
+ 1,20120106095129,2044814,X,55,4644,55,201201061010,55,201201061010,0,201201061037,X,"594,853"
94
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,IV,"594,857"
95
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,858"
96
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,859"
97
+ 1,20120106100014,2044815,L,504,15550,504,201201061011,504,201201061011,0,201201061023,L,"594,854"
98
+ 1,20120106100014,2044817,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,856"
99
+ 1,20120106100014,2044818,X,504,15550,504,201201061011,504,201201061011,0,201201061038,X,"594,855"
100
+ 1,20120106100014,2044816,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,860"
@@ -11,7 +11,7 @@ describe "csvification" do
11
11
  output = File.read(tmp)
12
12
  tmp.unlink
13
13
  output.lines.count.should == 5
14
- results[:input_count].should == 5
14
+ results[:input_count].should == 6
15
15
  results[:output_count].should == 5
16
16
  end
17
17
  end
@@ -0,0 +1,16 @@
1
+ # spec for field regexp conversion
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+
6
+ describe "gsubbing" do
7
+ it "should apply conversion to a single column" do
8
+ filename = File.dirname(__FILE__) + "/../data/badnums.csv"
9
+ tmp = Tempfile.new('gsubber')
10
+ results = Masticate.gsub(filename, :output => tmp, :field => 'AuditByID', :from => '/,|(.00$)/', :to => '')
11
+ output = File.read(tmp)
12
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/badnums_fixed.csv")
13
+
14
+ output.should == correct_output
15
+ end
16
+ end
@@ -6,15 +6,24 @@ describe "mending" do
6
6
  it "should merge lines when delimiter counts don't match'" do
7
7
  filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
8
8
  results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null")
9
- results[:input_records].should == 7
10
- results[:output_records].should == 5
9
+ results[:input_count].should == 7
10
+ results[:output_count].should == 5
11
11
  end
12
12
 
13
13
  it "should strip trailer records" do
14
14
  filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
15
15
  metadata = Masticate.sniff(filename)
16
16
  results = Masticate.mend(filename, metadata.merge(:output => "/dev/null"))
17
- results[:input_records].should == 9
18
- results[:output_records].should == 5
17
+ results[:input_count].should == 9
18
+ results[:output_count].should == 5
19
+ results[:headers].should == ['COL1', 'COL 2', 'Col 3', 'col-4', 'col5', 'col6']
20
+ end
21
+
22
+ it "should snip head fields" do
23
+ filename = File.dirname(__FILE__) + "/../data/junk_header.csv"
24
+ results = Masticate.mend(filename, :col_sep => ',', :snip => 1, :output => "/dev/null")
25
+ results[:input_count].should == 6
26
+ results[:output_count].should == 5
27
+ results[:headers].should == %w(hospid usrorder dteorder usrsend dtesend usrdone dtedone department)
19
28
  end
20
29
  end
@@ -0,0 +1,18 @@
1
+ # spec for column-plucking functions
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+
6
+ describe "plucker" do
7
+ it "should pull named columns" do
8
+ filename = File.dirname(__FILE__) + "/../data/namedcols.csv"
9
+ tmp = Tempfile.new('plucker')
10
+ results = Masticate.pluck(filename, :output => tmp, :fields => ['three', 'five'])
11
+ output = File.read(tmp)
12
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/namedcols.csv.output")
13
+ tmp.unlink
14
+
15
+ results[:input_count].should == 5
16
+ output.should == correct_output
17
+ end
18
+ end
@@ -16,4 +16,12 @@ describe "delimiter sniffing" do
16
16
  results[:col_sep].should == '|'
17
17
  results[:field_counts].should == {6 => 5}
18
18
  end
19
+
20
+ it "should recognize quotes in CSV sources" do
21
+ filename = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
22
+ results = Masticate.sniff(filename)
23
+ results[:col_sep].should == ','
24
+ results[:quote_char].should == '"'
25
+ results[:field_counts].should == {14 => 100}
26
+ end
19
27
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-04 00:00:00.000000000 Z
12
+ date: 2012-04-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2153254280 !ruby/object:Gem::Requirement
16
+ requirement: &2153339260 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153254280
24
+ version_requirements: *2153339260
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2153246900 !ruby/object:Gem::Requirement
27
+ requirement: &2153338760 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2153246900
35
+ version_requirements: *2153338760
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2153246180 !ruby/object:Gem::Requirement
38
+ requirement: &2153338100 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2153246180
46
+ version_requirements: *2153338100
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com
@@ -60,17 +60,29 @@ files:
60
60
  - Rakefile
61
61
  - bin/masticate
62
62
  - lib/masticate.rb
63
+ - lib/masticate/base.rb
63
64
  - lib/masticate/csvify.rb
65
+ - lib/masticate/datify.rb
66
+ - lib/masticate/gsubber.rb
64
67
  - lib/masticate/mender.rb
68
+ - lib/masticate/plucker.rb
65
69
  - lib/masticate/sniffer.rb
66
70
  - lib/masticate/version.rb
67
71
  - masticate.gemspec
72
+ - spec/data/badnums.csv
73
+ - spec/data/badnums_fixed.csv
68
74
  - spec/data/broken_psv.txt
75
+ - spec/data/junk_header.csv
69
76
  - spec/data/junk_trailer.txt
77
+ - spec/data/namedcols.csv
78
+ - spec/data/namedcols.csv.output
70
79
  - spec/data/pipe_data.txt
80
+ - spec/data/quoted_csv_data.txt
71
81
  - spec/data/tabbed_data.txt
72
82
  - spec/lib/csvify_spec.rb
83
+ - spec/lib/gsub_spec.rb
73
84
  - spec/lib/mend_spec.rb
85
+ - spec/lib/plucker_spec.rb
74
86
  - spec/lib/sniffer_spec.rb
75
87
  - spec/spec_helper.rb
76
88
  homepage: ''
@@ -98,12 +110,20 @@ signing_key:
98
110
  specification_version: 3
99
111
  summary: Utility functions for parsing incoming text data files.
100
112
  test_files:
113
+ - spec/data/badnums.csv
114
+ - spec/data/badnums_fixed.csv
101
115
  - spec/data/broken_psv.txt
116
+ - spec/data/junk_header.csv
102
117
  - spec/data/junk_trailer.txt
118
+ - spec/data/namedcols.csv
119
+ - spec/data/namedcols.csv.output
103
120
  - spec/data/pipe_data.txt
121
+ - spec/data/quoted_csv_data.txt
104
122
  - spec/data/tabbed_data.txt
105
123
  - spec/lib/csvify_spec.rb
124
+ - spec/lib/gsub_spec.rb
106
125
  - spec/lib/mend_spec.rb
126
+ - spec/lib/plucker_spec.rb
107
127
  - spec/lib/sniffer_spec.rb
108
128
  - spec/spec_helper.rb
109
129
  has_rdoc: