masticate 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/masticate CHANGED
@@ -1,10 +1,47 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative "../lib/masticate"
4
+ require "optparse"
4
5
 
5
- command, filename = ARGV
6
+ command = ARGV.shift
6
7
 
7
- case ARGV.shift
8
+ options = {}
9
+ OptionParser.new do |opts|
10
+ opts.banner = "Usage: example.rb [options]"
11
+
12
+ opts.on("--format FORMAT", "Specify format") do |v|
13
+ options[:format] = v
14
+ end
15
+
16
+ opts.on("--delim DELIMITER", "Specify field delimiter (character or TAB)") do |v|
17
+ options[:col_sep] = v
18
+ options[:col_sep] = "\t" if options[:col_sep] == "TAB"
19
+ end
20
+
21
+ opts.on("--fields LIST", Array, "Specify fields to select") do |list|
22
+ options[:fields] = list
23
+ end
24
+
25
+ opts.on("--field FIELD", "Specify field to convert") do |f|
26
+ options[:field] = f
27
+ end
28
+
29
+ opts.on("--snip DIRECTIVE", "Specify header fields to snip: first N, or by name") do |f|
30
+ options[:snip] = f.to_i
31
+ end
32
+
33
+ opts.on("--from REGEXP", "Regular expression for gsub conversion") do |s|
34
+ options[:from] = s
35
+ end
36
+
37
+ opts.on("--to STRING", "Result string for gsub conversion") do |s|
38
+ options[:to] = s
39
+ end
40
+ end.parse!
41
+
42
+ filename = ARGV.shift # use stdin if no filename provided
43
+
44
+ case command
8
45
  when 'sniff'
9
46
  results = Masticate.sniff(filename)
10
47
  col_sep = results[:col_sep]
@@ -13,30 +50,49 @@ when 'sniff'
13
50
  Processing complete.
14
51
  Input delimiter: #{col_sep}
15
52
  Field counts: #{results[:field_counts].inspect}
53
+ Headers: #{results[:headers].join(',')}
16
54
  EOT
17
55
 
18
56
  when 'mend'
19
- metadata = Masticate.sniff(filename)
20
- col_sep = metadata[:col_sep]
21
- col_sep = "TAB" if col_sep == "\t"
22
- results = Masticate.mend(filename, metadata)
57
+ results = Masticate.mend(filename, options)
23
58
  $stderr.puts <<-EOT
24
59
  Processing complete.
25
- Input delimiter: #{col_sep}
26
- Lines in input: #{results[:input_records]}
27
- Lines in output: #{results[:output_records]}
60
+ Lines in input: #{results[:input_count]}
61
+ Lines in output: #{results[:output_count]}
28
62
  EOT
29
63
 
30
64
  when 'csvify'
31
- metadata = Masticate.sniff(filename)
32
- results = Masticate.csvify(filename, metadata)
65
+ results = Masticate.csvify(filename, options)
33
66
  $stderr.puts <<-EOT
34
67
  Processing complete.
35
- Input delimiter: #{metadata[:col_sep]}
36
68
  Lines in input: #{results[:input_count]}
37
69
  Lines in output: #{results[:output_count]}
38
70
  EOT
39
71
 
72
+ when 'pluck'
73
+ results = Masticate.pluck(filename, options)
74
+ $stderr.puts <<-EOT
75
+ Processing complete.
76
+ Lines in input: #{results[:input_count]}
77
+ Lines in output: #{results[:output_count]}
78
+ EOT
79
+
80
+ when 'datify'
81
+ results = Masticate.datify(filename, options)
82
+ $stderr.puts <<-EOT
83
+ Processing complete.
84
+ Lines in input: #{results[:input_count]}
85
+ Lines in output: #{results[:output_count]}
86
+ EOT
87
+
88
+ when 'gsub'
89
+ results = Masticate.gsub(filename, options)
90
+ # $stderr.puts <<-EOT
91
+ # Processing complete.
92
+ # Lines in input: #{results[:input_count]}
93
+ # Lines in output: #{results[:output_count]}
94
+ # EOT
95
+
40
96
  else
41
97
  raise "unknown command #{command}"
42
98
  end
@@ -0,0 +1,33 @@
1
# Shared plumbing for the masticate filters: opens the input stream, keeps
# running counts of lines read and written, and tolerates a closed output pipe.
class Masticate::Base
  attr_reader :filename
  attr_reader :input, :output
  attr_reader :input_count, :output_count

  # filename - source to read; nil means read from $stdin.
  def initialize(filename)
    @filename = filename
  end

  # Opens the input, resets the input counter and yields the stream.
  # Returns whatever the block returns.
  #
  # The close lives in `ensure` so that a file opened here is released even
  # when the block raises (for example a subclass failing on a missing
  # column). $stdin is never closed.
  #
  # NOTE(review): this goes through Kernel#open; masticate.rb loads open-uri,
  # so presumably URLs are accepted as well -- confirm, and confirm that
  # filenames are trusted, since Kernel#open treats a leading "|" specially.
  def with_input
    @input = @filename ? open(@filename) : $stdin
    @input_count = 0
    yield @input
  ensure
    @input.close if @filename && @input && !@input.closed?
  end

  # Reads the next line from the current input with its line ending removed,
  # or nil at end of input.
  #
  # The counter is bumped on every call, including the final call that
  # returns nil, so after draining an N-line input input_count is N + 1.
  def get
    line = @input.gets
    @input_count += 1
    line && line.chomp
  end

  # Writes one line to the output and counts it. The subclass is expected to
  # have set @output and @output_count before the first call.
  def emit(line)
    @output_count += 1
    @output.puts line
  rescue Errno::EPIPE
    # output was closed, e.g. ran piped into `head`
    # silently ignore this condition, it's not fatal and doesn't need a warning
  end
end
@@ -1,11 +1,9 @@
1
1
  # convert input to clean standard CSV
2
2
  require "csv"
3
3
 
4
- class Masticate::Csvify
5
- attr_reader :input
6
-
4
+ class Masticate::Csvify < Masticate::Base
7
5
  def initialize(filename)
8
- @input = File.open(filename)
6
+ @filename = filename
9
7
  end
10
8
 
11
9
  def csvify(opts)
@@ -14,26 +12,18 @@ class Masticate::Csvify
14
12
  csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
15
13
  csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
16
14
 
17
- input_count = @output_count = 0
18
- CSV.foreach(input, csv_options) do |row|
19
- input_count += 1
20
- emit(row.to_csv)
15
+ @output_count = 0
16
+ with_input do |input|
17
+ while line = get
18
+ row = CSV.parse_line(line, csv_options)
19
+ emit(row.to_csv) if row
20
+ end
21
21
  end
22
22
  @output.close if opts[:output]
23
- @input.close
23
+
24
24
  {
25
25
  :input_count => input_count,
26
26
  :output_count => @output_count
27
27
  }
28
28
  end
29
-
30
- def emit(line)
31
- @output_count += 1
32
- begin
33
- @output.puts line
34
- rescue Errno::EPIPE
35
- # output was closed, e.g. ran piped into `head`
36
- # silently ignore this condition, it's not fatal and doesn't need a warning
37
- end
38
- end
39
29
  end
@@ -0,0 +1,36 @@
1
+ # convert date columns to numerics
2
+ require "csv"
3
+
4
# Rewrites one named column of delimited input: each value is parsed as a
# timestamp with a strptime format and replaced by integer epoch seconds.
class Masticate::Datify < Masticate::Base
  # opts keys read here:
  #   :field      - header name of the column to convert (required)
  #   :format     - strptime format string (required)
  #   :col_sep    - input delimiter handed to CSV (optional)
  #   :quote_char - input quote character handed to CSV (optional)
  #   :output     - path to write to; $stdout when absent
  #
  # Returns a hash with :input_count and :output_count.
  # Raises RuntimeError when a required option is missing or the column is
  # not present in the header row.
  def datify(opts)
    # Validate before opening the output so a bad invocation cannot truncate
    # an existing file or leave a handle open.
    field = opts[:field] || raise("missing field to datify")
    format = opts[:format] || raise("strptime format required for parsing timestamps")

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    # NOTE(review): when only :col_sep is given, quote_char falls back to the
    # delimiter itself -- presumably to neutralise quote handling; confirm.
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    headers = nil
    index = nil
    begin
      with_input do |_input|
        while (line = get)
          row = CSV.parse_line(line, csv_options)
          # parse_line yields nil for an empty line; there is nothing to convert
          next unless row

          if headers
            row[index] = epoch_seconds(row[index], format)
            emit(row.to_csv)
          else
            # the first parsed row is the header row
            headers = row
            index = headers.index(field) || raise("Unable to find column '#{field}'")
            emit(headers.to_csv)
          end
        end
      end
    ensure
      # release the output file even when a row or the header lookup fails
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end

  private

  # Parses value with the given strptime format and returns epoch seconds.
  # Returns nil when the value is missing or does not match the format; any
  # other error is left to propagate instead of being silently swallowed.
  def epoch_seconds(value, format)
    DateTime.strptime(value, format).to_time.to_i
  rescue ArgumentError, TypeError
    nil
  end
end
@@ -0,0 +1,39 @@
1
+ # apply a regexp substitution (gsub) to a single column of CSV input
2
+ require "csv"
3
+
4
# Applies a regular-expression substitution (String#gsub) to one named column
# of delimited input and writes the rows back out as CSV.
class Masticate::Gsubber < Masticate::Base
  # opts keys read here:
  #   :field      - header name of the column to rewrite (required)
  #   :from       - pattern source passed to Regexp.new (required)
  #   :to         - replacement string (required; may be empty)
  #   :col_sep    - input delimiter handed to CSV (optional)
  #   :quote_char - input quote character handed to CSV (optional)
  #   :output     - path to write to; $stdout when absent
  #
  # Returns a hash with :input_count and :output_count.
  # Raises RuntimeError when an option is missing or invalid, or when the
  # column is not present in the header row.
  def gsub(opts)
    # Validate before opening the output so a bad invocation cannot truncate
    # an existing file or leave a handle open.
    field = opts[:field] || raise("missing field to gsub")
    from = compile_pattern(opts[:from])
    to = opts[:to] || raise("missing 'to' string for gsub")

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    # NOTE(review): when only :col_sep is given, quote_char falls back to the
    # delimiter itself -- presumably to neutralise quote handling; confirm.
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    index = nil
    begin
      with_input do |_input|
        while (line = get)
          row = CSV.parse_line(line, csv_options)
          # parse_line yields nil for an empty line; nothing to rewrite
          next unless row

          if index
            cell = row[index]
            # an empty field parses to nil; leave it untouched
            row[index] = cell.gsub(from, to) if cell
            emit(row.to_csv)
          else
            # the first parsed row is the header row; it is echoed verbatim
            index = row.index(field) || raise("Unable to find column '#{field}'")
            emit(line)
          end
        end
      end
    ensure
      # release the output file even when a row or the header lookup fails
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end

  private

  # Builds the Regexp for the conversion. The source is handed to Regexp.new
  # unchanged, so any surrounding slashes are literal pattern characters.
  # Regexp.new raises rather than returning nil, so the failure is translated
  # here into the descriptive error.
  def compile_pattern(source)
    Regexp.new(source)
  rescue RegexpError, TypeError
    raise "Invalid regex '#{source}' for conversion"
  end
end
@@ -3,61 +3,75 @@
3
3
  # A row that contains fewer delimiters than expected has been split across two lines
4
4
  # (due to a newline embedded in a field). Glue those two lines into a single line in the output.
5
5
 
6
- class Masticate::Mender
7
- attr_reader :input
6
+ class Masticate::Mender < Masticate::Base
7
+ attr_reader :col_sep
8
8
 
9
9
  def initialize(filename)
10
- @input = open(filename)
10
+ @filename = filename
11
11
  end
12
12
 
13
13
  def mend(opts)
14
14
  @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
15
- col_sep = opts[:col_sep]
16
-
17
- expected_delim_count = nil
18
- @input_count = @output_count = 0
19
- while (line = get) do
20
- unless line =~ /^\s*$/
21
- if !expected_delim_count
22
- # trust the first row
23
- expected_delim_count = line.count(col_sep)
24
- else
25
- running_count = line.count(col_sep)
26
- while !input.eof? && running_count < expected_delim_count do
27
- nextbit = get
28
- if nextbit
29
- line = line + ' ' + nextbit
30
- running_count = line.count(col_sep)
15
+ @col_sep = opts[:col_sep] || ','
16
+
17
+ expected_field_count = nil
18
+ headers = nil
19
+ @output_count = 0
20
+ with_input do |input|
21
+ while (line = get) do
22
+ unless line =~ /^\s*$/
23
+ if !expected_field_count
24
+ # trust the first row
25
+ headers = explode(line)
26
+ case opts[:snip]
27
+ when Fixnum
28
+ headers.shift(opts[:snip])
29
+ when nil
30
+ # do nothing
31
+ else
32
+ raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
33
+ end
34
+ expected_field_count = headers.count
35
+ emit(headers.to_csv(:col_sep => @col_sep))
36
+ else
37
+ running_count = fieldcount(line)
38
+ while !input.eof? && running_count < expected_field_count do
39
+ nextbit = get
40
+ if nextbit
41
+ line = line + ' ' + nextbit
42
+ running_count = fieldcount(line)
43
+ end
44
+ end
45
+
46
+ if line.count(col_sep) > 2
47
+ emit(line)
31
48
  end
32
49
  end
33
50
  end
34
- if line.count(col_sep) > 2
35
- emit(line)
36
- end
37
51
  end
38
52
  end
39
53
 
40
- @input.close
41
54
  @output.close if opts[:output]
42
55
  {
43
- :input_records => @input_count,
44
- :output_records => @output_count
56
+ :input_count => @input_count,
57
+ :output_count => @output_count,
58
+ :headers => headers
45
59
  }
46
60
  end
47
61
 
48
- def get
49
- line = input.gets
50
- @input_count += 1
51
- line && line.chomp
62
+ def fieldcount(line)
63
+ if col_sep == ','
64
+ CSV.parse_line(line).count
65
+ else
66
+ line.count(col_sep)+1
67
+ end
52
68
  end
53
69
 
54
- def emit(line)
55
- @output_count += 1
56
- begin
57
- @output.puts line
58
- rescue Errno::EPIPE
59
- # output was closed, e.g. ran piped into `head`
60
- # silently ignore this condition, it's not fatal and doesn't need a warning
70
+ def explode(line)
71
+ if col_sep == ','
72
+ CSV.parse_line(line).map(&:strip)
73
+ else
74
+ line.split(col_sep).map(&:strip)
61
75
  end
62
76
  end
63
77
  end
@@ -0,0 +1,34 @@
1
+ # extract subset of columns from CSV
2
+ require "csv"
3
+
4
# Extracts a chosen subset of columns, by header name, from delimited input
# and writes them out as CSV in the order requested.
class Masticate::Plucker < Masticate::Base
  # opts keys read here:
  #   :fields     - list of header names to keep (required)
  #   :col_sep    - input delimiter handed to CSV (optional)
  #   :quote_char - input quote character handed to CSV (optional)
  #   :output     - path to write to; $stdout when absent
  #
  # Returns a hash with :input_count and :output_count.
  # Raises RuntimeError when :fields is missing or a requested name is not
  # present in the header row.
  def pluck(opts)
    # Validate before opening the output so a bad invocation cannot truncate
    # an existing file or leave a handle open.
    fields = opts[:fields] || raise("missing fields to pluck")

    csv_options = {}
    csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
    # NOTE(review): when only :col_sep is given, quote_char falls back to the
    # delimiter itself -- presumably to neutralise quote handling; confirm.
    csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]

    @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
    @output_count = 0
    indexes = nil
    begin
      with_input do |_input|
        while (line = get)
          row = CSV.parse_line(line, csv_options)
          # parse_line yields nil for an empty line; skip it wherever it
          # occurs, including ahead of the header row
          next unless row

          if indexes
            emit(indexes.map { |i| row[i] }.to_csv)
          else
            # the first parsed row is the header row: resolve names to positions
            indexes = fields.map { |f| row.index(f) || raise("Unable to find column '#{f}'") }
            emit(fields.to_csv)
          end
        end
      end
    ensure
      # release the output file even when the header lookup fails
      @output.close if opts[:output]
    end

    {
      :input_count => input_count,
      :output_count => @output_count
    }
  end
end
@@ -1,5 +1,8 @@
1
- class Masticate::Sniffer
2
- attr_reader :col_sep, :stats
1
+ require "set"
2
+
3
+ class Masticate::Sniffer < Masticate::Base
4
+ attr_reader :col_sep, :quote_char, :stats
5
+ attr_reader :delimstats
3
6
 
4
7
  CandidateDelimiters = [',', '|', "\t"]
5
8
 
@@ -9,32 +12,68 @@ class Masticate::Sniffer
9
12
 
10
13
  def sniff
11
14
  @col_sep = find_col_sep
15
+ @quote_char = delimstats[@col_sep][:quote_char]
12
16
  @stats = stats
13
17
  {
14
18
  :col_sep => @col_sep,
19
+ :quote_char => @quote_char,
15
20
  :field_counts => @stats,
16
- :line1 => @line1
21
+ :headers => @line1.split(@col_sep).map(&:strip)
17
22
  }
18
23
  end
19
24
 
20
25
  def find_col_sep
21
- input = open(@filename)
22
- @line1 = input.lines.first
23
- delimcounts = CandidateDelimiters.each_with_object({}) do |delim,h|
24
- h[delim] = consider_delim(@line1, delim)
26
+ @delimstats = {}
27
+ with_input do |input|
28
+ input.lines.take(10).each do |line|
29
+ @line1 = line unless @line1
30
+
31
+ CandidateDelimiters.each do |delim|
32
+ delimstats[delim] ||= { :counts => Set.new, :quote_char => nil}
33
+ h = delimstats[delim]
34
+ fieldcount, quote_char = consider_delim(line, delim)
35
+ h[:counts] << fieldcount
36
+ h[:quote_char] ||= quote_char
37
+ end
38
+ end
25
39
  end
26
- input.close
27
- delimcounts.sort_by{|h,v| -v}.first.first
40
+ delimstats.sort_by{|delim,stats| stats[:counts].max || 0}.last.first
28
41
  end
29
42
 
30
43
  def consider_delim(line, delim)
31
- line.count(delim)
44
+ @quote_char = nil
45
+ n = count_fields(line, delim)
46
+ [n, @quote_char]
47
+ end
48
+
49
# Counts the fields in line for the candidate delimiter delim.
#
# For any delimiter other than a comma this is simply the number of
# delimiter characters plus one. For a comma the line is also parsed as CSV;
# when that yields fewer fields than the plain count, the quote-aware count
# is returned and @quote_char is set to '"' as a side effect. Input that is
# not valid CSV counts as zero fields.
def count_fields(line, delim)
  straight_count = line.count(delim) + 1
  return straight_count unless delim == ','

  count_with_quoting = begin
    # parse_line returns nil when the string holds no record at all; treat
    # that as zero fields instead of calling #count on nil
    (CSV.parse_line(line) || []).count
  rescue CSV::MalformedCSVError
    # this is not valid CSV, e.g. has incorrectly embedded quotes
    0
  end

  if count_with_quoting < straight_count
    @quote_char = '"'
    count_with_quoting
  else
    straight_count
  end
end
33
68
 
34
69
  def stats
35
- input = open(@filename)
36
- counts = input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
37
- input.close
70
+ counts = with_input do |input|
71
+ if col_sep == ',' && quote_char
72
+ input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[CSV.parse_line(line, :quote_char => quote_char).count] += 1}
73
+ else
74
+ input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
75
+ end
76
+ end
38
77
  counts
39
78
  end
40
79
  end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,9 +1,13 @@
1
1
  require "open-uri"
2
2
 
3
3
  require_relative "masticate/version"
4
+ require_relative "masticate/base"
4
5
  require_relative "masticate/sniffer"
5
6
  require_relative "masticate/mender"
6
7
  require_relative "masticate/csvify"
8
+ require_relative "masticate/plucker"
9
+ require_relative "masticate/datify"
10
+ require_relative "masticate/gsubber"
7
11
 
8
12
  module Masticate
9
13
  def self.sniff(filename)
@@ -17,4 +21,16 @@ module Masticate
17
21
  def self.csvify(filename, opts)
18
22
  Csvify.new(filename).csvify(opts)
19
23
  end
24
+
25
+ def self.pluck(filename, opts)
26
+ Plucker.new(filename).pluck(opts)
27
+ end
28
+
29
+ def self.datify(filename, opts)
30
+ Datify.new(filename).datify(opts)
31
+ end
32
+
33
+ def self.gsub(filename, opts)
34
+ Gsubber.new(filename).gsub(opts)
35
+ end
20
36
  end
@@ -0,0 +1,8 @@
1
+ ChangeDTTM,AuditCode,AuditByID
2
+ 1326448188,MERGE,"518,437.00"
3
+ 1327481049,MERGEMATCH,0.00
4
+ 1327405172,MERGEMATCH,0.00
5
+ 1327655234,MERGEMATCH,0.00
6
+ 1327486334,MERGE,"429,073.00"
7
+ 1326447578,MERGEMATCH,0.00
8
+ 1326297465,MERGE,"123,456.00"
@@ -0,0 +1,8 @@
1
+ ChangeDTTM,AuditCode,AuditByID
2
+ 1326448188,MERGE,"518,437.00"
3
+ 1327481049,MERGEMATCH,0.00
4
+ 1327405172,MERGEMATCH,0.00
5
+ 1327655234,MERGEMATCH,0.00
6
+ 1327486334,MERGE,"429,073.00"
7
+ 1326447578,MERGEMATCH,0.00
8
+ 1326297465,MERGE,"123,456.00"
@@ -0,0 +1,5 @@
1
+ 3/7/2012,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department
2
+ 15267,407,201201060140,407,201201060140,0,201201060309,L
3
+ 15267,381,201201060222,381,201201060222,0,201201060647,X
4
+ 15267,407,201201060311,407,201201060311,0,201201060339,L
5
+ 15267,407,201201060514,108,201201060515,108,201201060515,SEC
@@ -0,0 +1,4 @@
1
+ one,two,three,four,five
2
+ data1,data2,data3,data4,data5
3
+ 111,22,333,44,555
4
+ 91,92,93,94,95
@@ -0,0 +1,4 @@
1
+ three,five
2
+ data3,data5
3
+ 333,555
4
+ 93,95
@@ -0,0 +1,100 @@
1
+ site,ibex,unit,face,doctor,hospid,usrorder,dteorder,usrsend,dtesend,usrdone,dtedone,department,order_number
2
+ 1,20120106003230,2044272,L,407,15267,407,201201060140,407,201201060140,0,201201060309,L,"594,756"
3
+ 1,20120106003230,2044277,X,407,15267,381,201201060222,381,201201060222,0,201201060647,X,"594,761"
4
+ 1,20120106003230,2044309,L,407,15267,407,201201060311,407,201201060311,0,201201060339,L,"594,766"
5
+ 1,20120106003230,,Q,407,15267,407,201201060514,108,201201060515,108,201201060515,SEC,"594,787"
6
+ 1,20120106024355,,Q,407,15267,407,201201060309,90,201201060316,90,201201060316,IV,"594,764"
7
+ 1,20120106024355,2044306,L,407,15267,407,201201060309,407,201201060309,0,201201060345,L,"594,763"
8
+ 1,20120106024355,2044308,X,407,15267,407,201201060310,407,201201060310,0,201201060556,X,"594,765"
9
+ 1,20120106024355,2044307,L,407,15267,407,201201060309,407,201201060309,0,201201060333,L,"594,762"
10
+ 1,20120106024355,,Q,407,15267,407,201201060520,108,201201060522,108,201201060522,SEC,"594,789"
11
+ 1,20120106024355,2044579,L,407,15267,68,201201060826,68,201201060826,0,201201071149,L,"594,823"
12
+ 1,20120106032719,2044345,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,775"
13
+ 1,20120106032719,2044344,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,777"
14
+ 1,20120106032719,2044343,L,407,15267,407,201201060348,407,201201060348,0,201201060428,L,"594,773"
15
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060408,426,201201060408,IV,"594,774"
16
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060634,426,201201060634,URINE,"594,776"
17
+ 1,20120106032719,2044386,L,407,15267,407,201201060445,407,201201060445,0,201201060519,L,"594,785"
18
+ 1,20120106032719,2044401,X,407,15267,407,201201060521,407,201201060521,0,201201060646,X,"594,790"
19
+ 1,20120106033235,,Q,407,15267,407,201201060347,74,201201060353,74,201201060353,IV,"594,769"
20
+ 1,20120106033235,2044349,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,771"
21
+ 1,20120106033235,2044350,L,407,15267,407,201201060347,74,201201060353,0,201201060434,URINE,"594,770"
22
+ 1,20120106033235,2044347,L,407,15267,407,201201060347,74,201201060353,0,201201060428,L,"594,768"
23
+ 1,20120106033235,2044348,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,772"
24
+ 1,20120106033235,2044372,X,407,15267,407,201201060429,407,201201060429,0,201201060649,X,"594,780"
25
+ 1,20120106035346,,Q,407,15267,407,201201060446,426,201201060448,426,201201060448,N,"594,786"
26
+ 1,20120106041426,2044383,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,784"
27
+ 1,20120106041426,2044384,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,782"
28
+ 1,20120106041426,2044382,L,407,15267,407,201201060445,407,201201060445,0,201201060522,L,"594,781"
29
+ 1,20120106041426,,Q,407,15267,407,201201060445,381,201201060452,381,201201060452,IV,"594,783"
30
+ 1,20120106043025,2044400,X,407,15267,407,201201060515,407,201201060515,0,201201060554,X,"594,788"
31
+ 1,20120106045326,2044411,R,407,15267,407,201201060535,407,201201060535,0,201201060630,RS,"594,791"
32
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060540,108,201201060540,SEC,"594,794"
33
+ 1,20120106045326,2044412,R,407,15267,407,201201060535,407,201201060535,0,201201060629,RS,"594,795"
34
+ 1,20120106045326,2044413,X,407,15267,407,201201060536,407,201201060536,0,201201060649,X,"594,796"
35
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060541,108,201201060541,SEC,"594,792"
36
+ 1,20120106045326,2044410,R,407,15267,407,201201060535,407,201201060535,0,201201060628,RS,"594,793"
37
+ 1,20120106052714,2044421,L,407,15267,407,201201060544,407,201201060544,0,201201060605,L,"594,797"
38
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,IV,"594,799"
39
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,N,"594,800"
40
+ 1,20120106052714,2044422,L,407,15267,407,201201060544,407,201201060544,0,201201060621,L,"594,801"
41
+ 1,20120106052714,2044423,L,407,15267,407,201201060544,407,201201060544,0,201201060727,L,"594,798"
42
+ 1,20120106052714,2044424,L,407,15267,407,201201060551,407,201201060551,0,201201060714,L,"594,802"
43
+ 1,20120106070243,2044439,L,504,15550,504,201201060721,504,201201060721,0,201201060753,L,"594,803"
44
+ 1,20120106070243,2044440,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,807"
45
+ 1,20120106070243,2044441,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,806"
46
+ 1,20120106070243,,Q,504,15550,504,201201060721,155,201201060735,155,201201060735,IV,"594,805"
47
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,820"
48
+ 1,20120106070243,2044524,L,504,15550,504,201201060806,504,201201060806,0,201201061004,L,"594,816"
49
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060813,195,201201060813,SEC,"594,822"
50
+ 1,20120106070243,2044522,L,504,15550,504,201201060806,504,201201060806,0,201201060959,L,"594,819"
51
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060811,195,201201060811,SEC,"594,821"
52
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,818"
53
+ 1,20120106070243,,Q,504,15550,504,201201060910,155,201201060916,155,201201060916,N,"594,831"
54
+ 1,20120106070243,2044716,X,504,15550,504,201201060928,504,201201060928,0,201201060953,X,"594,834"
55
+ 1,20120106073142,2044480,X,504,15550,504,201201060757,504,201201060757,0,201201060819,X,"594,815"
56
+ 1,20120106073757,2044475,L,504,15550,504,201201060749,155,201201060755,0,201201060925,URINE,"594,810"
57
+ 1,20120106073757,2044466,L,504,15550,504,201201060749,504,201201060749,0,201201060827,L,"594,808"
58
+ 1,20120106073757,2044470,X,504,15550,504,201201060749,504,201201060749,0,201201060818,X,"594,809"
59
+ 1,20120106073757,2044467,L,504,15550,504,201201060749,504,201201060749,0,201201060826,L,"594,813"
60
+ 1,20120106073757,2044468,L,504,15550,504,201201060749,504,201201060749,0,201201060839,L,"594,811"
61
+ 1,20120106073757,2044469,L,504,15550,504,201201060749,504,201201060749,0,201201060825,L,"594,814"
62
+ 1,20120106073757,,Q,504,15550,504,201201060749,155,201201060755,155,201201060755,IV,"594,812"
63
+ 1,20120106073757,,Q,504,15550,504,201201060911,76,201201060933,76,201201060933,IV,"594,832"
64
+ 1,20120106073757,,Q,504,15550,504,201201060928,34,201201060934,34,201201060934,SEC,"594,833"
65
+ 1,20120106073757,,Q,504,15550,504,201201061022,155,201201061108,155,201201061108,IV,"594,862"
66
+ 1,20120106073757,,Q,504,15550,504,201201061019,155,201201061025,155,201201061025,IV,"594,861"
67
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,896"
68
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,895"
69
+ 1,20120106073757,2045028,X,504,15550,504,201201061131,504,201201061131,0,201201061209,X,"594,898"
70
+ 1,20120106073757,2045029,X,504,15550,504,201201061131,504,201201061131,0,201201061345,X,"594,897"
71
+ 1,20120106073757,,Q,504,15550,504,201201061131,155,201201061223,155,201201061223,N,"594,894"
72
+ 1,20120106084347,2044639,X,504,15550,76,201201060850,76,201201060850,0,201201060931,X,"594,828"
73
+ 1,20120106084720,2044670,X,55,4644,55,201201060909,55,201201060909,0,201201060934,X,"594,829"
74
+ 1,20120106084720,,Q,55,4644,55,201201060910,66,201201060914,66,201201060914,N,"594,830"
75
+ 1,20120106085558,2044755,L,55,4644,55,201201060949,55,201201060949,0,201201061018,L,"594,846"
76
+ 1,20120106085558,2044756,L,55,4644,55,201201060949,55,201201060949,0,201201061038,L,"594,851"
77
+ 1,20120106085558,2044793,L,55,4644,55,201201060949,76,201201061003,0,201201061239,URINE,"594,848"
78
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,850"
79
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,847"
80
+ 1,20120106085558,2044757,L,55,4644,55,201201060949,55,201201060949,0,201201061040,L,"594,849"
81
+ 1,20120106085558,2044843,L,55,4644,55,201201061033,55,201201061033,0,201201071505,L,"594,864"
82
+ 1,20120106085558,2044841,X,55,4644,55,201201061032,55,201201061032,0,201201061136,X,"594,863"
83
+ 1,20120106085558,2044844,L,55,4644,55,201201061033,55,201201061033,0,201201061119,L,"594,865"
84
+ 1,20120106085558,,Q,55,4644,55,201201061228,195,201201061240,195,201201061240,SEC,"594,961"
85
+ 1,20120106091726,2044741,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,839"
86
+ 1,20120106091726,2044745,X,504,15550,504,201201060942,504,201201060942,0,201201061016,X,"594,835"
87
+ 1,20120106091726,2044746,L,504,15550,504,201201060942,504,201201060942,0,201201061107,L,"594,842"
88
+ 1,20120106091726,2044740,L,504,15550,504,201201060942,504,201201060942,0,201201061017,L,"594,836"
89
+ 1,20120106091726,2044744,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,838"
90
+ 1,20120106091726,2044742,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,841"
91
+ 1,20120106091726,,Q,504,15550,504,201201060942,66,201201060944,66,201201060944,IV,"594,837"
92
+ 1,20120106091726,2044743,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,840"
93
+ 1,20120106095129,2044814,X,55,4644,55,201201061010,55,201201061010,0,201201061037,X,"594,853"
94
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,IV,"594,857"
95
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,858"
96
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,859"
97
+ 1,20120106100014,2044815,L,504,15550,504,201201061011,504,201201061011,0,201201061023,L,"594,854"
98
+ 1,20120106100014,2044817,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,856"
99
+ 1,20120106100014,2044818,X,504,15550,504,201201061011,504,201201061011,0,201201061038,X,"594,855"
100
+ 1,20120106100014,2044816,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,860"
@@ -11,7 +11,7 @@ describe "csvification" do
11
11
  output = File.read(tmp)
12
12
  tmp.unlink
13
13
  output.lines.count.should == 5
14
- results[:input_count].should == 5
14
+ results[:input_count].should == 6
15
15
  results[:output_count].should == 5
16
16
  end
17
17
  end
@@ -0,0 +1,16 @@
1
+ # spec for field regexp conversion
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+
6
# Exercises Masticate.gsub end to end against the badnums fixtures.
describe "gsubbing" do
  it "should apply conversion to a single column" do
    filename = File.dirname(__FILE__) + "/../data/badnums.csv"
    correct_output = File.read(File.dirname(__FILE__) + "/../data/badnums_fixed.csv")

    tmp = Tempfile.new('gsubber')
    begin
      # NOTE(review): :from is a plain string here, so the surrounding slashes
      # are presumably treated as literal pattern characters rather than regexp
      # delimiters -- confirm the fixture expectation is the intended one.
      Masticate.gsub(filename, :output => tmp, :field => 'AuditByID', :from => '/,|(.00$)/', :to => '')
      output = File.read(tmp)
    ensure
      # remove the scratch file even when the conversion raises
      tmp.unlink
    end

    output.should == correct_output
  end
end
@@ -6,15 +6,24 @@ describe "mending" do
6
6
  it "should merge lines when delimiter counts don't match'" do
7
7
  filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
8
8
  results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null")
9
- results[:input_records].should == 7
10
- results[:output_records].should == 5
9
+ results[:input_count].should == 7
10
+ results[:output_count].should == 5
11
11
  end
12
12
 
13
13
  it "should strip trailer records" do
14
14
  filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
15
15
  metadata = Masticate.sniff(filename)
16
16
  results = Masticate.mend(filename, metadata.merge(:output => "/dev/null"))
17
- results[:input_records].should == 9
18
- results[:output_records].should == 5
17
+ results[:input_count].should == 9
18
+ results[:output_count].should == 5
19
+ results[:headers].should == ['COL1', 'COL 2', 'Col 3', 'col-4', 'col5', 'col6']
20
+ end
21
+
22
+ it "should snip head fields" do
23
+ filename = File.dirname(__FILE__) + "/../data/junk_header.csv"
24
+ results = Masticate.mend(filename, :col_sep => ',', :snip => 1, :output => "/dev/null")
25
+ results[:input_count].should == 6
26
+ results[:output_count].should == 5
27
+ results[:headers].should == %w(hospid usrorder dteorder usrsend dtesend usrdone dtedone department)
19
28
  end
20
29
  end
@@ -0,0 +1,18 @@
1
+ # spec for column-plucking functions
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+
6
# Exercises Masticate.pluck end to end against the namedcols fixtures.
describe "plucker" do
  it "should pull named columns" do
    spec_dir = File.dirname(__FILE__)
    source_path = spec_dir + "/../data/namedcols.csv"
    expected = File.read(spec_dir + "/../data/namedcols.csv.output")

    # run the filter into a scratch file, capture what it wrote, then clean up
    scratch = Tempfile.new('plucker')
    summary = Masticate.pluck(source_path, :output => scratch, :fields => ['three', 'five'])
    actual = File.read(scratch)
    scratch.unlink

    summary[:input_count].should == 5
    actual.should == expected
  end
end
@@ -16,4 +16,12 @@ describe "delimiter sniffing" do
16
16
  results[:col_sep].should == '|'
17
17
  results[:field_counts].should == {6 => 5}
18
18
  end
19
+
20
+ it "should recognize quotes in CSV sources" do
21
+ filename = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
22
+ results = Masticate.sniff(filename)
23
+ results[:col_sep].should == ','
24
+ results[:quote_char].should == '"'
25
+ results[:field_counts].should == {14 => 100}
26
+ end
19
27
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-04 00:00:00.000000000 Z
12
+ date: 2012-04-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2153254280 !ruby/object:Gem::Requirement
16
+ requirement: &2153339260 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153254280
24
+ version_requirements: *2153339260
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2153246900 !ruby/object:Gem::Requirement
27
+ requirement: &2153338760 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2153246900
35
+ version_requirements: *2153338760
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2153246180 !ruby/object:Gem::Requirement
38
+ requirement: &2153338100 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2153246180
46
+ version_requirements: *2153338100
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com
@@ -60,17 +60,29 @@ files:
60
60
  - Rakefile
61
61
  - bin/masticate
62
62
  - lib/masticate.rb
63
+ - lib/masticate/base.rb
63
64
  - lib/masticate/csvify.rb
65
+ - lib/masticate/datify.rb
66
+ - lib/masticate/gsubber.rb
64
67
  - lib/masticate/mender.rb
68
+ - lib/masticate/plucker.rb
65
69
  - lib/masticate/sniffer.rb
66
70
  - lib/masticate/version.rb
67
71
  - masticate.gemspec
72
+ - spec/data/badnums.csv
73
+ - spec/data/badnums_fixed.csv
68
74
  - spec/data/broken_psv.txt
75
+ - spec/data/junk_header.csv
69
76
  - spec/data/junk_trailer.txt
77
+ - spec/data/namedcols.csv
78
+ - spec/data/namedcols.csv.output
70
79
  - spec/data/pipe_data.txt
80
+ - spec/data/quoted_csv_data.txt
71
81
  - spec/data/tabbed_data.txt
72
82
  - spec/lib/csvify_spec.rb
83
+ - spec/lib/gsub_spec.rb
73
84
  - spec/lib/mend_spec.rb
85
+ - spec/lib/plucker_spec.rb
74
86
  - spec/lib/sniffer_spec.rb
75
87
  - spec/spec_helper.rb
76
88
  homepage: ''
@@ -98,12 +110,20 @@ signing_key:
98
110
  specification_version: 3
99
111
  summary: Utility functions for parsing incoming text data files.
100
112
  test_files:
113
+ - spec/data/badnums.csv
114
+ - spec/data/badnums_fixed.csv
101
115
  - spec/data/broken_psv.txt
116
+ - spec/data/junk_header.csv
102
117
  - spec/data/junk_trailer.txt
118
+ - spec/data/namedcols.csv
119
+ - spec/data/namedcols.csv.output
103
120
  - spec/data/pipe_data.txt
121
+ - spec/data/quoted_csv_data.txt
104
122
  - spec/data/tabbed_data.txt
105
123
  - spec/lib/csvify_spec.rb
124
+ - spec/lib/gsub_spec.rb
106
125
  - spec/lib/mend_spec.rb
126
+ - spec/lib/plucker_spec.rb
107
127
  - spec/lib/sniffer_spec.rb
108
128
  - spec/spec_helper.rb
109
129
  has_rdoc: