masticate 0.1.5 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/bin/masticate CHANGED
@@ -1,119 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative "../lib/masticate"
4
- require "optparse"
5
4
 
6
- command = ARGV.shift
5
+ masticator = Masticate::MyOptionParser.new
6
+ command, options, filenames = masticator.parse
7
7
 
8
- options = {}
9
- OptionParser.new do |opts|
10
- opts.banner = "Usage: example.rb [options]"
11
-
12
- opts.on("--format FORMAT", String, "Specify format") do |v|
13
- options[:format] = v
14
- end
15
-
16
- opts.on("--delim DELIMITER", String, "Specify field delimiter (character or TAB; default is ',')") do |v|
17
- options[:col_sep] = v
18
- options[:col_sep] = "\t" if options[:col_sep] == "TAB"
19
- end
20
-
21
- opts.on("--quote QUOTE-CHAR", String, "Specify character used for quoting fields (optional; default is no quoting)") do |char|
22
- options[:quote_char] = char
23
- end
24
-
25
- opts.on("--stats", "(for *sniff*) collect & display input stats") do
26
- options[:stats] = true
27
- end
28
-
29
- opts.on("--fields LIST", Array, "Specify fields to select") do |list|
30
- options[:fields] = list
31
- end
32
-
33
- opts.on("--field FIELD", String, "Specify field to convert") do |f|
34
- options[:field] = f
35
- end
36
-
37
- opts.on("--snip DIRECTIVE", String, "Specify header fields to snip: first N, or by name") do |f|
38
- options[:snip] = f.to_i
39
- end
40
-
41
- opts.on("--from REGEXP", String, "Regular expression for gsub conversion") do |s|
42
- options[:from] = s
43
- end
44
-
45
- opts.on("--to STRING", String, "Result string for gsub conversion") do |s|
46
- options[:to] = s
47
- end
48
-
49
- opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |b|
50
- options[:inlined] = true
51
- end
52
-
53
- opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |b|
54
- options[:dejunk] = true
55
- end
56
-
57
- opts.on("--by FIELD", String, "(for *maxrows* only) Field to group by") do |f|
58
- options[:by] = f
59
- end
60
-
61
- opts.on("--max FIELD", String, "(for *maxrows* only) Field to find max value for") do |f|
62
- options[:max] = f
63
- end
64
- end.parse!
65
-
66
- filename = ARGV.shift # use stdin if no filename provided
67
-
68
- def logmessage(command, options, results)
69
- $stderr.puts <<-EOT
70
- * masticate #{command} (#{options.keys.join(', ')})
71
- Lines in input: #{results[:input_count]}
72
- Lines in output: #{results[:output_count]}
73
- EOT
74
- if results[:field_counts]
75
- $stderr.puts " Field counts: #{results[:field_counts].inspect}"
76
- end
77
- end
78
-
79
- case command
80
- when 'sniff'
81
- results = Masticate.sniff(filename, options)
82
- col_sep = results[:col_sep]
83
- col_sep = "TAB" if col_sep == "\t"
84
- quote_char = results[:quote_char] || "NONE"
85
- $stderr.puts <<-EOT
86
- Processing complete.
87
- Input delimiter: #{col_sep}
88
- Quote char: #{quote_char}
89
- Field counts: #{results[:field_counts].inspect}
90
- Headers: #{results[:headers].join(',')}
91
- EOT
92
-
93
- when 'mend'
94
- results = Masticate.mend(filename, options)
95
- logmessage(command, options, results)
96
-
97
- when 'csvify'
98
- results = Masticate.csvify(filename, options)
99
- logmessage(command, options, results)
100
-
101
- when 'pluck'
102
- results = Masticate.pluck(filename, options)
103
- logmessage(command, options, results)
104
-
105
- when 'datify'
106
- results = Masticate.datify(filename, options)
107
- logmessage(command, options, results)
108
-
109
- when 'gsub'
110
- results = Masticate.gsub(filename, options)
111
- logmessage(command, options, results)
112
-
113
- when 'maxrows'
114
- results = Masticate.maxrows(filename, options)
115
- logmessage(command, options, results)
116
-
117
- else
118
- raise "unknown command #{command}"
119
- end
8
+ masticator.execute(command, options, filenames)
@@ -4,8 +4,15 @@ class Masticate::Base
4
4
  attr_reader :input_count, :output_count
5
5
  attr_reader :csv_options
6
6
 
7
- def initialize(filename)
8
- @filename = filename
7
+ def initialize(args)
8
+ case args
9
+ when String
10
+ @filename = args
11
+ when Hash
12
+ configure(args)
13
+ else
14
+ raise "invalid initialization: #{args}"
15
+ end
9
16
  end
10
17
 
11
18
  def with_input
@@ -40,4 +47,8 @@ class Masticate::Base
40
47
  @csv_options[:quote_char] = opts[:quote_char] || "\0"
41
48
  end
42
49
  end
50
+
51
+ # def crunch(row)
52
+ # # noop
53
+ # end
43
54
  end
@@ -0,0 +1,21 @@
1
+ # concatenate input files:
2
+ # * assuming that each input file has a single header line
3
+ # * writing a single header line to the output (just use the header line from the first file)
4
+ # * assuming that all the files have the same format (no validation)
5
+
6
+ class Masticate::Concat #< Masticate::Base
7
+ def initialize(filenames)
8
+ @filenames = filenames
9
+ end
10
+
11
+ def concat(opts)
12
+ File.unlink(opts[:output]) if opts[:output] && File.exists?(opts[:output])
13
+ redirect = ">>#{opts[:output]}" if opts[:output]
14
+
15
+ file1, *rest = @filenames
16
+ system "cat #{file1} #{redirect}"
17
+ rest.each do |file|
18
+ system "tail +2 #{file} #{redirect}"
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,48 @@
1
+ # cook up a recipe
2
+ # * single file as input
3
+ # * recipe from a file
4
+ # * multiple steps
5
+ # * single output
6
+
7
+ require "shellwords"
8
+
9
+ class Masticate::Cook < Masticate::Base
10
+ def initialize(filename)
11
+ @filename = filename
12
+ end
13
+
14
+ def cook(opts)
15
+ recipefile = opts[:recipe] or raise "missing recipe for cook"
16
+ recipe = File.read(recipefile).lines
17
+ standard_options(opts)
18
+
19
+ steps = recipe.map do |step|
20
+ # puts step
21
+ argv = Shellwords.split(step)
22
+ masticator = Masticate::MyOptionParser.new
23
+ command, options = masticator.parse(argv)
24
+ masticator.prepare(command, options)
25
+ end
26
+
27
+ @output_count = 0
28
+ headers = nil
29
+ with_input do |input|
30
+ while line = get
31
+ row = CSV.parse_line(line, csv_options)
32
+
33
+ steps.each do |step|
34
+ # puts "APPLY #{step} to #{row}"
35
+ row = step.crunch(row)
36
+ end
37
+
38
+ emit(row.to_csv) if row
39
+ end
40
+ end
41
+ @output.close if opts[:output]
42
+
43
+ {
44
+ :input_count => @input_count,
45
+ :output_count => @output_count
46
+ }
47
+ end
48
+ end
@@ -2,13 +2,21 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Gsubber < Masticate::Base
5
- def gsub(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
7
 
8
- field = opts[:field] or raise "missing field to gsub"
9
- from = Regexp.new(opts[:from]) or raise "Invalid regex '#{opts[:from]}' for conversion"
10
- to = opts[:to] or raise "missing 'to' string for gsub"
8
+ @field = opts[:field] or raise "missing field to gsub"
9
+ @from = Regexp.new(opts[:from]) or raise "Invalid regex '#{opts[:from]}' for conversion"
10
+ @to = opts[:to] or raise "missing 'to' string for gsub"
11
+ end
12
+
13
+ def set_headers(row)
14
+ @headers = row
15
+ @index = @headers.index(@field) or raise "Unable to find column '#{@field}' in headers"
16
+ end
11
17
 
18
+ def gsub(opts)
19
+ configure(opts)
12
20
  @output_count = 0
13
21
  headers = nil
14
22
  with_input do |input|
@@ -16,11 +24,11 @@ class Masticate::Gsubber < Masticate::Base
16
24
  row = CSV.parse_line(line, csv_options)
17
25
  if !headers
18
26
  headers = row
19
- index = headers.index(field) or raise "Unable to find column '#{field}' in headers"
27
+ index = headers.index(@field) or raise "Unable to find column '#{@field}' in headers"
20
28
  emit(line)
21
29
  else
22
30
  oldval = row[index]
23
- newval = oldval.gsub(from, to)
31
+ newval = oldval.gsub(@from, @to)
24
32
  row[index] = newval
25
33
  emit(row.to_csv)
26
34
  end
@@ -33,4 +41,13 @@ class Masticate::Gsubber < Masticate::Base
33
41
  :output_count => @output_count
34
42
  }
35
43
  end
44
+
45
+ def crunch(row)
46
+ if !@headers
47
+ set_headers(row)
48
+ else
49
+ row[@index] = row[@index].gsub(@from, @to)
50
+ end
51
+ row
52
+ end
36
53
  end
@@ -2,11 +2,15 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::MaxRows < Masticate::Base
5
- def maxrows(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
7
 
8
- groupby = opts[:by] or raise "missing field to group by"
9
- maxon = opts[:max] or raise "missing field to max on"
8
+ @groupby = opts[:by] or raise "missing field to group by"
9
+ @maxon = opts[:max] or raise "missing field to max on"
10
+ end
11
+
12
+ def maxrows(opts)
13
+ configure(opts)
10
14
 
11
15
  @output_count = 0
12
16
  headers = nil
@@ -16,8 +20,8 @@ class Masticate::MaxRows < Masticate::Base
16
20
  row = CSV.parse_line(line, csv_options)
17
21
  if !headers
18
22
  headers = row
19
- index_by = headers.index(groupby) or raise "Unable to find column '#{groupby}'"
20
- index_max = headers.index(maxon) or raise "Unable to find column '#{maxon}'"
23
+ index_by = headers.index(@groupby) or raise "Unable to find column '#{@groupby}'"
24
+ index_max = headers.index(@maxon) or raise "Unable to find column '#{@maxon}'"
21
25
  emit(line)
22
26
  else
23
27
  key = row[index_by]
@@ -45,4 +49,30 @@ class Masticate::MaxRows < Masticate::Base
45
49
  :output_count => @output_count
46
50
  }
47
51
  end
52
+
53
+ def crunch(row)
54
+ if !@headers
55
+ @headers = row
56
+ @index_by = row.index(@groupby) or raise "Unable to find column '#{@groupby}'"
57
+ @index_max = row.index(@maxon) or raise "Unable to find column '#{@maxon}'"
58
+ @accum = {}
59
+ row
60
+ elsif row.nil?
61
+ # output the accumulated results
62
+ @accum.each do |k,row|
63
+ emit(row.to_csv)
64
+ end
65
+ else
66
+ key = row[@index_by]
67
+ if !@accum[key]
68
+ @accum[key] = row
69
+ else
70
+ oldscore = @accum[key][@index_max]
71
+ newscore = row[@index_max]
72
+ if newscore > oldscore
73
+ @accum[key] = row
74
+ end
75
+ end
76
+ end
77
+ end
48
78
  end
@@ -0,0 +1,163 @@
1
+ require "optparse"
2
+
3
+ class Masticate::MyOptionParser
4
+ attr_reader :command, :options
5
+
6
+ def initialize
7
+ @options = {}
8
+ @parser = OptionParser.new do |opts|
9
+ opts.banner = "Usage: masticate [command] [options]"
10
+
11
+ opts.on("--output FILENAME", String, "Redirect output from stdout to file") do |f|
12
+ @options[:output] = f
13
+ end
14
+
15
+ opts.on("--format FORMAT", String, "Specify format") do |v|
16
+ @options[:format] = v
17
+ end
18
+
19
+ opts.on("--delim DELIMITER", String, "Specify field delimiter (character or TAB; default is ',')") do |v|
20
+ @options[:col_sep] = v
21
+ @options[:col_sep] = "\t" if @options[:col_sep] == "TAB"
22
+ end
23
+
24
+ opts.on("--quote QUOTE-CHAR", String, "Specify character used for quoting fields (optional; default is no quoting)") do |char|
25
+ @options[:quote_char] = char
26
+ end
27
+
28
+ opts.on("--stats", "(for *sniff*) collect & display input stats") do
29
+ @options[:stats] = true
30
+ end
31
+
32
+ opts.on("--fields LIST", Array, "Specify fields to select") do |list|
33
+ @options[:fields] = list
34
+ end
35
+
36
+ opts.on("--field FIELD", String, "Specify field to convert") do |f|
37
+ @options[:field] = f
38
+ end
39
+
40
+ opts.on("--snip DIRECTIVE", String, "Specify header fields to snip: first N, or by name") do |f|
41
+ @options[:snip] = f.to_i
42
+ end
43
+
44
+ opts.on("--from REGEXP", String, "Regular expression for gsub conversion") do |s|
45
+ @options[:from] = s
46
+ end
47
+
48
+ # if I specify String here, then a blank string '' is considered invalid and triggers an exception.
49
+ opts.on("--to STRING", "Result string for gsub conversion") do |s|
50
+ @options[:to] = s
51
+ end
52
+
53
+ opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |b|
54
+ @options[:inlined] = true
55
+ end
56
+
57
+ opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |b|
58
+ @options[:dejunk] = true
59
+ end
60
+
61
+ opts.on("--by FIELD", String, "(for *maxrows* only) Field to group by") do |f|
62
+ @options[:by] = f
63
+ end
64
+
65
+ opts.on("--max FIELD", String, "(for *maxrows* only) Field to find max value for") do |f|
66
+ @options[:max] = f
67
+ end
68
+
69
+ opts.on("--recipe FILENAME", String, "(*cook* only) Recipe file") do |f|
70
+ @options[:recipe] = f
71
+ end
72
+ end
73
+ end
74
+
75
+ def parse(argv = ARGV)
76
+ @command = argv.shift
77
+ filenames = @parser.parse(argv)
78
+ # argv remnants are filenames
79
+ [@command, @options, filenames]
80
+ end
81
+
82
+ def prepare(command, options)
83
+ klasses = {
84
+ 'gsub' => Masticate::Gsubber,
85
+ 'datify' => Masticate::Datify,
86
+ 'maxrows' => Masticate::MaxRows,
87
+ 'relabel' => Masticate::Relabel,
88
+ 'pluck' => Masticate::Plucker
89
+ }
90
+
91
+ klass = klasses[command]
92
+ klass.new(options)
93
+ end
94
+
95
+ def execute(command, options, filenames = nil)
96
+ filename = filenames.first
97
+
98
+ case command
99
+ when 'sniff'
100
+ results = Masticate.sniff(filename, options)
101
+ col_sep = results[:col_sep]
102
+ col_sep = "TAB" if col_sep == "\t"
103
+ quote_char = results[:quote_char] || "NONE"
104
+ $stderr.puts <<-EOT
105
+ Processing complete.
106
+ Input delimiter: #{col_sep}
107
+ Quote char: #{quote_char}
108
+ Field counts: #{results[:field_counts].inspect}
109
+ Headers: #{results[:headers].join(',')}
110
+ EOT
111
+
112
+ when 'mend'
113
+ results = Masticate.mend(filename, options)
114
+ logmessage(command, options, results)
115
+
116
+ when 'csvify'
117
+ results = Masticate.csvify(filename, options)
118
+ logmessage(command, options, results)
119
+
120
+ when 'pluck'
121
+ results = Masticate.pluck(filename, options)
122
+ logmessage(command, options, results)
123
+
124
+ when 'datify'
125
+ results = Masticate.datify(filename, options)
126
+ logmessage(command, options, results)
127
+
128
+ when 'gsub'
129
+ results = Masticate.gsub(filename, options)
130
+ logmessage(command, options, results)
131
+
132
+ when 'maxrows'
133
+ results = Masticate.maxrows(filename, options)
134
+ logmessage(command, options, results)
135
+
136
+ when 'concat'
137
+ results = Masticate.concat(ARGV, options)
138
+ # logmessage(command, options, results)
139
+
140
+ when 'relabel'
141
+ results = Masticate.relabel(filename, options)
142
+ # logmessage(command, options, results)
143
+
144
+ when 'cook'
145
+ results = Masticate.cook(filename, options)
146
+ logmessage(command, options, results)
147
+
148
+ else
149
+ raise "unknown command #{command}"
150
+ end
151
+ end
152
+
153
+ def logmessage(command, options, results)
154
+ $stderr.puts <<-EOT
155
+ * masticate #{command} (#{options.keys.join(', ')})
156
+ Lines in input: #{results[:input_count]}
157
+ Lines in output: #{results[:output_count]}
158
+ EOT
159
+ if results[:field_counts]
160
+ $stderr.puts " Field counts: #{results[:field_counts].inspect}"
161
+ end
162
+ end
163
+ end
@@ -2,10 +2,17 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Plucker < Masticate::Base
5
- def pluck(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
7
 
8
- fields = opts[:fields] or raise "missing fields to pluck"
8
+ @fields = opts[:fields] or raise "missing fields to pluck"
9
+ end
10
+
11
+ def pluck(opts)
12
+ configure(opts)
13
+ # standard_options(opts)
14
+ #
15
+ # fields = opts[:fields] or raise "missing fields to pluck"
9
16
 
10
17
  @output_count = 0
11
18
  headers = nil
@@ -14,7 +21,7 @@ class Masticate::Plucker < Masticate::Base
14
21
  row = CSV.parse_line(line, csv_options)
15
22
  if !headers
16
23
  headers = row
17
- indexes = fields.map do |f|
24
+ indexes = @fields.map do |f|
18
25
  case f
19
26
  when String
20
27
  headers.index(f) or raise "Unable to find column '#{f}'"
@@ -41,4 +48,27 @@ class Masticate::Plucker < Masticate::Base
41
48
  :output_count => @output_count
42
49
  }
43
50
  end
51
+
52
+ def crunch(row)
53
+ if !@headers
54
+ @headers = row
55
+ @indexes = @fields.map do |f|
56
+ case f
57
+ when String
58
+ row.index(f) or raise "Unable to find column '#{f}'"
59
+ when Fixnum
60
+ if f > row.count
61
+ raise "Cannot pluck column #{f}, there are only #{row.count} fields"
62
+ else
63
+ f-1
64
+ end
65
+ else
66
+ raise "Invalid field descriptor '#{f}'"
67
+ end
68
+ end
69
+ end
70
+
71
+ # output is just the selected columns
72
+ @indexes.map {|i| row[i]}
73
+ end
44
74
  end
@@ -0,0 +1,44 @@
1
+ # relabel a single input file
2
+ # * assuming that input file has a single header line
3
+ # * assuming that input file is in valid CSV format (no validation)
4
+
5
+ class Masticate::Relabel < Masticate::Base
6
+ def configure(opts)
7
+ standard_options(opts)
8
+
9
+ @fields = opts[:fields] or raise "missing fieldnames for relabel"
10
+ end
11
+
12
+ def relabel(opts)
13
+ configure(opts)
14
+
15
+ @output_count = 0
16
+ headers = nil
17
+ with_input do |input|
18
+ while line = get
19
+ row = CSV.parse_line(line, csv_options)
20
+ if !headers
21
+ headers = @fields
22
+ emit(headers.to_csv)
23
+ else
24
+ emit(row.to_csv)
25
+ end
26
+ end
27
+ end
28
+ @output.close if opts[:output]
29
+
30
+ # File.unlink(opts[:output]) if opts[:output] && File.exists?(opts[:output])
31
+ # redirect = ">>#{opts[:output]}" if opts[:output]
32
+ #
33
+ # system "/bin/echo -n '#{fields.to_csv}' #{redirect}"
34
+ # system "tail +2 #{@filename} #{redirect}"
35
+ end
36
+
37
+ def crunch(row)
38
+ if !@headers
39
+ @headers = @fields
40
+ row = @headers
41
+ end
42
+ row
43
+ end
44
+ end
@@ -4,7 +4,7 @@ class Masticate::Sniffer < Masticate::Base
4
4
  attr_reader :col_sep, :quote_char, :stats
5
5
  attr_reader :delimstats
6
6
 
7
- CandidateDelimiters = [',', '|', "\t"]
7
+ CandidateDelimiters = [',', '|', "\t", "~"]
8
8
 
9
9
  def initialize(filename)
10
10
  @filename = filename
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.1.5"
2
+ VERSION = "0.2"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  require "open-uri"
2
+ require "csv"
2
3
 
3
4
  require_relative "masticate/version"
4
5
  require_relative "masticate/base"
6
+ require_relative "masticate/myoptparse"
7
+
5
8
  require_relative "masticate/sniffer"
6
9
  require_relative "masticate/mender"
7
10
  require_relative "masticate/csvify"
@@ -9,6 +12,9 @@ require_relative "masticate/plucker"
9
12
  require_relative "masticate/datify"
10
13
  require_relative "masticate/gsubber"
11
14
  require_relative "masticate/max_rows"
15
+ require_relative "masticate/concat"
16
+ require_relative "masticate/relabel"
17
+ require_relative "masticate/cook"
12
18
 
13
19
  module Masticate
14
20
  def self.sniff(filename, opts = {})
@@ -38,4 +44,16 @@ module Masticate
38
44
  def self.maxrows(filename, opts)
39
45
  MaxRows.new(filename).maxrows(opts)
40
46
  end
47
+
48
+ def self.concat(filenames, opts)
49
+ Concat.new(filenames).concat(opts)
50
+ end
51
+
52
+ def self.relabel(filename, opts)
53
+ Relabel.new(filename).relabel(opts)
54
+ end
55
+
56
+ def self.cook(filename, opts)
57
+ Cook.new(filename).cook(opts)
58
+ end
41
59
  end
@@ -0,0 +1,108 @@
1
+ COL1 COL 2 Col 3 col-4 col5 col6
2
+ data data data d a t a data data
3
+ data data data d a t a data data
4
+ data data data d a t a data data
5
+ data data data d a t a data data
6
+ data| data |data |d a t a|data|data
7
+ data| data |data |d a t a|data|data
8
+ data| data |data |d a t a|data,data|data
9
+ data| data |data "more data" |d a t a|data|data
10
+ 1,20120106003230,2044272,L,407,15267,407,201201060140,407,201201060140,0,201201060309,L,"594,756"
11
+ 1,20120106003230,2044277,X,407,15267,381,201201060222,381,201201060222,0,201201060647,X,"594,761"
12
+ 1,20120106003230,2044309,L,407,15267,407,201201060311,407,201201060311,0,201201060339,L,"594,766"
13
+ 1,20120106003230,,Q,407,15267,407,201201060514,108,201201060515,108,201201060515,SEC,"594,787"
14
+ 1,20120106024355,,Q,407,15267,407,201201060309,90,201201060316,90,201201060316,IV,"594,764"
15
+ 1,20120106024355,2044306,L,407,15267,407,201201060309,407,201201060309,0,201201060345,L,"594,763"
16
+ 1,20120106024355,2044308,X,407,15267,407,201201060310,407,201201060310,0,201201060556,X,"594,765"
17
+ 1,20120106024355,2044307,L,407,15267,407,201201060309,407,201201060309,0,201201060333,L,"594,762"
18
+ 1,20120106024355,,Q,407,15267,407,201201060520,108,201201060522,108,201201060522,SEC,"594,789"
19
+ 1,20120106024355,2044579,L,407,15267,68,201201060826,68,201201060826,0,201201071149,L,"594,823"
20
+ 1,20120106032719,2044345,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,775"
21
+ 1,20120106032719,2044344,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,777"
22
+ 1,20120106032719,2044343,L,407,15267,407,201201060348,407,201201060348,0,201201060428,L,"594,773"
23
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060408,426,201201060408,IV,"594,774"
24
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060634,426,201201060634,URINE,"594,776"
25
+ 1,20120106032719,2044386,L,407,15267,407,201201060445,407,201201060445,0,201201060519,L,"594,785"
26
+ 1,20120106032719,2044401,X,407,15267,407,201201060521,407,201201060521,0,201201060646,X,"594,790"
27
+ 1,20120106033235,,Q,407,15267,407,201201060347,74,201201060353,74,201201060353,IV,"594,769"
28
+ 1,20120106033235,2044349,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,771"
29
+ 1,20120106033235,2044350,L,407,15267,407,201201060347,74,201201060353,0,201201060434,URINE,"594,770"
30
+ 1,20120106033235,2044347,L,407,15267,407,201201060347,74,201201060353,0,201201060428,L,"594,768"
31
+ 1,20120106033235,2044348,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,772"
32
+ 1,20120106033235,2044372,X,407,15267,407,201201060429,407,201201060429,0,201201060649,X,"594,780"
33
+ 1,20120106035346,,Q,407,15267,407,201201060446,426,201201060448,426,201201060448,N,"594,786"
34
+ 1,20120106041426,2044383,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,784"
35
+ 1,20120106041426,2044384,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,782"
36
+ 1,20120106041426,2044382,L,407,15267,407,201201060445,407,201201060445,0,201201060522,L,"594,781"
37
+ 1,20120106041426,,Q,407,15267,407,201201060445,381,201201060452,381,201201060452,IV,"594,783"
38
+ 1,20120106043025,2044400,X,407,15267,407,201201060515,407,201201060515,0,201201060554,X,"594,788"
39
+ 1,20120106045326,2044411,R,407,15267,407,201201060535,407,201201060535,0,201201060630,RS,"594,791"
40
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060540,108,201201060540,SEC,"594,794"
41
+ 1,20120106045326,2044412,R,407,15267,407,201201060535,407,201201060535,0,201201060629,RS,"594,795"
42
+ 1,20120106045326,2044413,X,407,15267,407,201201060536,407,201201060536,0,201201060649,X,"594,796"
43
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060541,108,201201060541,SEC,"594,792"
44
+ 1,20120106045326,2044410,R,407,15267,407,201201060535,407,201201060535,0,201201060628,RS,"594,793"
45
+ 1,20120106052714,2044421,L,407,15267,407,201201060544,407,201201060544,0,201201060605,L,"594,797"
46
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,IV,"594,799"
47
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,N,"594,800"
48
+ 1,20120106052714,2044422,L,407,15267,407,201201060544,407,201201060544,0,201201060621,L,"594,801"
49
+ 1,20120106052714,2044423,L,407,15267,407,201201060544,407,201201060544,0,201201060727,L,"594,798"
50
+ 1,20120106052714,2044424,L,407,15267,407,201201060551,407,201201060551,0,201201060714,L,"594,802"
51
+ 1,20120106070243,2044439,L,504,15550,504,201201060721,504,201201060721,0,201201060753,L,"594,803"
52
+ 1,20120106070243,2044440,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,807"
53
+ 1,20120106070243,2044441,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,806"
54
+ 1,20120106070243,,Q,504,15550,504,201201060721,155,201201060735,155,201201060735,IV,"594,805"
55
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,820"
56
+ 1,20120106070243,2044524,L,504,15550,504,201201060806,504,201201060806,0,201201061004,L,"594,816"
57
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060813,195,201201060813,SEC,"594,822"
58
+ 1,20120106070243,2044522,L,504,15550,504,201201060806,504,201201060806,0,201201060959,L,"594,819"
59
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060811,195,201201060811,SEC,"594,821"
60
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,818"
61
+ 1,20120106070243,,Q,504,15550,504,201201060910,155,201201060916,155,201201060916,N,"594,831"
62
+ 1,20120106070243,2044716,X,504,15550,504,201201060928,504,201201060928,0,201201060953,X,"594,834"
63
+ 1,20120106073142,2044480,X,504,15550,504,201201060757,504,201201060757,0,201201060819,X,"594,815"
64
+ 1,20120106073757,2044475,L,504,15550,504,201201060749,155,201201060755,0,201201060925,URINE,"594,810"
65
+ 1,20120106073757,2044466,L,504,15550,504,201201060749,504,201201060749,0,201201060827,L,"594,808"
66
+ 1,20120106073757,2044470,X,504,15550,504,201201060749,504,201201060749,0,201201060818,X,"594,809"
67
+ 1,20120106073757,2044467,L,504,15550,504,201201060749,504,201201060749,0,201201060826,L,"594,813"
68
+ 1,20120106073757,2044468,L,504,15550,504,201201060749,504,201201060749,0,201201060839,L,"594,811"
69
+ 1,20120106073757,2044469,L,504,15550,504,201201060749,504,201201060749,0,201201060825,L,"594,814"
70
+ 1,20120106073757,,Q,504,15550,504,201201060749,155,201201060755,155,201201060755,IV,"594,812"
71
+ 1,20120106073757,,Q,504,15550,504,201201060911,76,201201060933,76,201201060933,IV,"594,832"
72
+ 1,20120106073757,,Q,504,15550,504,201201060928,34,201201060934,34,201201060934,SEC,"594,833"
73
+ 1,20120106073757,,Q,504,15550,504,201201061022,155,201201061108,155,201201061108,IV,"594,862"
74
+ 1,20120106073757,,Q,504,15550,504,201201061019,155,201201061025,155,201201061025,IV,"594,861"
75
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,896"
76
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,895"
77
+ 1,20120106073757,2045028,X,504,15550,504,201201061131,504,201201061131,0,201201061209,X,"594,898"
78
+ 1,20120106073757,2045029,X,504,15550,504,201201061131,504,201201061131,0,201201061345,X,"594,897"
79
+ 1,20120106073757,,Q,504,15550,504,201201061131,155,201201061223,155,201201061223,N,"594,894"
80
+ 1,20120106084347,2044639,X,504,15550,76,201201060850,76,201201060850,0,201201060931,X,"594,828"
81
+ 1,20120106084720,2044670,X,55,4644,55,201201060909,55,201201060909,0,201201060934,X,"594,829"
82
+ 1,20120106084720,,Q,55,4644,55,201201060910,66,201201060914,66,201201060914,N,"594,830"
83
+ 1,20120106085558,2044755,L,55,4644,55,201201060949,55,201201060949,0,201201061018,L,"594,846"
84
+ 1,20120106085558,2044756,L,55,4644,55,201201060949,55,201201060949,0,201201061038,L,"594,851"
85
+ 1,20120106085558,2044793,L,55,4644,55,201201060949,76,201201061003,0,201201061239,URINE,"594,848"
86
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,850"
87
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,847"
88
+ 1,20120106085558,2044757,L,55,4644,55,201201060949,55,201201060949,0,201201061040,L,"594,849"
89
+ 1,20120106085558,2044843,L,55,4644,55,201201061033,55,201201061033,0,201201071505,L,"594,864"
90
+ 1,20120106085558,2044841,X,55,4644,55,201201061032,55,201201061032,0,201201061136,X,"594,863"
91
+ 1,20120106085558,2044844,L,55,4644,55,201201061033,55,201201061033,0,201201061119,L,"594,865"
92
+ 1,20120106085558,,Q,55,4644,55,201201061228,195,201201061240,195,201201061240,SEC,"594,961"
93
+ 1,20120106091726,2044741,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,839"
94
+ 1,20120106091726,2044745,X,504,15550,504,201201060942,504,201201060942,0,201201061016,X,"594,835"
95
+ 1,20120106091726,2044746,L,504,15550,504,201201060942,504,201201060942,0,201201061107,L,"594,842"
96
+ 1,20120106091726,2044740,L,504,15550,504,201201060942,504,201201060942,0,201201061017,L,"594,836"
97
+ 1,20120106091726,2044744,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,838"
98
+ 1,20120106091726,2044742,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,841"
99
+ 1,20120106091726,,Q,504,15550,504,201201060942,66,201201060944,66,201201060944,IV,"594,837"
100
+ 1,20120106091726,2044743,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,840"
101
+ 1,20120106095129,2044814,X,55,4644,55,201201061010,55,201201061010,0,201201061037,X,"594,853"
102
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,IV,"594,857"
103
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,858"
104
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,859"
105
+ 1,20120106100014,2044815,L,504,15550,504,201201061011,504,201201061011,0,201201061023,L,"594,854"
106
+ 1,20120106100014,2044817,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,856"
107
+ 1,20120106100014,2044818,X,504,15550,504,201201061011,504,201201061011,0,201201061038,X,"594,855"
108
+ 1,20120106100014,2044816,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,860"
@@ -0,0 +1,41 @@
1
+ two,eight,fourteen
2
+ 20120106003230,201201060140,594756
3
+ 20120106003230,201201060222,594761
4
+ 20120106003230,201201060311,594766
5
+ 20120106003230,201201060514,594787
6
+ 20120106024355,201201060309,594764
7
+ 20120106024355,201201060310,594765
8
+ 20120106024355,201201060520,594789
9
+ 20120106024355,201201060826,594823
10
+ 20120106032719,201201060348,594775
11
+ 20120106032719,201201060445,594785
12
+ 20120106032719,201201060521,594790
13
+ 20120106033235,201201060347,594769
14
+ 20120106033235,201201060429,594780
15
+ 20120106035346,201201060446,594786
16
+ 20120106041426,201201060445,594784
17
+ 20120106043025,201201060515,594788
18
+ 20120106045326,201201060535,594791
19
+ 20120106045326,201201060536,594796
20
+ 20120106052714,201201060544,594797
21
+ 20120106052714,201201060551,594802
22
+ 20120106070243,201201060721,594803
23
+ 20120106070243,201201060806,594820
24
+ 20120106070243,201201060807,594822
25
+ 20120106070243,201201060910,594831
26
+ 20120106070243,201201060928,594834
27
+ 20120106073142,201201060757,594815
28
+ 20120106073757,201201060749,594810
29
+ 20120106073757,201201060911,594832
30
+ 20120106073757,201201060928,594833
31
+ 20120106073757,201201061022,594862
32
+ 20120106073757,201201061131,594896
33
+ 20120106084347,201201060850,594828
34
+ 20120106084720,201201060909,594829
35
+ 20120106084720,201201060910,594830
36
+ 20120106085558,201201060949,594846
37
+ 20120106085558,201201061033,594864
38
+ 20120106085558,201201061228,594961
39
+ 20120106091726,201201060942,594839
40
+ 20120106095129,201201061010,594853
41
+ 20120106100014,201201061011,594857
@@ -0,0 +1,4 @@
1
+ gsub --field order_number --from ',|(.00$)' --to ''
2
+ relabel --fields one,two,three,four,five,six,seven,eight,nine,ten,eleven,twelve,thirteen,fourteen
3
+ pluck --fields two,eight,fourteen
4
+ maxrows --by two --max eight
@@ -0,0 +1,4 @@
1
+ happy,birth,day,to,you
2
+ data1,data2,data3,data4,data5
3
+ 111,22,333,44,555
4
+ 91,92,93,94,95
@@ -0,0 +1,5 @@
1
+ COL1~COL 2~Col 3 ~col-4~ col5 ~col6
2
+ data~ data ~data ~d a t a~data~data
3
+ data~ data ~data ~d a t a~data~data
4
+ data~ data ~data ~d a t a~data,data~data
5
+ data~ data ~data "more data" ~d a t a~data~data
@@ -0,0 +1,17 @@
1
+ # spec for file concatenation
2
+
3
+ require "spec_helper"
4
+
5
+ describe "concatenation" do
6
+ it "should leave just one header row in the result" do
7
+ file1 = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
8
+ file2 = File.dirname(__FILE__) + "/../data/pipe_data.txt"
9
+ file3 = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
10
+ tmp = Tempfile.new('concat')
11
+ results = Masticate.concat([file1, file2, file3], :output => tmp.path)
12
+ output = File.read(tmp)
13
+ tmp.unlink
14
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/concat_result.txt")
15
+ output.should == correct_output
16
+ end
17
+ end
@@ -0,0 +1,16 @@
1
+ # spec for cookery
2
+
3
+ require "spec_helper"
4
+
5
+ describe "cooking up a recipe" do
6
+ it "should handle multiple steps" do
7
+ input = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
8
+ recipe = File.dirname(__FILE__) + "/../data/recipe.txt"
9
+ tmp = Tempfile.new('cooked')
10
+ results = Masticate.cook(input, :output => tmp, :recipe => recipe)
11
+ output = File.read(tmp)
12
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/cooking_result.csv")
13
+
14
+ output.should == correct_output
15
+ end
16
+ end
@@ -1,7 +1,6 @@
1
1
  # spec for field regexp conversion
2
2
 
3
3
  require "spec_helper"
4
- require "tempfile"
5
4
 
6
5
  describe "gsubbing" do
7
6
  it "should apply conversion to a single column" do
@@ -0,0 +1,15 @@
1
+ # spec for file concatenation
2
+
3
+ require "spec_helper"
4
+
5
+ describe "relabeling" do
6
+ it "result should be same as original" do
7
+ input = File.dirname(__FILE__) + "/../data/namedcols.csv"
8
+ tmp = Tempfile.new('relabel')
9
+ results = Masticate.relabel(input, :fields => %w{happy birth day to you}, :output => tmp.path)
10
+ output = File.read(tmp)
11
+ tmp.unlink
12
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/relabel_result.csv")
13
+ output.should == correct_output
14
+ end
15
+ end
@@ -31,4 +31,11 @@ describe "delimiter sniffing" do
31
31
  results[:quote_char].should == '"'
32
32
  results[:field_counts].should == {14 => 100}
33
33
  end
34
+
35
+ it "should find tilde delimiter" do
36
+ filename = File.dirname(__FILE__) + "/../data/tilde_data.txt"
37
+ results = Masticate.sniff(filename, :stats => true)
38
+ results[:col_sep].should == '~'
39
+ results[:field_counts].should == {6 => 5}
40
+ end
34
41
  end
data/spec/spec_helper.rb CHANGED
@@ -2,6 +2,7 @@
2
2
  # require 'rspec/rails'
3
3
  require 'rspec/autorun'
4
4
  # require 'capybara/rspec'
5
+ require "tempfile"
5
6
 
6
7
  # Requires supporting ruby files with custom matchers and macros, etc,
7
8
  # in spec/support/ and its subdirectories.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-16 00:00:00.000000000 Z
12
+ date: 2012-04-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2156618420 !ruby/object:Gem::Requirement
16
+ requirement: &2153649040 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2156618420
24
+ version_requirements: *2153649040
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2156617660 !ruby/object:Gem::Requirement
27
+ requirement: &2153648360 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2156617660
35
+ version_requirements: *2153648360
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2156616980 !ruby/object:Gem::Requirement
38
+ requirement: &2153647700 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2156616980
46
+ version_requirements: *2153647700
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com
@@ -61,18 +61,24 @@ files:
61
61
  - bin/masticate
62
62
  - lib/masticate.rb
63
63
  - lib/masticate/base.rb
64
+ - lib/masticate/concat.rb
65
+ - lib/masticate/cook.rb
64
66
  - lib/masticate/csvify.rb
65
67
  - lib/masticate/datify.rb
66
68
  - lib/masticate/gsubber.rb
67
69
  - lib/masticate/max_rows.rb
68
70
  - lib/masticate/mender.rb
71
+ - lib/masticate/myoptparse.rb
69
72
  - lib/masticate/plucker.rb
73
+ - lib/masticate/relabel.rb
70
74
  - lib/masticate/sniffer.rb
71
75
  - lib/masticate/version.rb
72
76
  - masticate.gemspec
73
77
  - spec/data/badnums.csv
74
78
  - spec/data/badnums_fixed.csv
75
79
  - spec/data/broken_psv.txt
80
+ - spec/data/concat_result.txt
81
+ - spec/data/cooking_result.csv
76
82
  - spec/data/events.csv
77
83
  - spec/data/events_reduced.csv
78
84
  - spec/data/inlined_headers.csv
@@ -83,12 +89,18 @@ files:
83
89
  - spec/data/namedcols.csv.output
84
90
  - spec/data/pipe_data.txt
85
91
  - spec/data/quoted_csv_data.txt
92
+ - spec/data/recipe.txt
93
+ - spec/data/relabel_result.csv
86
94
  - spec/data/tabbed_data.txt
95
+ - spec/data/tilde_data.txt
96
+ - spec/lib/concat_spec.rb
97
+ - spec/lib/cook_spec.rb
87
98
  - spec/lib/csvify_spec.rb
88
99
  - spec/lib/gsub_spec.rb
89
100
  - spec/lib/maxrow_spec.rb
90
101
  - spec/lib/mender_spec.rb
91
102
  - spec/lib/plucker_spec.rb
103
+ - spec/lib/relabel_spec.rb
92
104
  - spec/lib/sniffer_spec.rb
93
105
  - spec/spec_helper.rb
94
106
  homepage: ''
@@ -119,6 +131,8 @@ test_files:
119
131
  - spec/data/badnums.csv
120
132
  - spec/data/badnums_fixed.csv
121
133
  - spec/data/broken_psv.txt
134
+ - spec/data/concat_result.txt
135
+ - spec/data/cooking_result.csv
122
136
  - spec/data/events.csv
123
137
  - spec/data/events_reduced.csv
124
138
  - spec/data/inlined_headers.csv
@@ -129,12 +143,18 @@ test_files:
129
143
  - spec/data/namedcols.csv.output
130
144
  - spec/data/pipe_data.txt
131
145
  - spec/data/quoted_csv_data.txt
146
+ - spec/data/recipe.txt
147
+ - spec/data/relabel_result.csv
132
148
  - spec/data/tabbed_data.txt
149
+ - spec/data/tilde_data.txt
150
+ - spec/lib/concat_spec.rb
151
+ - spec/lib/cook_spec.rb
133
152
  - spec/lib/csvify_spec.rb
134
153
  - spec/lib/gsub_spec.rb
135
154
  - spec/lib/maxrow_spec.rb
136
155
  - spec/lib/mender_spec.rb
137
156
  - spec/lib/plucker_spec.rb
157
+ - spec/lib/relabel_spec.rb
138
158
  - spec/lib/sniffer_spec.rb
139
159
  - spec/spec_helper.rb
140
160
  has_rdoc: