masticate 0.1.5 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/masticate CHANGED
@@ -1,119 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative "../lib/masticate"
4
- require "optparse"
5
4
 
6
- command = ARGV.shift
5
+ masticator = Masticate::MyOptionParser.new
6
+ command, options, filenames = masticator.parse
7
7
 
8
- options = {}
9
- OptionParser.new do |opts|
10
- opts.banner = "Usage: example.rb [options]"
11
-
12
- opts.on("--format FORMAT", String, "Specify format") do |v|
13
- options[:format] = v
14
- end
15
-
16
- opts.on("--delim DELIMITER", String, "Specify field delimiter (character or TAB; default is ',')") do |v|
17
- options[:col_sep] = v
18
- options[:col_sep] = "\t" if options[:col_sep] == "TAB"
19
- end
20
-
21
- opts.on("--quote QUOTE-CHAR", String, "Specify character used for quoting fields (optional; default is no quoting)") do |char|
22
- options[:quote_char] = char
23
- end
24
-
25
- opts.on("--stats", "(for *sniff*) collect & display input stats") do
26
- options[:stats] = true
27
- end
28
-
29
- opts.on("--fields LIST", Array, "Specify fields to select") do |list|
30
- options[:fields] = list
31
- end
32
-
33
- opts.on("--field FIELD", String, "Specify field to convert") do |f|
34
- options[:field] = f
35
- end
36
-
37
- opts.on("--snip DIRECTIVE", String, "Specify header fields to snip: first N, or by name") do |f|
38
- options[:snip] = f.to_i
39
- end
40
-
41
- opts.on("--from REGEXP", String, "Regular expression for gsub conversion") do |s|
42
- options[:from] = s
43
- end
44
-
45
- opts.on("--to STRING", String, "Result string for gsub conversion") do |s|
46
- options[:to] = s
47
- end
48
-
49
- opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do |b|
50
- options[:inlined] = true
51
- end
52
-
53
- opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do |b|
54
- options[:dejunk] = true
55
- end
56
-
57
- opts.on("--by FIELD", String, "(for *maxrows* only) Field to group by") do |f|
58
- options[:by] = f
59
- end
60
-
61
- opts.on("--max FIELD", String, "(for *maxrows* only) Field to find max value for") do |f|
62
- options[:max] = f
63
- end
64
- end.parse!
65
-
66
- filename = ARGV.shift # use stdin if no filename provided
67
-
68
- def logmessage(command, options, results)
69
- $stderr.puts <<-EOT
70
- * masticate #{command} (#{options.keys.join(', ')})
71
- Lines in input: #{results[:input_count]}
72
- Lines in output: #{results[:output_count]}
73
- EOT
74
- if results[:field_counts]
75
- $stderr.puts " Field counts: #{results[:field_counts].inspect}"
76
- end
77
- end
78
-
79
- case command
80
- when 'sniff'
81
- results = Masticate.sniff(filename, options)
82
- col_sep = results[:col_sep]
83
- col_sep = "TAB" if col_sep == "\t"
84
- quote_char = results[:quote_char] || "NONE"
85
- $stderr.puts <<-EOT
86
- Processing complete.
87
- Input delimiter: #{col_sep}
88
- Quote char: #{quote_char}
89
- Field counts: #{results[:field_counts].inspect}
90
- Headers: #{results[:headers].join(',')}
91
- EOT
92
-
93
- when 'mend'
94
- results = Masticate.mend(filename, options)
95
- logmessage(command, options, results)
96
-
97
- when 'csvify'
98
- results = Masticate.csvify(filename, options)
99
- logmessage(command, options, results)
100
-
101
- when 'pluck'
102
- results = Masticate.pluck(filename, options)
103
- logmessage(command, options, results)
104
-
105
- when 'datify'
106
- results = Masticate.datify(filename, options)
107
- logmessage(command, options, results)
108
-
109
- when 'gsub'
110
- results = Masticate.gsub(filename, options)
111
- logmessage(command, options, results)
112
-
113
- when 'maxrows'
114
- results = Masticate.maxrows(filename, options)
115
- logmessage(command, options, results)
116
-
117
- else
118
- raise "unknown command #{command}"
119
- end
8
+ masticator.execute(command, options, filenames)
@@ -4,8 +4,15 @@ class Masticate::Base
4
4
  attr_reader :input_count, :output_count
5
5
  attr_reader :csv_options
6
6
 
7
- def initialize(filename)
8
- @filename = filename
7
+ def initialize(args)
8
+ case args
9
+ when String
10
+ @filename = args
11
+ when Hash
12
+ configure(args)
13
+ else
14
+ raise "invalid initialization: #{args}"
15
+ end
9
16
  end
10
17
 
11
18
  def with_input
@@ -40,4 +47,8 @@ class Masticate::Base
40
47
  @csv_options[:quote_char] = opts[:quote_char] || "\0"
41
48
  end
42
49
  end
50
+
51
+ # def crunch(row)
52
+ # # noop
53
+ # end
43
54
  end
@@ -0,0 +1,21 @@
1
+ # concatenate input files:
2
+ # * assuming that each input file has a single header line
3
+ # * writing a single header line to the output (just use the header line from the first file)
4
+ # * trusting that all the files have the same format (no validation)
5
+
6
# Concatenates several delimited text files into one result, keeping only
# the header line of the first file.
#
# Every input is assumed to start with exactly one header line and to share
# the same format; neither assumption is checked.
class Masticate::Concat #< Masticate::Base
  # Number of bytes copied per read while streaming a file.
  CHUNK_SIZE = 64 * 1024

  # @param filenames [Array<String>] paths of the files to join, in order
  def initialize(filenames)
    @filenames = filenames
  end

  # Writes the first file in full, then each later file minus its first line.
  #
  # The copy is done in Ruby rather than by shelling out to cat/tail, so
  # filenames containing spaces or shell metacharacters are handled safely.
  #
  # @param opts [Hash] :output is the path of the file to (re)create; when it
  #   is absent the result is written to standard output
  # @return [Array<String>] the input filenames (callers in view ignore it)
  def concat(opts)
    destination = opts[:output]
    # "wb" truncates an existing file, which replaces unlink-then-append.
    sink = destination ? File.open(destination, "wb") : $stdout

    begin
      @filenames.each_with_index do |path, position|
        File.open(path, "rb") do |source|
          source.gets unless position.zero? # skip the header of later files
          while (chunk = source.read(CHUNK_SIZE))
            sink.write(chunk)
          end
        end
      end
    ensure
      destination ? sink.close : sink.flush
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # cook up a recipe
2
+ # * single file as input
3
+ # * recipe from a file
4
+ # * multiple steps
5
+ # * single output
6
+
7
+ require "shellwords"
8
+
9
# Runs a multi-step recipe over a single input file, producing one output.
#
# The recipe file holds one masticate command line per line, for example
# "pluck --fields two,eight". Each parsed input row is handed to every step's
# #crunch in recipe order, and whatever comes out of the last step is emitted.
class Masticate::Cook < Masticate::Base
  # @param filename [String] path of the input file
  def initialize(filename)
    @filename = filename
  end

  # @param opts [Hash] must contain :recipe (path of the recipe file); it is
  #   also passed to standard_options, and @output is closed when :output is set
  # @return [Hash] :input_count and :output_count
  # @raise [RuntimeError] when opts has no :recipe
  def cook(opts)
    recipefile = opts[:recipe] or raise "missing recipe for cook"
    recipe = File.read(recipefile).lines
    standard_options(opts)

    steps = build_steps(recipe)

    @output_count = 0
    with_input do |input|
      while line = get
        row = apply_steps(steps, CSV.parse_line(line, csv_options))
        emit(row.to_csv) if row
      end
    end
    @output.close if opts[:output]

    {
      :input_count => @input_count,
      :output_count => @output_count
    }
  end

  private

  # Turns recipe lines into step objects; blank lines are ignored because
  # they carry no command to prepare.
  def build_steps(recipe)
    recipe.reject { |text| text.strip.empty? }.map do |text|
      argv = Shellwords.split(text)
      masticator = Masticate::MyOptionParser.new
      command, options = masticator.parse(argv)
      masticator.prepare(command, options)
    end
  end

  # Feeds a row through the steps in order. Once the row is nil (a blank
  # input line, or a step that swallowed the row) the remaining steps are
  # skipped, so they never receive nil in place of a row.
  def apply_steps(steps, row)
    steps.each do |step|
      break if row.nil?
      row = step.crunch(row)
    end
    row
  end
end
@@ -2,13 +2,21 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Gsubber < Masticate::Base
5
- def gsub(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
7
 
8
- field = opts[:field] or raise "missing field to gsub"
9
- from = Regexp.new(opts[:from]) or raise "Invalid regex '#{opts[:from]}' for conversion"
10
- to = opts[:to] or raise "missing 'to' string for gsub"
8
+ @field = opts[:field] or raise "missing field to gsub"
9
+ @from = Regexp.new(opts[:from]) or raise "Invalid regex '#{opts[:from]}' for conversion"
10
+ @to = opts[:to] or raise "missing 'to' string for gsub"
11
+ end
12
+
13
# Remembers the header row and looks up the column that will be rewritten.
#
# @param row [Array<String>] the header fields
# @return [Integer] zero-based position of @field within the headers
# @raise [RuntimeError] when @field is not among the headers
def set_headers(row)
  @headers = row
  @index = row.index(@field)
  raise "Unable to find column '#{@field}' in headers" unless @index
  @index
end
11
17
 
18
+ def gsub(opts)
19
+ configure(opts)
12
20
  @output_count = 0
13
21
  headers = nil
14
22
  with_input do |input|
@@ -16,11 +24,11 @@ class Masticate::Gsubber < Masticate::Base
16
24
  row = CSV.parse_line(line, csv_options)
17
25
  if !headers
18
26
  headers = row
19
- index = headers.index(field) or raise "Unable to find column '#{field}' in headers"
27
+ index = headers.index(@field) or raise "Unable to find column '#{@field}' in headers"
20
28
  emit(line)
21
29
  else
22
30
  oldval = row[index]
23
- newval = oldval.gsub(from, to)
31
+ newval = oldval.gsub(@from, @to)
24
32
  row[index] = newval
25
33
  emit(row.to_csv)
26
34
  end
@@ -33,4 +41,13 @@ class Masticate::Gsubber < Masticate::Base
33
41
  :output_count => @output_count
34
42
  }
35
43
  end
44
+
45
# Processes one parsed row when this object is used as a recipe step.
#
# The first row seen is taken to be the header row: it is recorded through
# set_headers and returned untouched. In every later row the value in the
# configured column is replaced by the result of gsub(@from, @to).
#
# @param row [Array] parsed fields of one line
# @return [Array] the same row object (modified in place for data rows)
def crunch(row)
  if !@headers
    set_headers(row)
  else
    cell = row[@index]
    # Empty CSV fields (and rows too short to reach the column) come through
    # as nil; leave them as they are instead of calling gsub on nil.
    row[@index] = cell.gsub(@from, @to) unless cell.nil?
  end
  row
end
36
53
  end
@@ -2,11 +2,15 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::MaxRows < Masticate::Base
5
- def maxrows(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
7
 
8
- groupby = opts[:by] or raise "missing field to group by"
9
- maxon = opts[:max] or raise "missing field to max on"
8
+ @groupby = opts[:by] or raise "missing field to group by"
9
+ @maxon = opts[:max] or raise "missing field to max on"
10
+ end
11
+
12
+ def maxrows(opts)
13
+ configure(opts)
10
14
 
11
15
  @output_count = 0
12
16
  headers = nil
@@ -16,8 +20,8 @@ class Masticate::MaxRows < Masticate::Base
16
20
  row = CSV.parse_line(line, csv_options)
17
21
  if !headers
18
22
  headers = row
19
- index_by = headers.index(groupby) or raise "Unable to find column '#{groupby}'"
20
- index_max = headers.index(maxon) or raise "Unable to find column '#{maxon}'"
23
+ index_by = headers.index(@groupby) or raise "Unable to find column '#{@groupby}'"
24
+ index_max = headers.index(@maxon) or raise "Unable to find column '#{@maxon}'"
21
25
  emit(line)
22
26
  else
23
27
  key = row[index_by]
@@ -45,4 +49,30 @@ class Masticate::MaxRows < Masticate::Base
45
49
  :output_count => @output_count
46
50
  }
47
51
  end
52
+
53
# Processes one parsed row when this object is used as a recipe step.
#
# * First call: the row is stored as the headers, the positions of the
#   group-by and max columns are looked up, and the row is returned.
# * nil row: every row currently held in @accum is emitted.
# * Any other row: it is stored under its group key when the group is new or
#   when its max-column value is greater than the stored row's; in those
#   cases the row is returned, otherwise nil.
#
# @param row [Array, nil] parsed fields, or nil to output the held rows
# @raise [RuntimeError] when a configured column is missing from the headers
def crunch(row)
  unless @headers
    @headers = row
    @index_by = row.index(@groupby)
    raise "Unable to find column '#{@groupby}'" unless @index_by
    @index_max = row.index(@maxon)
    raise "Unable to find column '#{@maxon}'" unless @index_max
    @accum = {}
    return row
  end

  # output the accumulated results
  return @accum.each { |_key, held| emit(held.to_csv) } if row.nil?

  group = row[@index_by]
  leader = @accum[group]
  return @accum[group] = row unless leader

  @accum[group] = row if row[@index_max] > leader[@index_max]
end
48
78
  end
@@ -0,0 +1,163 @@
1
+ require "optparse"
2
+
3
# Command-line front end for masticate.
#
# Parses "<command> [options] [filenames]" and then either runs the command
# (#execute) or builds an object for it that can be fed rows one at a time
# (#prepare).
class Masticate::MyOptionParser
  attr_reader :command, :options

  # Builds the option parser; recognised switches are collected in @options.
  def initialize
    @options = {}
    @parser = OptionParser.new do |opts|
      opts.banner = "Usage: masticate [command] [options]"

      opts.on("--output FILENAME", String, "Redirect output from stdout to file") do |f|
        @options[:output] = f
      end

      opts.on("--format FORMAT", String, "Specify format") do |v|
        @options[:format] = v
      end

      opts.on("--delim DELIMITER", String, "Specify field delimiter (character or TAB; default is ',')") do |v|
        # The word TAB stands for a literal tab character.
        @options[:col_sep] = (v == "TAB" ? "\t" : v)
      end

      opts.on("--quote QUOTE-CHAR", String, "Specify character used for quoting fields (optional; default is no quoting)") do |char|
        @options[:quote_char] = char
      end

      opts.on("--stats", "(for *sniff*) collect & display input stats") do
        @options[:stats] = true
      end

      opts.on("--fields LIST", Array, "Specify fields to select") do |list|
        @options[:fields] = list
      end

      opts.on("--field FIELD", String, "Specify field to convert") do |f|
        @options[:field] = f
      end

      opts.on("--snip DIRECTIVE", String, "Specify header fields to snip: first N, or by name") do |f|
        @options[:snip] = f.to_i
      end

      opts.on("--from REGEXP", String, "Regular expression for gsub conversion") do |s|
        @options[:from] = s
      end

      # if I specify String here, then a blank string '' is considered invalid and triggers an exception.
      opts.on("--to STRING", "Result string for gsub conversion") do |s|
        @options[:to] = s
      end

      opts.on("--inlined", "(for *mend* only) Source file has headers inlined on each line") do
        @options[:inlined] = true
      end

      opts.on("--dejunk", "(for *mend* only) Expunge junk lines from source") do
        @options[:dejunk] = true
      end

      opts.on("--by FIELD", String, "(for *maxrows* only) Field to group by") do |f|
        @options[:by] = f
      end

      opts.on("--max FIELD", String, "(for *maxrows* only) Field to find max value for") do |f|
        @options[:max] = f
      end

      opts.on("--recipe FILENAME", String, "(*cook* only) Recipe file") do |f|
        @options[:recipe] = f
      end
    end
  end

  # Splits an argument list into command, options and filenames.
  #
  # The command is shifted off argv (so argv is modified); the switches stay
  # in argv because the non-destructive OptionParser#parse is used.
  #
  # @param argv [Array<String>] arguments, command first
  # @return [Array] [command, options hash, remaining non-option arguments]
  def parse(argv = ARGV)
    @command = argv.shift
    # argv remnants are filenames
    filenames = @parser.parse(argv)
    [@command, @options, filenames]
  end

  # Builds the object that implements a command, configured from options.
  #
  # @param command [String] one of gsub, datify, maxrows, relabel, pluck
  # @param options [Hash] options as produced by #parse
  # @return [Object] a new instance of the command's class
  # @raise [RuntimeError] when the command has no class in the table
  def prepare(command, options)
    # Looked up at call time: these classes are loaded after this file.
    klasses = {
      'gsub' => Masticate::Gsubber,
      'datify' => Masticate::Datify,
      'maxrows' => Masticate::MaxRows,
      'relabel' => Masticate::Relabel,
      'pluck' => Masticate::Plucker
    }

    klass = klasses[command] or raise "unsupported command for prepare: #{command}"
    klass.new(options)
  end

  # Runs a command and reports on it to standard error.
  #
  # @param command [String] name of the command
  # @param options [Hash] options as produced by #parse
  # @param filenames [Array<String>, nil] input files; all of them are used
  #   by concat, every other command uses only the first (nil when none given)
  # @raise [RuntimeError] for an unknown command
  def execute(command, options, filenames = nil)
    filenames = Array(filenames) # tolerate the nil default
    filename = filenames.first

    case command
    when 'sniff'
      results = Masticate.sniff(filename, options)
      col_sep = results[:col_sep]
      col_sep = "TAB" if col_sep == "\t"
      quote_char = results[:quote_char] || "NONE"
      $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{col_sep}
Quote char: #{quote_char}
Field counts: #{results[:field_counts].inspect}
Headers: #{results[:headers].join(',')}
      EOT

    when 'mend', 'csvify', 'pluck', 'datify', 'gsub', 'maxrows', 'cook'
      # Each of these maps onto the Masticate module method of the same name.
      results = Masticate.public_send(command, filename, options)
      logmessage(command, options, results)

    when 'concat'
      # Use the parsed filenames: ARGV still contains the option switches.
      Masticate.concat(filenames, options)

    when 'relabel'
      Masticate.relabel(filename, options)

    else
      raise "unknown command #{command}"
    end
  end

  # Prints a summary of a finished command to standard error.
  #
  # @param command [String] name of the command that ran
  # @param options [Hash] options it ran with (only the keys are shown)
  # @param results [Hash] reads :input_count, :output_count and, when
  #   present, :field_counts
  def logmessage(command, options, results)
    $stderr.puts <<-EOT
* masticate #{command} (#{options.keys.join(', ')})
Lines in input: #{results[:input_count]}
Lines in output: #{results[:output_count]}
    EOT
    if results[:field_counts]
      $stderr.puts " Field counts: #{results[:field_counts].inspect}"
    end
  end
end
@@ -2,10 +2,17 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Plucker < Masticate::Base
5
- def pluck(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
7
 
8
- fields = opts[:fields] or raise "missing fields to pluck"
8
+ @fields = opts[:fields] or raise "missing fields to pluck"
9
+ end
10
+
11
+ def pluck(opts)
12
+ configure(opts)
13
+ # standard_options(opts)
14
+ #
15
+ # fields = opts[:fields] or raise "missing fields to pluck"
9
16
 
10
17
  @output_count = 0
11
18
  headers = nil
@@ -14,7 +21,7 @@ class Masticate::Plucker < Masticate::Base
14
21
  row = CSV.parse_line(line, csv_options)
15
22
  if !headers
16
23
  headers = row
17
- indexes = fields.map do |f|
24
+ indexes = @fields.map do |f|
18
25
  case f
19
26
  when String
20
27
  headers.index(f) or raise "Unable to find column '#{f}'"
@@ -41,4 +48,27 @@ class Masticate::Plucker < Masticate::Base
41
48
  :output_count => @output_count
42
49
  }
43
50
  end
51
+
52
# Processes one parsed row when this object is used as a recipe step.
#
# On the first call the row is taken as the header row and each entry of
# @fields is resolved to a column position: a String is looked up by name, an
# Integer is a 1-based column number. Every call, including the first,
# returns just the selected columns of the row.
#
# @param row [Array] parsed fields of one line
# @return [Array] the selected values, in @fields order
# @raise [RuntimeError] for an unknown column name, a column number outside
#   1..row.count, or a descriptor that is neither String nor Integer
def crunch(row)
  unless @headers
    @headers = row
    @indexes = @fields.map do |f|
      case f
      when String
        row.index(f) or raise "Unable to find column '#{f}'"
      when Integer # Fixnum is gone from current Ruby; Integer covers it
        if f > row.count
          raise "Cannot pluck column #{f}, there are only #{row.count} fields"
        elsif f < 1
          # 0 or a negative number would otherwise count from the end of the row
          raise "Invalid field descriptor '#{f}'"
        else
          f - 1
        end
      else
        raise "Invalid field descriptor '#{f}'"
      end
    end
  end

  # output is just the selected columns
  @indexes.map { |i| row[i] }
end
44
74
  end
@@ -0,0 +1,44 @@
1
+ # relabel a single input file
2
+ # * assuming that input file has a single header line
3
+ # * assuming that input file is in valid CSV format (no validation)
4
+
5
# Replaces the header line of a single input file with new field names.
#
# The input is assumed to have exactly one header line and to be valid CSV;
# neither is validated.
class Masticate::Relabel < Masticate::Base
  # @param opts [Hash] passed to standard_options; :fields holds the new names
  # @raise [RuntimeError] when :fields is missing
  def configure(opts)
    standard_options(opts)

    @fields = opts[:fields] or raise "missing fieldnames for relabel"
  end

  # Emits the new header in place of the first input line and re-emits every
  # other line as CSV.
  #
  # @param opts [Hash] see #configure; @output is closed when :output is set
  # @return [Hash] :input_count and :output_count
  def relabel(opts)
    configure(opts)

    @output_count = 0
    header_written = false
    with_input do |input|
      while line = get
        row = CSV.parse_line(line, csv_options)
        if header_written
          emit(row.to_csv)
        else
          # The original header line is dropped; the new names take its place.
          emit(@fields.to_csv)
          header_written = true
        end
      end
    end
    @output.close if opts[:output]

    {
      :input_count => @input_count,
      :output_count => @output_count
    }
  end

  # Recipe-step form: the first row seen is swapped for the new field names,
  # every later row is returned as is.
  #
  # @param row [Array] parsed fields of one line
  # @return [Array] @fields on the first call, otherwise row
  def crunch(row)
    return row if @headers

    @headers = @fields
  end
end
@@ -4,7 +4,7 @@ class Masticate::Sniffer < Masticate::Base
4
4
  attr_reader :col_sep, :quote_char, :stats
5
5
  attr_reader :delimstats
6
6
 
7
- CandidateDelimiters = [',', '|', "\t"]
7
+ CandidateDelimiters = [',', '|', "\t", "~"]
8
8
 
9
9
  def initialize(filename)
10
10
  @filename = filename
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.1.5"
2
+ VERSION = "0.2"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  require "open-uri"
2
+ require "csv"
2
3
 
3
4
  require_relative "masticate/version"
4
5
  require_relative "masticate/base"
6
+ require_relative "masticate/myoptparse"
7
+
5
8
  require_relative "masticate/sniffer"
6
9
  require_relative "masticate/mender"
7
10
  require_relative "masticate/csvify"
@@ -9,6 +12,9 @@ require_relative "masticate/plucker"
9
12
  require_relative "masticate/datify"
10
13
  require_relative "masticate/gsubber"
11
14
  require_relative "masticate/max_rows"
15
+ require_relative "masticate/concat"
16
+ require_relative "masticate/relabel"
17
+ require_relative "masticate/cook"
12
18
 
13
19
  module Masticate
14
20
  def self.sniff(filename, opts = {})
@@ -38,4 +44,16 @@ module Masticate
38
44
  def self.maxrows(filename, opts)
39
45
  MaxRows.new(filename).maxrows(opts)
40
46
  end
47
+
48
+ def self.concat(filenames, opts)
49
+ Concat.new(filenames).concat(opts)
50
+ end
51
+
52
+ def self.relabel(filename, opts)
53
+ Relabel.new(filename).relabel(opts)
54
+ end
55
+
56
+ def self.cook(filename, opts)
57
+ Cook.new(filename).cook(opts)
58
+ end
41
59
  end
@@ -0,0 +1,108 @@
1
+ COL1 COL 2 Col 3 col-4 col5 col6
2
+ data data data d a t a data data
3
+ data data data d a t a data data
4
+ data data data d a t a data data
5
+ data data data d a t a data data
6
+ data| data |data |d a t a|data|data
7
+ data| data |data |d a t a|data|data
8
+ data| data |data |d a t a|data,data|data
9
+ data| data |data "more data" |d a t a|data|data
10
+ 1,20120106003230,2044272,L,407,15267,407,201201060140,407,201201060140,0,201201060309,L,"594,756"
11
+ 1,20120106003230,2044277,X,407,15267,381,201201060222,381,201201060222,0,201201060647,X,"594,761"
12
+ 1,20120106003230,2044309,L,407,15267,407,201201060311,407,201201060311,0,201201060339,L,"594,766"
13
+ 1,20120106003230,,Q,407,15267,407,201201060514,108,201201060515,108,201201060515,SEC,"594,787"
14
+ 1,20120106024355,,Q,407,15267,407,201201060309,90,201201060316,90,201201060316,IV,"594,764"
15
+ 1,20120106024355,2044306,L,407,15267,407,201201060309,407,201201060309,0,201201060345,L,"594,763"
16
+ 1,20120106024355,2044308,X,407,15267,407,201201060310,407,201201060310,0,201201060556,X,"594,765"
17
+ 1,20120106024355,2044307,L,407,15267,407,201201060309,407,201201060309,0,201201060333,L,"594,762"
18
+ 1,20120106024355,,Q,407,15267,407,201201060520,108,201201060522,108,201201060522,SEC,"594,789"
19
+ 1,20120106024355,2044579,L,407,15267,68,201201060826,68,201201060826,0,201201071149,L,"594,823"
20
+ 1,20120106032719,2044345,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,775"
21
+ 1,20120106032719,2044344,L,407,15267,407,201201060348,407,201201060348,0,201201060442,L,"594,777"
22
+ 1,20120106032719,2044343,L,407,15267,407,201201060348,407,201201060348,0,201201060428,L,"594,773"
23
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060408,426,201201060408,IV,"594,774"
24
+ 1,20120106032719,,Q,407,15267,407,201201060348,426,201201060634,426,201201060634,URINE,"594,776"
25
+ 1,20120106032719,2044386,L,407,15267,407,201201060445,407,201201060445,0,201201060519,L,"594,785"
26
+ 1,20120106032719,2044401,X,407,15267,407,201201060521,407,201201060521,0,201201060646,X,"594,790"
27
+ 1,20120106033235,,Q,407,15267,407,201201060347,74,201201060353,74,201201060353,IV,"594,769"
28
+ 1,20120106033235,2044349,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,771"
29
+ 1,20120106033235,2044350,L,407,15267,407,201201060347,74,201201060353,0,201201060434,URINE,"594,770"
30
+ 1,20120106033235,2044347,L,407,15267,407,201201060347,74,201201060353,0,201201060428,L,"594,768"
31
+ 1,20120106033235,2044348,L,407,15267,407,201201060347,74,201201060353,0,201201060443,L,"594,772"
32
+ 1,20120106033235,2044372,X,407,15267,407,201201060429,407,201201060429,0,201201060649,X,"594,780"
33
+ 1,20120106035346,,Q,407,15267,407,201201060446,426,201201060448,426,201201060448,N,"594,786"
34
+ 1,20120106041426,2044383,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,784"
35
+ 1,20120106041426,2044384,L,407,15267,407,201201060445,407,201201060445,0,201201060657,L,"594,782"
36
+ 1,20120106041426,2044382,L,407,15267,407,201201060445,407,201201060445,0,201201060522,L,"594,781"
37
+ 1,20120106041426,,Q,407,15267,407,201201060445,381,201201060452,381,201201060452,IV,"594,783"
38
+ 1,20120106043025,2044400,X,407,15267,407,201201060515,407,201201060515,0,201201060554,X,"594,788"
39
+ 1,20120106045326,2044411,R,407,15267,407,201201060535,407,201201060535,0,201201060630,RS,"594,791"
40
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060540,108,201201060540,SEC,"594,794"
41
+ 1,20120106045326,2044412,R,407,15267,407,201201060535,407,201201060535,0,201201060629,RS,"594,795"
42
+ 1,20120106045326,2044413,X,407,15267,407,201201060536,407,201201060536,0,201201060649,X,"594,796"
43
+ 1,20120106045326,,Q,407,15267,407,201201060535,108,201201060541,108,201201060541,SEC,"594,792"
44
+ 1,20120106045326,2044410,R,407,15267,407,201201060535,407,201201060535,0,201201060628,RS,"594,793"
45
+ 1,20120106052714,2044421,L,407,15267,407,201201060544,407,201201060544,0,201201060605,L,"594,797"
46
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,IV,"594,799"
47
+ 1,20120106052714,,Q,407,15267,407,201201060544,90,201201060545,90,201201060545,N,"594,800"
48
+ 1,20120106052714,2044422,L,407,15267,407,201201060544,407,201201060544,0,201201060621,L,"594,801"
49
+ 1,20120106052714,2044423,L,407,15267,407,201201060544,407,201201060544,0,201201060727,L,"594,798"
50
+ 1,20120106052714,2044424,L,407,15267,407,201201060551,407,201201060551,0,201201060714,L,"594,802"
51
+ 1,20120106070243,2044439,L,504,15550,504,201201060721,504,201201060721,0,201201060753,L,"594,803"
52
+ 1,20120106070243,2044440,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,807"
53
+ 1,20120106070243,2044441,L,504,15550,504,201201060721,504,201201060721,0,201201060748,L,"594,806"
54
+ 1,20120106070243,,Q,504,15550,504,201201060721,155,201201060735,155,201201060735,IV,"594,805"
55
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,820"
56
+ 1,20120106070243,2044524,L,504,15550,504,201201060806,504,201201060806,0,201201061004,L,"594,816"
57
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060813,195,201201060813,SEC,"594,822"
58
+ 1,20120106070243,2044522,L,504,15550,504,201201060806,504,201201060806,0,201201060959,L,"594,819"
59
+ 1,20120106070243,,Q,504,15550,504,201201060807,195,201201060811,195,201201060811,SEC,"594,821"
60
+ 1,20120106070243,,Q,504,15550,504,201201060806,155,201201060813,155,201201060813,N,"594,818"
61
+ 1,20120106070243,,Q,504,15550,504,201201060910,155,201201060916,155,201201060916,N,"594,831"
62
+ 1,20120106070243,2044716,X,504,15550,504,201201060928,504,201201060928,0,201201060953,X,"594,834"
63
+ 1,20120106073142,2044480,X,504,15550,504,201201060757,504,201201060757,0,201201060819,X,"594,815"
64
+ 1,20120106073757,2044475,L,504,15550,504,201201060749,155,201201060755,0,201201060925,URINE,"594,810"
65
+ 1,20120106073757,2044466,L,504,15550,504,201201060749,504,201201060749,0,201201060827,L,"594,808"
66
+ 1,20120106073757,2044470,X,504,15550,504,201201060749,504,201201060749,0,201201060818,X,"594,809"
67
+ 1,20120106073757,2044467,L,504,15550,504,201201060749,504,201201060749,0,201201060826,L,"594,813"
68
+ 1,20120106073757,2044468,L,504,15550,504,201201060749,504,201201060749,0,201201060839,L,"594,811"
69
+ 1,20120106073757,2044469,L,504,15550,504,201201060749,504,201201060749,0,201201060825,L,"594,814"
70
+ 1,20120106073757,,Q,504,15550,504,201201060749,155,201201060755,155,201201060755,IV,"594,812"
71
+ 1,20120106073757,,Q,504,15550,504,201201060911,76,201201060933,76,201201060933,IV,"594,832"
72
+ 1,20120106073757,,Q,504,15550,504,201201060928,34,201201060934,34,201201060934,SEC,"594,833"
73
+ 1,20120106073757,,Q,504,15550,504,201201061022,155,201201061108,155,201201061108,IV,"594,862"
74
+ 1,20120106073757,,Q,504,15550,504,201201061019,155,201201061025,155,201201061025,IV,"594,861"
75
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,896"
76
+ 1,20120106073757,,Q,504,15550,504,201201061131,195,201201061133,195,201201061133,SEC,"594,895"
77
+ 1,20120106073757,2045028,X,504,15550,504,201201061131,504,201201061131,0,201201061209,X,"594,898"
78
+ 1,20120106073757,2045029,X,504,15550,504,201201061131,504,201201061131,0,201201061345,X,"594,897"
79
+ 1,20120106073757,,Q,504,15550,504,201201061131,155,201201061223,155,201201061223,N,"594,894"
80
+ 1,20120106084347,2044639,X,504,15550,76,201201060850,76,201201060850,0,201201060931,X,"594,828"
81
+ 1,20120106084720,2044670,X,55,4644,55,201201060909,55,201201060909,0,201201060934,X,"594,829"
82
+ 1,20120106084720,,Q,55,4644,55,201201060910,66,201201060914,66,201201060914,N,"594,830"
83
+ 1,20120106085558,2044755,L,55,4644,55,201201060949,55,201201060949,0,201201061018,L,"594,846"
84
+ 1,20120106085558,2044756,L,55,4644,55,201201060949,55,201201060949,0,201201061038,L,"594,851"
85
+ 1,20120106085558,2044793,L,55,4644,55,201201060949,76,201201061003,0,201201061239,URINE,"594,848"
86
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,850"
87
+ 1,20120106085558,,Q,55,4644,55,201201060949,76,201201061003,76,201201061003,IV,"594,847"
88
+ 1,20120106085558,2044757,L,55,4644,55,201201060949,55,201201060949,0,201201061040,L,"594,849"
89
+ 1,20120106085558,2044843,L,55,4644,55,201201061033,55,201201061033,0,201201071505,L,"594,864"
90
+ 1,20120106085558,2044841,X,55,4644,55,201201061032,55,201201061032,0,201201061136,X,"594,863"
91
+ 1,20120106085558,2044844,L,55,4644,55,201201061033,55,201201061033,0,201201061119,L,"594,865"
92
+ 1,20120106085558,,Q,55,4644,55,201201061228,195,201201061240,195,201201061240,SEC,"594,961"
93
+ 1,20120106091726,2044741,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,839"
94
+ 1,20120106091726,2044745,X,504,15550,504,201201060942,504,201201060942,0,201201061016,X,"594,835"
95
+ 1,20120106091726,2044746,L,504,15550,504,201201060942,504,201201060942,0,201201061107,L,"594,842"
96
+ 1,20120106091726,2044740,L,504,15550,504,201201060942,504,201201060942,0,201201061017,L,"594,836"
97
+ 1,20120106091726,2044744,L,504,15550,504,201201060942,504,201201060942,0,201201061024,L,"594,838"
98
+ 1,20120106091726,2044742,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,841"
99
+ 1,20120106091726,,Q,504,15550,504,201201060942,66,201201060944,66,201201060944,IV,"594,837"
100
+ 1,20120106091726,2044743,L,504,15550,504,201201060942,504,201201060942,0,201201061016,L,"594,840"
101
+ 1,20120106095129,2044814,X,55,4644,55,201201061010,55,201201061010,0,201201061037,X,"594,853"
102
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,IV,"594,857"
103
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,858"
104
+ 1,20120106100014,,Q,504,15550,504,201201061011,885,201201061037,885,201201061037,N,"594,859"
105
+ 1,20120106100014,2044815,L,504,15550,504,201201061011,504,201201061011,0,201201061023,L,"594,854"
106
+ 1,20120106100014,2044817,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,856"
107
+ 1,20120106100014,2044818,X,504,15550,504,201201061011,504,201201061011,0,201201061038,X,"594,855"
108
+ 1,20120106100014,2044816,L,504,15550,504,201201061011,504,201201061011,0,201201061049,L,"594,860"
@@ -0,0 +1,41 @@
1
+ two,eight,fourteen
2
+ 20120106003230,201201060140,594756
3
+ 20120106003230,201201060222,594761
4
+ 20120106003230,201201060311,594766
5
+ 20120106003230,201201060514,594787
6
+ 20120106024355,201201060309,594764
7
+ 20120106024355,201201060310,594765
8
+ 20120106024355,201201060520,594789
9
+ 20120106024355,201201060826,594823
10
+ 20120106032719,201201060348,594775
11
+ 20120106032719,201201060445,594785
12
+ 20120106032719,201201060521,594790
13
+ 20120106033235,201201060347,594769
14
+ 20120106033235,201201060429,594780
15
+ 20120106035346,201201060446,594786
16
+ 20120106041426,201201060445,594784
17
+ 20120106043025,201201060515,594788
18
+ 20120106045326,201201060535,594791
19
+ 20120106045326,201201060536,594796
20
+ 20120106052714,201201060544,594797
21
+ 20120106052714,201201060551,594802
22
+ 20120106070243,201201060721,594803
23
+ 20120106070243,201201060806,594820
24
+ 20120106070243,201201060807,594822
25
+ 20120106070243,201201060910,594831
26
+ 20120106070243,201201060928,594834
27
+ 20120106073142,201201060757,594815
28
+ 20120106073757,201201060749,594810
29
+ 20120106073757,201201060911,594832
30
+ 20120106073757,201201060928,594833
31
+ 20120106073757,201201061022,594862
32
+ 20120106073757,201201061131,594896
33
+ 20120106084347,201201060850,594828
34
+ 20120106084720,201201060909,594829
35
+ 20120106084720,201201060910,594830
36
+ 20120106085558,201201060949,594846
37
+ 20120106085558,201201061033,594864
38
+ 20120106085558,201201061228,594961
39
+ 20120106091726,201201060942,594839
40
+ 20120106095129,201201061010,594853
41
+ 20120106100014,201201061011,594857
@@ -0,0 +1,4 @@
1
+ gsub --field order_number --from ',|(.00$)' --to ''
2
+ relabel --fields one,two,three,four,five,six,seven,eight,nine,ten,eleven,twelve,thirteen,fourteen
3
+ pluck --fields two,eight,fourteen
4
+ maxrows --by two --max eight
@@ -0,0 +1,4 @@
1
+ happy,birth,day,to,you
2
+ data1,data2,data3,data4,data5
3
+ 111,22,333,44,555
4
+ 91,92,93,94,95
@@ -0,0 +1,5 @@
1
+ COL1~COL 2~Col 3 ~col-4~ col5 ~col6
2
+ data~ data ~data ~d a t a~data~data
3
+ data~ data ~data ~d a t a~data~data
4
+ data~ data ~data ~d a t a~data,data~data
5
+ data~ data ~data "more data" ~d a t a~data~data
@@ -0,0 +1,17 @@
1
+ # spec for file concatenation
2
+
3
+ require "spec_helper"
4
+
5
+ describe "concatenation" do
6
+ it "should leave just one header row in the result" do
7
+ file1 = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
8
+ file2 = File.dirname(__FILE__) + "/../data/pipe_data.txt"
9
+ file3 = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
10
+ tmp = Tempfile.new('concat')
11
+ results = Masticate.concat([file1, file2, file3], :output => tmp.path)
12
+ output = File.read(tmp)
13
+ tmp.unlink
14
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/concat_result.txt")
15
+ output.should == correct_output
16
+ end
17
+ end
@@ -0,0 +1,16 @@
1
+ # spec for cookery
2
+
3
+ require "spec_helper"
4
+
5
+ describe "cooking up a recipe" do
6
+ it "should handle multiple steps" do
7
+ input = File.dirname(__FILE__) + "/../data/quoted_csv_data.txt"
8
+ recipe = File.dirname(__FILE__) + "/../data/recipe.txt"
9
+ tmp = Tempfile.new('cooked')
10
+ results = Masticate.cook(input, :output => tmp, :recipe => recipe)
11
+ output = File.read(tmp)
12
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/cooking_result.csv")
13
+
14
+ output.should == correct_output
15
+ end
16
+ end
@@ -1,7 +1,6 @@
1
1
  # spec for field regexp conversion
2
2
 
3
3
  require "spec_helper"
4
- require "tempfile"
5
4
 
6
5
  describe "gsubbing" do
7
6
  it "should apply conversion to a single column" do
@@ -0,0 +1,15 @@
1
+ # spec for file concatenation
2
+
3
+ require "spec_helper"
4
+
5
+ describe "relabeling" do
6
+ it "result should be same as original" do
7
+ input = File.dirname(__FILE__) + "/../data/namedcols.csv"
8
+ tmp = Tempfile.new('relabel')
9
+ results = Masticate.relabel(input, :fields => %w{happy birth day to you}, :output => tmp.path)
10
+ output = File.read(tmp)
11
+ tmp.unlink
12
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/relabel_result.csv")
13
+ output.should == correct_output
14
+ end
15
+ end
@@ -31,4 +31,11 @@ describe "delimiter sniffing" do
31
31
  results[:quote_char].should == '"'
32
32
  results[:field_counts].should == {14 => 100}
33
33
  end
34
+
35
+ it "should find tilde delimiter" do
36
+ filename = File.dirname(__FILE__) + "/../data/tilde_data.txt"
37
+ results = Masticate.sniff(filename, :stats => true)
38
+ results[:col_sep].should == '~'
39
+ results[:field_counts].should == {6 => 5}
40
+ end
34
41
  end
data/spec/spec_helper.rb CHANGED
@@ -2,6 +2,7 @@
2
2
  # require 'rspec/rails'
3
3
  require 'rspec/autorun'
4
4
  # require 'capybara/rspec'
5
+ require "tempfile"
5
6
 
6
7
  # Requires supporting ruby files with custom matchers and macros, etc,
7
8
  # in spec/support/ and its subdirectories.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-16 00:00:00.000000000 Z
12
+ date: 2012-04-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2156618420 !ruby/object:Gem::Requirement
16
+ requirement: &2153649040 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2156618420
24
+ version_requirements: *2153649040
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2156617660 !ruby/object:Gem::Requirement
27
+ requirement: &2153648360 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2156617660
35
+ version_requirements: *2153648360
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2156616980 !ruby/object:Gem::Requirement
38
+ requirement: &2153647700 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2156616980
46
+ version_requirements: *2153647700
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com
@@ -61,18 +61,24 @@ files:
61
61
  - bin/masticate
62
62
  - lib/masticate.rb
63
63
  - lib/masticate/base.rb
64
+ - lib/masticate/concat.rb
65
+ - lib/masticate/cook.rb
64
66
  - lib/masticate/csvify.rb
65
67
  - lib/masticate/datify.rb
66
68
  - lib/masticate/gsubber.rb
67
69
  - lib/masticate/max_rows.rb
68
70
  - lib/masticate/mender.rb
71
+ - lib/masticate/myoptparse.rb
69
72
  - lib/masticate/plucker.rb
73
+ - lib/masticate/relabel.rb
70
74
  - lib/masticate/sniffer.rb
71
75
  - lib/masticate/version.rb
72
76
  - masticate.gemspec
73
77
  - spec/data/badnums.csv
74
78
  - spec/data/badnums_fixed.csv
75
79
  - spec/data/broken_psv.txt
80
+ - spec/data/concat_result.txt
81
+ - spec/data/cooking_result.csv
76
82
  - spec/data/events.csv
77
83
  - spec/data/events_reduced.csv
78
84
  - spec/data/inlined_headers.csv
@@ -83,12 +89,18 @@ files:
83
89
  - spec/data/namedcols.csv.output
84
90
  - spec/data/pipe_data.txt
85
91
  - spec/data/quoted_csv_data.txt
92
+ - spec/data/recipe.txt
93
+ - spec/data/relabel_result.csv
86
94
  - spec/data/tabbed_data.txt
95
+ - spec/data/tilde_data.txt
96
+ - spec/lib/concat_spec.rb
97
+ - spec/lib/cook_spec.rb
87
98
  - spec/lib/csvify_spec.rb
88
99
  - spec/lib/gsub_spec.rb
89
100
  - spec/lib/maxrow_spec.rb
90
101
  - spec/lib/mender_spec.rb
91
102
  - spec/lib/plucker_spec.rb
103
+ - spec/lib/relabel_spec.rb
92
104
  - spec/lib/sniffer_spec.rb
93
105
  - spec/spec_helper.rb
94
106
  homepage: ''
@@ -119,6 +131,8 @@ test_files:
119
131
  - spec/data/badnums.csv
120
132
  - spec/data/badnums_fixed.csv
121
133
  - spec/data/broken_psv.txt
134
+ - spec/data/concat_result.txt
135
+ - spec/data/cooking_result.csv
122
136
  - spec/data/events.csv
123
137
  - spec/data/events_reduced.csv
124
138
  - spec/data/inlined_headers.csv
@@ -129,12 +143,18 @@ test_files:
129
143
  - spec/data/namedcols.csv.output
130
144
  - spec/data/pipe_data.txt
131
145
  - spec/data/quoted_csv_data.txt
146
+ - spec/data/recipe.txt
147
+ - spec/data/relabel_result.csv
132
148
  - spec/data/tabbed_data.txt
149
+ - spec/data/tilde_data.txt
150
+ - spec/lib/concat_spec.rb
151
+ - spec/lib/cook_spec.rb
133
152
  - spec/lib/csvify_spec.rb
134
153
  - spec/lib/gsub_spec.rb
135
154
  - spec/lib/maxrow_spec.rb
136
155
  - spec/lib/mender_spec.rb
137
156
  - spec/lib/plucker_spec.rb
157
+ - spec/lib/relabel_spec.rb
138
158
  - spec/lib/sniffer_spec.rb
139
159
  - spec/spec_helper.rb
140
160
  has_rdoc: