masticate 0.2 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -30,9 +30,15 @@ class Masticate::Base
30
30
  end
31
31
 
32
32
  def emit(line)
33
+ @output_count ||= 0
33
34
  @output_count += 1
34
35
  begin
35
- @output.puts line
36
+ case line
37
+ when Array
38
+ @output.puts line.to_csv
39
+ else
40
+ @output.puts line
41
+ end
36
42
  rescue Errno::EPIPE
37
43
  # output was closed, e.g. ran piped into `head`
38
44
  # silently ignore this condition, it's not fatal and doesn't need a warning
@@ -48,7 +54,23 @@ class Masticate::Base
48
54
  end
49
55
  end
50
56
 
51
- # def crunch(row)
52
- # # noop
53
- # end
57
+ def execute(opts)
58
+ configure(opts)
59
+
60
+ @output_count = 0
61
+ with_input do |input|
62
+ while line = get
63
+ row = CSV.parse_line(line, csv_options)
64
+ output = crunch(row)
65
+ emit(output) if output
66
+ end
67
+ end
68
+ crunch(nil) {|row| emit(row)}
69
+ @output.close if opts[:output]
70
+
71
+ {
72
+ :input_count => input_count,
73
+ :output_count => @output_count
74
+ }
75
+ end
54
76
  end
@@ -12,6 +12,8 @@ class Masticate::Cook < Masticate::Base
12
12
  end
13
13
 
14
14
  def cook(opts)
15
+ standard_options(opts)
16
+
15
17
  recipefile = opts[:recipe] or raise "missing recipe for cook"
16
18
  recipe = File.read(recipefile).lines
17
19
  standard_options(opts)
@@ -31,13 +33,16 @@ class Masticate::Cook < Masticate::Base
31
33
  row = CSV.parse_line(line, csv_options)
32
34
 
33
35
  steps.each do |step|
34
- # puts "APPLY #{step} to #{row}"
35
- row = step.crunch(row)
36
+ row = step.crunch(row) if row
36
37
  end
37
38
 
38
- emit(row.to_csv) if row
39
+ emit(row) if row
39
40
  end
40
41
  end
42
+ steps.each do |step|
43
+ step.crunch(nil) {|row| emit(row)}
44
+ end
45
+
41
46
  @output.close if opts[:output]
42
47
 
43
48
  {
@@ -2,32 +2,23 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Datify < Masticate::Base
5
- def datify(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
+ @field = opts[:field] or raise "missing field to datify"
8
+ @format = opts[:format] or raise "strptime format required for parsing timestamps"
9
+ end
7
10
 
8
- field = opts[:field] or raise "missing field to datify"
9
- format = opts[:format] or raise "strptime format required for parsing timestamps"
11
+ def datify(opts)
12
+ execute(opts)
13
+ end
10
14
 
11
- @output_count = 0
12
- headers = nil
13
- with_input do |input|
14
- while line = get
15
- row = CSV.parse_line(line, csv_options)
16
- if !headers
17
- headers = row
18
- index = headers.index(field) or raise "Unable to find column '#{field}'"
19
- emit(headers.to_csv)
20
- else
21
- row[index] = DateTime.strptime(row[index], format).to_time.to_i rescue nil
22
- emit(row.to_csv)
23
- end
24
- end
15
+ def crunch(row)
16
+ if !@index
17
+ @index = row.index(@field) or raise "Unable to find column '#{@field}'"
18
+ elsif row
19
+ ts = DateTime.strptime(row[@index], @format).to_time
20
+ row[@index] = ts.to_i rescue nil
25
21
  end
26
- @output.close if opts[:output]
27
-
28
- {
29
- :input_count => @input_count,
30
- :output_count => @output_count
31
- }
22
+ row
32
23
  end
33
24
  end
@@ -16,36 +16,13 @@ class Masticate::Gsubber < Masticate::Base
16
16
  end
17
17
 
18
18
  def gsub(opts)
19
- configure(opts)
20
- @output_count = 0
21
- headers = nil
22
- with_input do |input|
23
- while line = get
24
- row = CSV.parse_line(line, csv_options)
25
- if !headers
26
- headers = row
27
- index = headers.index(@field) or raise "Unable to find column '#{@field}' in headers"
28
- emit(line)
29
- else
30
- oldval = row[index]
31
- newval = oldval.gsub(@from, @to)
32
- row[index] = newval
33
- emit(row.to_csv)
34
- end
35
- end
36
- end
37
- @output.close if opts[:output]
38
-
39
- {
40
- :input_count => input_count,
41
- :output_count => @output_count
42
- }
19
+ execute(opts)
43
20
  end
44
21
 
45
22
  def crunch(row)
46
23
  if !@headers
47
24
  set_headers(row)
48
- else
25
+ elsif row
49
26
  row[@index] = row[@index].gsub(@from, @to)
50
27
  end
51
28
  row
@@ -10,44 +10,7 @@ class Masticate::MaxRows < Masticate::Base
10
10
  end
11
11
 
12
12
  def maxrows(opts)
13
- configure(opts)
14
-
15
- @output_count = 0
16
- headers = nil
17
- accum = {}
18
- with_input do |input|
19
- while line = get
20
- row = CSV.parse_line(line, csv_options)
21
- if !headers
22
- headers = row
23
- index_by = headers.index(@groupby) or raise "Unable to find column '#{@groupby}'"
24
- index_max = headers.index(@maxon) or raise "Unable to find column '#{@maxon}'"
25
- emit(line)
26
- else
27
- key = row[index_by]
28
- if !accum[key]
29
- accum[key] = row
30
- else
31
- oldscore = accum[key][index_max]
32
- newscore = row[index_max]
33
- if newscore > oldscore
34
- accum[key] = row
35
- end
36
- end
37
- end
38
- end
39
- end
40
-
41
- accum.each do |k,row|
42
- emit(row.to_csv)
43
- end
44
-
45
- @output.close if opts[:output]
46
-
47
- {
48
- :input_count => @input_count,
49
- :output_count => @output_count
50
- }
13
+ execute(opts)
51
14
  end
52
15
 
53
16
  def crunch(row)
@@ -60,7 +23,7 @@ class Masticate::MaxRows < Masticate::Base
60
23
  elsif row.nil?
61
24
  # output the accumulated results
62
25
  @accum.each do |k,row|
63
- emit(row.to_csv)
26
+ yield row
64
27
  end
65
28
  else
66
29
  key = row[@index_by]
@@ -73,6 +36,7 @@ class Masticate::MaxRows < Masticate::Base
73
36
  @accum[key] = row
74
37
  end
75
38
  end
39
+ nil
76
40
  end
77
41
  end
78
42
  end
@@ -9,44 +9,7 @@ class Masticate::Plucker < Masticate::Base
9
9
  end
10
10
 
11
11
  def pluck(opts)
12
- configure(opts)
13
- # standard_options(opts)
14
- #
15
- # fields = opts[:fields] or raise "missing fields to pluck"
16
-
17
- @output_count = 0
18
- headers = nil
19
- with_input do |input|
20
- while line = get
21
- row = CSV.parse_line(line, csv_options)
22
- if !headers
23
- headers = row
24
- indexes = @fields.map do |f|
25
- case f
26
- when String
27
- headers.index(f) or raise "Unable to find column '#{f}'"
28
- when Fixnum
29
- if f > headers.count
30
- raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
31
- else
32
- f-1
33
- end
34
- else
35
- raise "Invalid field descriptor '#{f}'"
36
- end
37
- end
38
- emit(indexes.map {|i| headers[i]}.to_csv)
39
- else
40
- emit(indexes.map {|i| row[i]}.to_csv) if row
41
- end
42
- end
43
- end
44
- @output.close if opts[:output]
45
-
46
- {
47
- :input_count => input_count,
48
- :output_count => @output_count
49
- }
12
+ execute(opts)
50
13
  end
51
14
 
52
15
  def crunch(row)
@@ -66,9 +29,10 @@ class Masticate::Plucker < Masticate::Base
66
29
  raise "Invalid field descriptor '#{f}'"
67
30
  end
68
31
  end
32
+ @indexes.map {|i| row[i]}
33
+ elsif row
34
+ # output is just the selected columns
35
+ @indexes.map {|i| row[i]}
69
36
  end
70
-
71
- # output is just the selected columns
72
- @indexes.map {|i| row[i]}
73
37
  end
74
38
  end
@@ -10,28 +10,7 @@ class Masticate::Relabel < Masticate::Base
10
10
  end
11
11
 
12
12
  def relabel(opts)
13
- configure(opts)
14
-
15
- @output_count = 0
16
- headers = nil
17
- with_input do |input|
18
- while line = get
19
- row = CSV.parse_line(line, csv_options)
20
- if !headers
21
- headers = @fields
22
- emit(headers.to_csv)
23
- else
24
- emit(row.to_csv)
25
- end
26
- end
27
- end
28
- @output.close if opts[:output]
29
-
30
- # File.unlink(opts[:output]) if opts[:output] && File.exists?(opts[:output])
31
- # redirect = ">>#{opts[:output]}" if opts[:output]
32
- #
33
- # system "/bin/echo -n '#{fields.to_csv}' #{redirect}"
34
- # system "tail +2 #{@filename} #{redirect}"
13
+ execute(opts)
35
14
  end
36
15
 
37
16
  def crunch(row)
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.2"
2
+ VERSION = "0.2.1"
3
3
  end
@@ -1,40 +1,18 @@
1
1
  two,eight,fourteen
2
- 20120106003230,201201060140,594756
3
- 20120106003230,201201060222,594761
4
- 20120106003230,201201060311,594766
5
2
  20120106003230,201201060514,594787
6
- 20120106024355,201201060309,594764
7
- 20120106024355,201201060310,594765
8
- 20120106024355,201201060520,594789
9
3
  20120106024355,201201060826,594823
10
- 20120106032719,201201060348,594775
11
- 20120106032719,201201060445,594785
12
4
  20120106032719,201201060521,594790
13
- 20120106033235,201201060347,594769
14
5
  20120106033235,201201060429,594780
15
6
  20120106035346,201201060446,594786
16
7
  20120106041426,201201060445,594784
17
8
  20120106043025,201201060515,594788
18
- 20120106045326,201201060535,594791
19
9
  20120106045326,201201060536,594796
20
- 20120106052714,201201060544,594797
21
10
  20120106052714,201201060551,594802
22
- 20120106070243,201201060721,594803
23
- 20120106070243,201201060806,594820
24
- 20120106070243,201201060807,594822
25
- 20120106070243,201201060910,594831
26
11
  20120106070243,201201060928,594834
27
12
  20120106073142,201201060757,594815
28
- 20120106073757,201201060749,594810
29
- 20120106073757,201201060911,594832
30
- 20120106073757,201201060928,594833
31
- 20120106073757,201201061022,594862
32
13
  20120106073757,201201061131,594896
33
14
  20120106084347,201201060850,594828
34
- 20120106084720,201201060909,594829
35
15
  20120106084720,201201060910,594830
36
- 20120106085558,201201060949,594846
37
- 20120106085558,201201061033,594864
38
16
  20120106085558,201201061228,594961
39
17
  20120106091726,201201060942,594839
40
18
  20120106095129,201201061010,594853
@@ -0,0 +1,20 @@
1
+ ChangeDTTM|AuditEventID|AuditEventCode|AuditEventTypeID|AuditEventName|AuditEventDesc|AuditByID
2
+ 12/31/2011 3:41:54PM|57,314,552.00|LOGGEDOUT|30|Logged out|Logged out|4,426.00
3
+ 1/8/2012 9:21:23AM|57,486,988.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
4
+ 1/8/2012 9:21:24AM|57,486,989.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
5
+ 2/19/2012 6:08:59PM|58,597,521.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
6
+ 2/19/2012 7:02:50PM|58,597,816.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
7
+ 2/19/2012 7:02:50PM|58,597,817.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
8
+ 2/4/2012 9:29:20AM|58,201,259.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
9
+ 2/4/2012 12:19:30AM|58,196,973.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
10
+ 2/4/2012 12:20:53AM|58,196,975.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
11
+ 2/3/2012 10:44:25PM|58,196,507.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
12
+ 1/20/2012 8:01:07PM|57,833,496.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
13
+ 1/22/2012 4:20:48AM|57,845,095.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
14
+ 1/22/2012 4:20:49AM|57,845,096.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
15
+ 1/14/2012 7:50:40AM|57,661,613.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
16
+ 2/26/2012 7:48:07AM|58,776,243.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
17
+ 2/23/2012 7:47:39AM|58,699,535.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
18
+ 2/23/2012 7:39:56AM|58,699,387.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
19
+ 2/23/2012 7:39:56AM|58,699,388.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
20
+ 2/15/2012 3:15:06PM|58,502,037.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
data/spec/data/recipe.txt CHANGED
@@ -1,4 +1,5 @@
1
1
  gsub --field order_number --from ',|(.00$)' --to ''
2
+ datify --field dtedone --format "%Y%m%d%H%M"
2
3
  relabel --fields one,two,three,four,five,six,seven,eight,nine,ten,eleven,twelve,thirteen,fourteen
3
4
  pluck --fields two,eight,fourteen
4
5
  maxrows --by two --max eight
@@ -1,7 +1,6 @@
1
1
  # spec for file-sniffing functions
2
2
 
3
3
  require "spec_helper"
4
- require "tempfile"
5
4
 
6
5
  describe "csvification" do
7
6
  it "should convert pipes to standard commas" do
@@ -0,0 +1,16 @@
1
+ # spec for data translation functions
2
+
3
+ require "spec_helper"
4
+
5
+ describe "csvification" do
6
+ it "should convert pipes to standard commas" do
7
+ filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
8
+ tmp = Tempfile.new('csvify')
9
+ results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
10
+ output = File.read(tmp)
11
+ tmp.unlink
12
+ output.lines.count.should == 5
13
+ results[:input_count].should == 6
14
+ results[:output_count].should == 5
15
+ end
16
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-04-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2153649040 !ruby/object:Gem::Requirement
16
+ requirement: &2157087600 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153649040
24
+ version_requirements: *2157087600
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2153648360 !ruby/object:Gem::Requirement
27
+ requirement: &2157087060 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2153648360
35
+ version_requirements: *2157087060
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2153647700 !ruby/object:Gem::Requirement
38
+ requirement: &2157086580 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2153647700
46
+ version_requirements: *2157086580
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com
@@ -79,6 +79,7 @@ files:
79
79
  - spec/data/broken_psv.txt
80
80
  - spec/data/concat_result.txt
81
81
  - spec/data/cooking_result.csv
82
+ - spec/data/datify_input.txt
82
83
  - spec/data/events.csv
83
84
  - spec/data/events_reduced.csv
84
85
  - spec/data/inlined_headers.csv
@@ -96,6 +97,7 @@ files:
96
97
  - spec/lib/concat_spec.rb
97
98
  - spec/lib/cook_spec.rb
98
99
  - spec/lib/csvify_spec.rb
100
+ - spec/lib/datify_spec.rb
99
101
  - spec/lib/gsub_spec.rb
100
102
  - spec/lib/maxrow_spec.rb
101
103
  - spec/lib/mender_spec.rb
@@ -133,6 +135,7 @@ test_files:
133
135
  - spec/data/broken_psv.txt
134
136
  - spec/data/concat_result.txt
135
137
  - spec/data/cooking_result.csv
138
+ - spec/data/datify_input.txt
136
139
  - spec/data/events.csv
137
140
  - spec/data/events_reduced.csv
138
141
  - spec/data/inlined_headers.csv
@@ -150,6 +153,7 @@ test_files:
150
153
  - spec/lib/concat_spec.rb
151
154
  - spec/lib/cook_spec.rb
152
155
  - spec/lib/csvify_spec.rb
156
+ - spec/lib/datify_spec.rb
153
157
  - spec/lib/gsub_spec.rb
154
158
  - spec/lib/maxrow_spec.rb
155
159
  - spec/lib/mender_spec.rb