masticate 0.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,9 +30,15 @@ class Masticate::Base
30
30
  end
31
31
 
32
32
  def emit(line)
33
+ @output_count ||= 0
33
34
  @output_count += 1
34
35
  begin
35
- @output.puts line
36
+ case line
37
+ when Array
38
+ @output.puts line.to_csv
39
+ else
40
+ @output.puts line
41
+ end
36
42
  rescue Errno::EPIPE
37
43
  # output was closed, e.g. ran piped into `head`
38
44
  # silently ignore this condition, it's not fatal and doesn't need a warning
@@ -48,7 +54,23 @@ class Masticate::Base
48
54
  end
49
55
  end
50
56
 
51
- # def crunch(row)
52
- # # noop
53
- # end
57
+ def execute(opts)
58
+ configure(opts)
59
+
60
+ @output_count = 0
61
+ with_input do |input|
62
+ while line = get
63
+ row = CSV.parse_line(line, csv_options)
64
+ output = crunch(row)
65
+ emit(output) if output
66
+ end
67
+ end
68
+ crunch(nil) {|row| emit(row)}
69
+ @output.close if opts[:output]
70
+
71
+ {
72
+ :input_count => input_count,
73
+ :output_count => @output_count
74
+ }
75
+ end
54
76
  end
@@ -12,6 +12,8 @@ class Masticate::Cook < Masticate::Base
12
12
  end
13
13
 
14
14
  def cook(opts)
15
+ standard_options(opts)
16
+
15
17
  recipefile = opts[:recipe] or raise "missing recipe for cook"
16
18
  recipe = File.read(recipefile).lines
17
19
  standard_options(opts)
@@ -31,13 +33,16 @@ class Masticate::Cook < Masticate::Base
31
33
  row = CSV.parse_line(line, csv_options)
32
34
 
33
35
  steps.each do |step|
34
- # puts "APPLY #{step} to #{row}"
35
- row = step.crunch(row)
36
+ row = step.crunch(row) if row
36
37
  end
37
38
 
38
- emit(row.to_csv) if row
39
+ emit(row) if row
39
40
  end
40
41
  end
42
+ steps.each do |step|
43
+ step.crunch(nil) {|row| emit(row)}
44
+ end
45
+
41
46
  @output.close if opts[:output]
42
47
 
43
48
  {
@@ -2,32 +2,23 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Datify < Masticate::Base
5
- def datify(opts)
5
+ def configure(opts)
6
6
  standard_options(opts)
7
+ @field = opts[:field] or raise "missing field to datify"
8
+ @format = opts[:format] or raise "strptime format required for parsing timestamps"
9
+ end
7
10
 
8
- field = opts[:field] or raise "missing field to datify"
9
- format = opts[:format] or raise "strptime format required for parsing timestamps"
11
+ def datify(opts)
12
+ execute(opts)
13
+ end
10
14
 
11
- @output_count = 0
12
- headers = nil
13
- with_input do |input|
14
- while line = get
15
- row = CSV.parse_line(line, csv_options)
16
- if !headers
17
- headers = row
18
- index = headers.index(field) or raise "Unable to find column '#{field}'"
19
- emit(headers.to_csv)
20
- else
21
- row[index] = DateTime.strptime(row[index], format).to_time.to_i rescue nil
22
- emit(row.to_csv)
23
- end
24
- end
15
+ def crunch(row)
16
+ if !@index
17
+ @index = row.index(@field) or raise "Unable to find column '#{@field}'"
18
+ elsif row
19
+ ts = DateTime.strptime(row[@index], @format).to_time
20
+ row[@index] = ts.to_i rescue nil
25
21
  end
26
- @output.close if opts[:output]
27
-
28
- {
29
- :input_count => @input_count,
30
- :output_count => @output_count
31
- }
22
+ row
32
23
  end
33
24
  end
@@ -16,36 +16,13 @@ class Masticate::Gsubber < Masticate::Base
16
16
  end
17
17
 
18
18
  def gsub(opts)
19
- configure(opts)
20
- @output_count = 0
21
- headers = nil
22
- with_input do |input|
23
- while line = get
24
- row = CSV.parse_line(line, csv_options)
25
- if !headers
26
- headers = row
27
- index = headers.index(@field) or raise "Unable to find column '#{@field}' in headers"
28
- emit(line)
29
- else
30
- oldval = row[index]
31
- newval = oldval.gsub(@from, @to)
32
- row[index] = newval
33
- emit(row.to_csv)
34
- end
35
- end
36
- end
37
- @output.close if opts[:output]
38
-
39
- {
40
- :input_count => input_count,
41
- :output_count => @output_count
42
- }
19
+ execute(opts)
43
20
  end
44
21
 
45
22
  def crunch(row)
46
23
  if !@headers
47
24
  set_headers(row)
48
- else
25
+ elsif row
49
26
  row[@index] = row[@index].gsub(@from, @to)
50
27
  end
51
28
  row
@@ -10,44 +10,7 @@ class Masticate::MaxRows < Masticate::Base
10
10
  end
11
11
 
12
12
  def maxrows(opts)
13
- configure(opts)
14
-
15
- @output_count = 0
16
- headers = nil
17
- accum = {}
18
- with_input do |input|
19
- while line = get
20
- row = CSV.parse_line(line, csv_options)
21
- if !headers
22
- headers = row
23
- index_by = headers.index(@groupby) or raise "Unable to find column '#{@groupby}'"
24
- index_max = headers.index(@maxon) or raise "Unable to find column '#{@maxon}'"
25
- emit(line)
26
- else
27
- key = row[index_by]
28
- if !accum[key]
29
- accum[key] = row
30
- else
31
- oldscore = accum[key][index_max]
32
- newscore = row[index_max]
33
- if newscore > oldscore
34
- accum[key] = row
35
- end
36
- end
37
- end
38
- end
39
- end
40
-
41
- accum.each do |k,row|
42
- emit(row.to_csv)
43
- end
44
-
45
- @output.close if opts[:output]
46
-
47
- {
48
- :input_count => @input_count,
49
- :output_count => @output_count
50
- }
13
+ execute(opts)
51
14
  end
52
15
 
53
16
  def crunch(row)
@@ -60,7 +23,7 @@ class Masticate::MaxRows < Masticate::Base
60
23
  elsif row.nil?
61
24
  # output the accumulated results
62
25
  @accum.each do |k,row|
63
- emit(row.to_csv)
26
+ yield row
64
27
  end
65
28
  else
66
29
  key = row[@index_by]
@@ -73,6 +36,7 @@ class Masticate::MaxRows < Masticate::Base
73
36
  @accum[key] = row
74
37
  end
75
38
  end
39
+ nil
76
40
  end
77
41
  end
78
42
  end
@@ -9,44 +9,7 @@ class Masticate::Plucker < Masticate::Base
9
9
  end
10
10
 
11
11
  def pluck(opts)
12
- configure(opts)
13
- # standard_options(opts)
14
- #
15
- # fields = opts[:fields] or raise "missing fields to pluck"
16
-
17
- @output_count = 0
18
- headers = nil
19
- with_input do |input|
20
- while line = get
21
- row = CSV.parse_line(line, csv_options)
22
- if !headers
23
- headers = row
24
- indexes = @fields.map do |f|
25
- case f
26
- when String
27
- headers.index(f) or raise "Unable to find column '#{f}'"
28
- when Fixnum
29
- if f > headers.count
30
- raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
31
- else
32
- f-1
33
- end
34
- else
35
- raise "Invalid field descriptor '#{f}'"
36
- end
37
- end
38
- emit(indexes.map {|i| headers[i]}.to_csv)
39
- else
40
- emit(indexes.map {|i| row[i]}.to_csv) if row
41
- end
42
- end
43
- end
44
- @output.close if opts[:output]
45
-
46
- {
47
- :input_count => input_count,
48
- :output_count => @output_count
49
- }
12
+ execute(opts)
50
13
  end
51
14
 
52
15
  def crunch(row)
@@ -66,9 +29,10 @@ class Masticate::Plucker < Masticate::Base
66
29
  raise "Invalid field descriptor '#{f}'"
67
30
  end
68
31
  end
32
+ @indexes.map {|i| row[i]}
33
+ elsif row
34
+ # output is just the selected columns
35
+ @indexes.map {|i| row[i]}
69
36
  end
70
-
71
- # output is just the selected columns
72
- @indexes.map {|i| row[i]}
73
37
  end
74
38
  end
@@ -10,28 +10,7 @@ class Masticate::Relabel < Masticate::Base
10
10
  end
11
11
 
12
12
  def relabel(opts)
13
- configure(opts)
14
-
15
- @output_count = 0
16
- headers = nil
17
- with_input do |input|
18
- while line = get
19
- row = CSV.parse_line(line, csv_options)
20
- if !headers
21
- headers = @fields
22
- emit(headers.to_csv)
23
- else
24
- emit(row.to_csv)
25
- end
26
- end
27
- end
28
- @output.close if opts[:output]
29
-
30
- # File.unlink(opts[:output]) if opts[:output] && File.exists?(opts[:output])
31
- # redirect = ">>#{opts[:output]}" if opts[:output]
32
- #
33
- # system "/bin/echo -n '#{fields.to_csv}' #{redirect}"
34
- # system "tail +2 #{@filename} #{redirect}"
13
+ execute(opts)
35
14
  end
36
15
 
37
16
  def crunch(row)
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.2"
2
+ VERSION = "0.2.1"
3
3
  end
@@ -1,40 +1,18 @@
1
1
  two,eight,fourteen
2
- 20120106003230,201201060140,594756
3
- 20120106003230,201201060222,594761
4
- 20120106003230,201201060311,594766
5
2
  20120106003230,201201060514,594787
6
- 20120106024355,201201060309,594764
7
- 20120106024355,201201060310,594765
8
- 20120106024355,201201060520,594789
9
3
  20120106024355,201201060826,594823
10
- 20120106032719,201201060348,594775
11
- 20120106032719,201201060445,594785
12
4
  20120106032719,201201060521,594790
13
- 20120106033235,201201060347,594769
14
5
  20120106033235,201201060429,594780
15
6
  20120106035346,201201060446,594786
16
7
  20120106041426,201201060445,594784
17
8
  20120106043025,201201060515,594788
18
- 20120106045326,201201060535,594791
19
9
  20120106045326,201201060536,594796
20
- 20120106052714,201201060544,594797
21
10
  20120106052714,201201060551,594802
22
- 20120106070243,201201060721,594803
23
- 20120106070243,201201060806,594820
24
- 20120106070243,201201060807,594822
25
- 20120106070243,201201060910,594831
26
11
  20120106070243,201201060928,594834
27
12
  20120106073142,201201060757,594815
28
- 20120106073757,201201060749,594810
29
- 20120106073757,201201060911,594832
30
- 20120106073757,201201060928,594833
31
- 20120106073757,201201061022,594862
32
13
  20120106073757,201201061131,594896
33
14
  20120106084347,201201060850,594828
34
- 20120106084720,201201060909,594829
35
15
  20120106084720,201201060910,594830
36
- 20120106085558,201201060949,594846
37
- 20120106085558,201201061033,594864
38
16
  20120106085558,201201061228,594961
39
17
  20120106091726,201201060942,594839
40
18
  20120106095129,201201061010,594853
@@ -0,0 +1,20 @@
1
+ ChangeDTTM|AuditEventID|AuditEventCode|AuditEventTypeID|AuditEventName|AuditEventDesc|AuditByID
2
+ 12/31/2011 3:41:54PM|57,314,552.00|LOGGEDOUT|30|Logged out|Logged out|4,426.00
3
+ 1/8/2012 9:21:23AM|57,486,988.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
4
+ 1/8/2012 9:21:24AM|57,486,989.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
5
+ 2/19/2012 6:08:59PM|58,597,521.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
6
+ 2/19/2012 7:02:50PM|58,597,816.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
7
+ 2/19/2012 7:02:50PM|58,597,817.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
8
+ 2/4/2012 9:29:20AM|58,201,259.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
9
+ 2/4/2012 12:19:30AM|58,196,973.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
10
+ 2/4/2012 12:20:53AM|58,196,975.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
11
+ 2/3/2012 10:44:25PM|58,196,507.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
12
+ 1/20/2012 8:01:07PM|57,833,496.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
13
+ 1/22/2012 4:20:48AM|57,845,095.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
14
+ 1/22/2012 4:20:49AM|57,845,096.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
15
+ 1/14/2012 7:50:40AM|57,661,613.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
16
+ 2/26/2012 7:48:07AM|58,776,243.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
17
+ 2/23/2012 7:47:39AM|58,699,535.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
18
+ 2/23/2012 7:39:56AM|58,699,387.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
19
+ 2/23/2012 7:39:56AM|58,699,388.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
20
+ 2/15/2012 3:15:06PM|58,502,037.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
data/spec/data/recipe.txt CHANGED
@@ -1,4 +1,5 @@
1
1
  gsub --field order_number --from ',|(.00$)' --to ''
2
+ datify --field dtedone --format "%Y%m%d%H%M"
2
3
  relabel --fields one,two,three,four,five,six,seven,eight,nine,ten,eleven,twelve,thirteen,fourteen
3
4
  pluck --fields two,eight,fourteen
4
5
  maxrows --by two --max eight
@@ -1,7 +1,6 @@
1
1
  # spec for file-sniffing functions
2
2
 
3
3
  require "spec_helper"
4
- require "tempfile"
5
4
 
6
5
  describe "csvification" do
7
6
  it "should convert pipes to standard commas" do
@@ -0,0 +1,16 @@
1
+ # spec for data translation functions
2
+
3
+ require "spec_helper"
4
+
5
+ describe "csvification" do
6
+ it "should convert pipes to standard commas" do
7
+ filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
8
+ tmp = Tempfile.new('csvify')
9
+ results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
10
+ output = File.read(tmp)
11
+ tmp.unlink
12
+ output.lines.count.should == 5
13
+ results[:input_count].should == 6
14
+ results[:output_count].should == 5
15
+ end
16
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-04-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2153649040 !ruby/object:Gem::Requirement
16
+ requirement: &2157087600 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153649040
24
+ version_requirements: *2157087600
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: guard-rspec
27
- requirement: &2153648360 !ruby/object:Gem::Requirement
27
+ requirement: &2157087060 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.7.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2153648360
35
+ version_requirements: *2157087060
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ruby_gntp
38
- requirement: &2153647700 !ruby/object:Gem::Requirement
38
+ requirement: &2157086580 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: 0.3.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2153647700
46
+ version_requirements: *2157086580
47
47
  description: Data file crunching
48
48
  email:
49
49
  - jmay@pobox.com
@@ -79,6 +79,7 @@ files:
79
79
  - spec/data/broken_psv.txt
80
80
  - spec/data/concat_result.txt
81
81
  - spec/data/cooking_result.csv
82
+ - spec/data/datify_input.txt
82
83
  - spec/data/events.csv
83
84
  - spec/data/events_reduced.csv
84
85
  - spec/data/inlined_headers.csv
@@ -96,6 +97,7 @@ files:
96
97
  - spec/lib/concat_spec.rb
97
98
  - spec/lib/cook_spec.rb
98
99
  - spec/lib/csvify_spec.rb
100
+ - spec/lib/datify_spec.rb
99
101
  - spec/lib/gsub_spec.rb
100
102
  - spec/lib/maxrow_spec.rb
101
103
  - spec/lib/mender_spec.rb
@@ -133,6 +135,7 @@ test_files:
133
135
  - spec/data/broken_psv.txt
134
136
  - spec/data/concat_result.txt
135
137
  - spec/data/cooking_result.csv
138
+ - spec/data/datify_input.txt
136
139
  - spec/data/events.csv
137
140
  - spec/data/events_reduced.csv
138
141
  - spec/data/inlined_headers.csv
@@ -150,6 +153,7 @@ test_files:
150
153
  - spec/lib/concat_spec.rb
151
154
  - spec/lib/cook_spec.rb
152
155
  - spec/lib/csvify_spec.rb
156
+ - spec/lib/datify_spec.rb
153
157
  - spec/lib/gsub_spec.rb
154
158
  - spec/lib/maxrow_spec.rb
155
159
  - spec/lib/mender_spec.rb