masticate 0.2 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/masticate/base.rb +26 -4
- data/lib/masticate/cook.rb +8 -3
- data/lib/masticate/datify.rb +14 -23
- data/lib/masticate/gsubber.rb +2 -25
- data/lib/masticate/max_rows.rb +3 -39
- data/lib/masticate/plucker.rb +5 -41
- data/lib/masticate/relabel.rb +1 -22
- data/lib/masticate/version.rb +1 -1
- data/spec/data/cooking_result.csv +0 -22
- data/spec/data/datify_input.txt +20 -0
- data/spec/data/recipe.txt +1 -0
- data/spec/lib/csvify_spec.rb +0 -1
- data/spec/lib/datify_spec.rb +16 -0
- metadata +11 -7
data/lib/masticate/base.rb
CHANGED
@@ -30,9 +30,15 @@ class Masticate::Base
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def emit(line)
|
33
|
+
@output_count ||= 0
|
33
34
|
@output_count += 1
|
34
35
|
begin
|
35
|
-
|
36
|
+
case line
|
37
|
+
when Array
|
38
|
+
@output.puts line.to_csv
|
39
|
+
else
|
40
|
+
@output.puts line
|
41
|
+
end
|
36
42
|
rescue Errno::EPIPE
|
37
43
|
# output was closed, e.g. ran piped into `head`
|
38
44
|
# silently ignore this condition, it's not fatal and doesn't need a warning
|
@@ -48,7 +54,23 @@ class Masticate::Base
|
|
48
54
|
end
|
49
55
|
end
|
50
56
|
|
51
|
-
|
52
|
-
|
53
|
-
|
57
|
+
def execute(opts)
|
58
|
+
configure(opts)
|
59
|
+
|
60
|
+
@output_count = 0
|
61
|
+
with_input do |input|
|
62
|
+
while line = get
|
63
|
+
row = CSV.parse_line(line, csv_options)
|
64
|
+
output = crunch(row)
|
65
|
+
emit(output) if output
|
66
|
+
end
|
67
|
+
end
|
68
|
+
crunch(nil) {|row| emit(row)}
|
69
|
+
@output.close if opts[:output]
|
70
|
+
|
71
|
+
{
|
72
|
+
:input_count => input_count,
|
73
|
+
:output_count => @output_count
|
74
|
+
}
|
75
|
+
end
|
54
76
|
end
|
data/lib/masticate/cook.rb
CHANGED
@@ -12,6 +12,8 @@ class Masticate::Cook < Masticate::Base
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def cook(opts)
|
15
|
+
standard_options(opts)
|
16
|
+
|
15
17
|
recipefile = opts[:recipe] or raise "missing recipe for cook"
|
16
18
|
recipe = File.read(recipefile).lines
|
17
19
|
standard_options(opts)
|
@@ -31,13 +33,16 @@ class Masticate::Cook < Masticate::Base
|
|
31
33
|
row = CSV.parse_line(line, csv_options)
|
32
34
|
|
33
35
|
steps.each do |step|
|
34
|
-
|
35
|
-
row = step.crunch(row)
|
36
|
+
row = step.crunch(row) if row
|
36
37
|
end
|
37
38
|
|
38
|
-
emit(row
|
39
|
+
emit(row) if row
|
39
40
|
end
|
40
41
|
end
|
42
|
+
steps.each do |step|
|
43
|
+
step.crunch(nil) {|row| emit(row)}
|
44
|
+
end
|
45
|
+
|
41
46
|
@output.close if opts[:output]
|
42
47
|
|
43
48
|
{
|
data/lib/masticate/datify.rb
CHANGED
@@ -2,32 +2,23 @@
|
|
2
2
|
require "csv"
|
3
3
|
|
4
4
|
class Masticate::Datify < Masticate::Base
|
5
|
-
def
|
5
|
+
def configure(opts)
|
6
6
|
standard_options(opts)
|
7
|
+
@field = opts[:field] or raise "missing field to datify"
|
8
|
+
@format = opts[:format] or raise "strptime format required for parsing timestamps"
|
9
|
+
end
|
7
10
|
|
8
|
-
|
9
|
-
|
11
|
+
def datify(opts)
|
12
|
+
execute(opts)
|
13
|
+
end
|
10
14
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
headers = row
|
18
|
-
index = headers.index(field) or raise "Unable to find column '#{field}'"
|
19
|
-
emit(headers.to_csv)
|
20
|
-
else
|
21
|
-
row[index] = DateTime.strptime(row[index], format).to_time.to_i rescue nil
|
22
|
-
emit(row.to_csv)
|
23
|
-
end
|
24
|
-
end
|
15
|
+
def crunch(row)
|
16
|
+
if !@index
|
17
|
+
@index = row.index(@field) or raise "Unable to find column '#{@field}'"
|
18
|
+
elsif row
|
19
|
+
ts = DateTime.strptime(row[@index], @format).to_time
|
20
|
+
row[@index] = ts.to_i rescue nil
|
25
21
|
end
|
26
|
-
|
27
|
-
|
28
|
-
{
|
29
|
-
:input_count => @input_count,
|
30
|
-
:output_count => @output_count
|
31
|
-
}
|
22
|
+
row
|
32
23
|
end
|
33
24
|
end
|
data/lib/masticate/gsubber.rb
CHANGED
@@ -16,36 +16,13 @@ class Masticate::Gsubber < Masticate::Base
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def gsub(opts)
|
19
|
-
|
20
|
-
@output_count = 0
|
21
|
-
headers = nil
|
22
|
-
with_input do |input|
|
23
|
-
while line = get
|
24
|
-
row = CSV.parse_line(line, csv_options)
|
25
|
-
if !headers
|
26
|
-
headers = row
|
27
|
-
index = headers.index(@field) or raise "Unable to find column '#{@field}' in headers"
|
28
|
-
emit(line)
|
29
|
-
else
|
30
|
-
oldval = row[index]
|
31
|
-
newval = oldval.gsub(@from, @to)
|
32
|
-
row[index] = newval
|
33
|
-
emit(row.to_csv)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
@output.close if opts[:output]
|
38
|
-
|
39
|
-
{
|
40
|
-
:input_count => input_count,
|
41
|
-
:output_count => @output_count
|
42
|
-
}
|
19
|
+
execute(opts)
|
43
20
|
end
|
44
21
|
|
45
22
|
def crunch(row)
|
46
23
|
if !@headers
|
47
24
|
set_headers(row)
|
48
|
-
|
25
|
+
elsif row
|
49
26
|
row[@index] = row[@index].gsub(@from, @to)
|
50
27
|
end
|
51
28
|
row
|
data/lib/masticate/max_rows.rb
CHANGED
@@ -10,44 +10,7 @@ class Masticate::MaxRows < Masticate::Base
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def maxrows(opts)
|
13
|
-
|
14
|
-
|
15
|
-
@output_count = 0
|
16
|
-
headers = nil
|
17
|
-
accum = {}
|
18
|
-
with_input do |input|
|
19
|
-
while line = get
|
20
|
-
row = CSV.parse_line(line, csv_options)
|
21
|
-
if !headers
|
22
|
-
headers = row
|
23
|
-
index_by = headers.index(@groupby) or raise "Unable to find column '#{@groupby}'"
|
24
|
-
index_max = headers.index(@maxon) or raise "Unable to find column '#{@maxon}'"
|
25
|
-
emit(line)
|
26
|
-
else
|
27
|
-
key = row[index_by]
|
28
|
-
if !accum[key]
|
29
|
-
accum[key] = row
|
30
|
-
else
|
31
|
-
oldscore = accum[key][index_max]
|
32
|
-
newscore = row[index_max]
|
33
|
-
if newscore > oldscore
|
34
|
-
accum[key] = row
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
accum.each do |k,row|
|
42
|
-
emit(row.to_csv)
|
43
|
-
end
|
44
|
-
|
45
|
-
@output.close if opts[:output]
|
46
|
-
|
47
|
-
{
|
48
|
-
:input_count => @input_count,
|
49
|
-
:output_count => @output_count
|
50
|
-
}
|
13
|
+
execute(opts)
|
51
14
|
end
|
52
15
|
|
53
16
|
def crunch(row)
|
@@ -60,7 +23,7 @@ class Masticate::MaxRows < Masticate::Base
|
|
60
23
|
elsif row.nil?
|
61
24
|
# output the accumulated results
|
62
25
|
@accum.each do |k,row|
|
63
|
-
|
26
|
+
yield row
|
64
27
|
end
|
65
28
|
else
|
66
29
|
key = row[@index_by]
|
@@ -73,6 +36,7 @@ class Masticate::MaxRows < Masticate::Base
|
|
73
36
|
@accum[key] = row
|
74
37
|
end
|
75
38
|
end
|
39
|
+
nil
|
76
40
|
end
|
77
41
|
end
|
78
42
|
end
|
data/lib/masticate/plucker.rb
CHANGED
@@ -9,44 +9,7 @@ class Masticate::Plucker < Masticate::Base
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def pluck(opts)
|
12
|
-
|
13
|
-
# standard_options(opts)
|
14
|
-
#
|
15
|
-
# fields = opts[:fields] or raise "missing fields to pluck"
|
16
|
-
|
17
|
-
@output_count = 0
|
18
|
-
headers = nil
|
19
|
-
with_input do |input|
|
20
|
-
while line = get
|
21
|
-
row = CSV.parse_line(line, csv_options)
|
22
|
-
if !headers
|
23
|
-
headers = row
|
24
|
-
indexes = @fields.map do |f|
|
25
|
-
case f
|
26
|
-
when String
|
27
|
-
headers.index(f) or raise "Unable to find column '#{f}'"
|
28
|
-
when Fixnum
|
29
|
-
if f > headers.count
|
30
|
-
raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
|
31
|
-
else
|
32
|
-
f-1
|
33
|
-
end
|
34
|
-
else
|
35
|
-
raise "Invalid field descriptor '#{f}'"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
emit(indexes.map {|i| headers[i]}.to_csv)
|
39
|
-
else
|
40
|
-
emit(indexes.map {|i| row[i]}.to_csv) if row
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
@output.close if opts[:output]
|
45
|
-
|
46
|
-
{
|
47
|
-
:input_count => input_count,
|
48
|
-
:output_count => @output_count
|
49
|
-
}
|
12
|
+
execute(opts)
|
50
13
|
end
|
51
14
|
|
52
15
|
def crunch(row)
|
@@ -66,9 +29,10 @@ class Masticate::Plucker < Masticate::Base
|
|
66
29
|
raise "Invalid field descriptor '#{f}'"
|
67
30
|
end
|
68
31
|
end
|
32
|
+
@indexes.map {|i| row[i]}
|
33
|
+
elsif row
|
34
|
+
# output is just the selected columns
|
35
|
+
@indexes.map {|i| row[i]}
|
69
36
|
end
|
70
|
-
|
71
|
-
# output is just the selected columns
|
72
|
-
@indexes.map {|i| row[i]}
|
73
37
|
end
|
74
38
|
end
|
data/lib/masticate/relabel.rb
CHANGED
@@ -10,28 +10,7 @@ class Masticate::Relabel < Masticate::Base
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def relabel(opts)
|
13
|
-
|
14
|
-
|
15
|
-
@output_count = 0
|
16
|
-
headers = nil
|
17
|
-
with_input do |input|
|
18
|
-
while line = get
|
19
|
-
row = CSV.parse_line(line, csv_options)
|
20
|
-
if !headers
|
21
|
-
headers = @fields
|
22
|
-
emit(headers.to_csv)
|
23
|
-
else
|
24
|
-
emit(row.to_csv)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
@output.close if opts[:output]
|
29
|
-
|
30
|
-
# File.unlink(opts[:output]) if opts[:output] && File.exists?(opts[:output])
|
31
|
-
# redirect = ">>#{opts[:output]}" if opts[:output]
|
32
|
-
#
|
33
|
-
# system "/bin/echo -n '#{fields.to_csv}' #{redirect}"
|
34
|
-
# system "tail +2 #{@filename} #{redirect}"
|
13
|
+
execute(opts)
|
35
14
|
end
|
36
15
|
|
37
16
|
def crunch(row)
|
data/lib/masticate/version.rb
CHANGED
data/spec/data/cooking_result.csv
CHANGED
@@ -1,40 +1,18 @@
|
|
1
1
|
two,eight,fourteen
|
2
|
-
20120106003230,201201060140,594756
|
3
|
-
20120106003230,201201060222,594761
|
4
|
-
20120106003230,201201060311,594766
|
5
2
|
20120106003230,201201060514,594787
|
6
|
-
20120106024355,201201060309,594764
|
7
|
-
20120106024355,201201060310,594765
|
8
|
-
20120106024355,201201060520,594789
|
9
3
|
20120106024355,201201060826,594823
|
10
|
-
20120106032719,201201060348,594775
|
11
|
-
20120106032719,201201060445,594785
|
12
4
|
20120106032719,201201060521,594790
|
13
|
-
20120106033235,201201060347,594769
|
14
5
|
20120106033235,201201060429,594780
|
15
6
|
20120106035346,201201060446,594786
|
16
7
|
20120106041426,201201060445,594784
|
17
8
|
20120106043025,201201060515,594788
|
18
|
-
20120106045326,201201060535,594791
|
19
9
|
20120106045326,201201060536,594796
|
20
|
-
20120106052714,201201060544,594797
|
21
10
|
20120106052714,201201060551,594802
|
22
|
-
20120106070243,201201060721,594803
|
23
|
-
20120106070243,201201060806,594820
|
24
|
-
20120106070243,201201060807,594822
|
25
|
-
20120106070243,201201060910,594831
|
26
11
|
20120106070243,201201060928,594834
|
27
12
|
20120106073142,201201060757,594815
|
28
|
-
20120106073757,201201060749,594810
|
29
|
-
20120106073757,201201060911,594832
|
30
|
-
20120106073757,201201060928,594833
|
31
|
-
20120106073757,201201061022,594862
|
32
13
|
20120106073757,201201061131,594896
|
33
14
|
20120106084347,201201060850,594828
|
34
|
-
20120106084720,201201060909,594829
|
35
15
|
20120106084720,201201060910,594830
|
36
|
-
20120106085558,201201060949,594846
|
37
|
-
20120106085558,201201061033,594864
|
38
16
|
20120106085558,201201061228,594961
|
39
17
|
20120106091726,201201060942,594839
|
40
18
|
20120106095129,201201061010,594853
|
data/spec/data/datify_input.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
ChangeDTTM|AuditEventID|AuditEventCode|AuditEventTypeID|AuditEventName|AuditEventDesc|AuditByID
|
2
|
+
12/31/2011 3:41:54PM|57,314,552.00|LOGGEDOUT|30|Logged out|Logged out|4,426.00
|
3
|
+
1/8/2012 9:21:23AM|57,486,988.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
4
|
+
1/8/2012 9:21:24AM|57,486,989.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
5
|
+
2/19/2012 6:08:59PM|58,597,521.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
6
|
+
2/19/2012 7:02:50PM|58,597,816.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
7
|
+
2/19/2012 7:02:50PM|58,597,817.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
8
|
+
2/4/2012 9:29:20AM|58,201,259.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
9
|
+
2/4/2012 12:19:30AM|58,196,973.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
10
|
+
2/4/2012 12:20:53AM|58,196,975.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
11
|
+
2/3/2012 10:44:25PM|58,196,507.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
12
|
+
1/20/2012 8:01:07PM|57,833,496.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
13
|
+
1/22/2012 4:20:48AM|57,845,095.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
14
|
+
1/22/2012 4:20:49AM|57,845,096.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
15
|
+
1/14/2012 7:50:40AM|57,661,613.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
16
|
+
2/26/2012 7:48:07AM|58,776,243.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
17
|
+
2/23/2012 7:47:39AM|58,699,535.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
18
|
+
2/23/2012 7:39:56AM|58,699,387.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
19
|
+
2/23/2012 7:39:56AM|58,699,388.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
20
|
+
2/15/2012 3:15:06PM|58,502,037.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
data/spec/data/recipe.txt
CHANGED
data/spec/lib/csvify_spec.rb
CHANGED
data/spec/lib/datify_spec.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# spec for data translation functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "csvification" do
|
6
|
+
it "should convert pipes to standard commas" do
|
7
|
+
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
8
|
+
tmp = Tempfile.new('csvify')
|
9
|
+
results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
|
10
|
+
output = File.read(tmp)
|
11
|
+
tmp.unlink
|
12
|
+
output.lines.count.should == 5
|
13
|
+
results[:input_count].should == 6
|
14
|
+
results[:output_count].should == 5
|
15
|
+
end
|
16
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-04-23 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2157087600 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2157087600
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2157087060 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2157087060
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2157086580 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2157086580
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|
@@ -79,6 +79,7 @@ files:
|
|
79
79
|
- spec/data/broken_psv.txt
|
80
80
|
- spec/data/concat_result.txt
|
81
81
|
- spec/data/cooking_result.csv
|
82
|
+
- spec/data/datify_input.txt
|
82
83
|
- spec/data/events.csv
|
83
84
|
- spec/data/events_reduced.csv
|
84
85
|
- spec/data/inlined_headers.csv
|
@@ -96,6 +97,7 @@ files:
|
|
96
97
|
- spec/lib/concat_spec.rb
|
97
98
|
- spec/lib/cook_spec.rb
|
98
99
|
- spec/lib/csvify_spec.rb
|
100
|
+
- spec/lib/datify_spec.rb
|
99
101
|
- spec/lib/gsub_spec.rb
|
100
102
|
- spec/lib/maxrow_spec.rb
|
101
103
|
- spec/lib/mender_spec.rb
|
@@ -133,6 +135,7 @@ test_files:
|
|
133
135
|
- spec/data/broken_psv.txt
|
134
136
|
- spec/data/concat_result.txt
|
135
137
|
- spec/data/cooking_result.csv
|
138
|
+
- spec/data/datify_input.txt
|
136
139
|
- spec/data/events.csv
|
137
140
|
- spec/data/events_reduced.csv
|
138
141
|
- spec/data/inlined_headers.csv
|
@@ -150,6 +153,7 @@ test_files:
|
|
150
153
|
- spec/lib/concat_spec.rb
|
151
154
|
- spec/lib/cook_spec.rb
|
152
155
|
- spec/lib/csvify_spec.rb
|
156
|
+
- spec/lib/datify_spec.rb
|
153
157
|
- spec/lib/gsub_spec.rb
|
154
158
|
- spec/lib/maxrow_spec.rb
|
155
159
|
- spec/lib/mender_spec.rb
|