masticate 0.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/masticate/base.rb +26 -4
- data/lib/masticate/cook.rb +8 -3
- data/lib/masticate/datify.rb +14 -23
- data/lib/masticate/gsubber.rb +2 -25
- data/lib/masticate/max_rows.rb +3 -39
- data/lib/masticate/plucker.rb +5 -41
- data/lib/masticate/relabel.rb +1 -22
- data/lib/masticate/version.rb +1 -1
- data/spec/data/cooking_result.csv +0 -22
- data/spec/data/datify_input.txt +20 -0
- data/spec/data/recipe.txt +1 -0
- data/spec/lib/csvify_spec.rb +0 -1
- data/spec/lib/datify_spec.rb +16 -0
- metadata +11 -7
data/lib/masticate/base.rb
CHANGED
@@ -30,9 +30,15 @@ class Masticate::Base
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def emit(line)
|
33
|
+
@output_count ||= 0
|
33
34
|
@output_count += 1
|
34
35
|
begin
|
35
|
-
|
36
|
+
case line
|
37
|
+
when Array
|
38
|
+
@output.puts line.to_csv
|
39
|
+
else
|
40
|
+
@output.puts line
|
41
|
+
end
|
36
42
|
rescue Errno::EPIPE
|
37
43
|
# output was closed, e.g. ran piped into `head`
|
38
44
|
# silently ignore this condition, it's not fatal and doesn't need a warning
|
@@ -48,7 +54,23 @@ class Masticate::Base
|
|
48
54
|
end
|
49
55
|
end
|
50
56
|
|
51
|
-
|
52
|
-
|
53
|
-
|
57
|
+
def execute(opts)
|
58
|
+
configure(opts)
|
59
|
+
|
60
|
+
@output_count = 0
|
61
|
+
with_input do |input|
|
62
|
+
while line = get
|
63
|
+
row = CSV.parse_line(line, csv_options)
|
64
|
+
output = crunch(row)
|
65
|
+
emit(output) if output
|
66
|
+
end
|
67
|
+
end
|
68
|
+
crunch(nil) {|row| emit(row)}
|
69
|
+
@output.close if opts[:output]
|
70
|
+
|
71
|
+
{
|
72
|
+
:input_count => input_count,
|
73
|
+
:output_count => @output_count
|
74
|
+
}
|
75
|
+
end
|
54
76
|
end
|
data/lib/masticate/cook.rb
CHANGED
@@ -12,6 +12,8 @@ class Masticate::Cook < Masticate::Base
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def cook(opts)
|
15
|
+
standard_options(opts)
|
16
|
+
|
15
17
|
recipefile = opts[:recipe] or raise "missing recipe for cook"
|
16
18
|
recipe = File.read(recipefile).lines
|
17
19
|
standard_options(opts)
|
@@ -31,13 +33,16 @@ class Masticate::Cook < Masticate::Base
|
|
31
33
|
row = CSV.parse_line(line, csv_options)
|
32
34
|
|
33
35
|
steps.each do |step|
|
34
|
-
|
35
|
-
row = step.crunch(row)
|
36
|
+
row = step.crunch(row) if row
|
36
37
|
end
|
37
38
|
|
38
|
-
emit(row
|
39
|
+
emit(row) if row
|
39
40
|
end
|
40
41
|
end
|
42
|
+
steps.each do |step|
|
43
|
+
step.crunch(nil) {|row| emit(row)}
|
44
|
+
end
|
45
|
+
|
41
46
|
@output.close if opts[:output]
|
42
47
|
|
43
48
|
{
|
data/lib/masticate/datify.rb
CHANGED
@@ -2,32 +2,23 @@
|
|
2
2
|
require "csv"
|
3
3
|
|
4
4
|
class Masticate::Datify < Masticate::Base
|
5
|
-
def
|
5
|
+
def configure(opts)
|
6
6
|
standard_options(opts)
|
7
|
+
@field = opts[:field] or raise "missing field to datify"
|
8
|
+
@format = opts[:format] or raise "strptime format required for parsing timestamps"
|
9
|
+
end
|
7
10
|
|
8
|
-
|
9
|
-
|
11
|
+
def datify(opts)
|
12
|
+
execute(opts)
|
13
|
+
end
|
10
14
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
headers = row
|
18
|
-
index = headers.index(field) or raise "Unable to find column '#{field}'"
|
19
|
-
emit(headers.to_csv)
|
20
|
-
else
|
21
|
-
row[index] = DateTime.strptime(row[index], format).to_time.to_i rescue nil
|
22
|
-
emit(row.to_csv)
|
23
|
-
end
|
24
|
-
end
|
15
|
+
def crunch(row)
|
16
|
+
if !@index
|
17
|
+
@index = row.index(@field) or raise "Unable to find column '#{@field}'"
|
18
|
+
elsif row
|
19
|
+
ts = DateTime.strptime(row[@index], @format).to_time
|
20
|
+
row[@index] = ts.to_i rescue nil
|
25
21
|
end
|
26
|
-
|
27
|
-
|
28
|
-
{
|
29
|
-
:input_count => @input_count,
|
30
|
-
:output_count => @output_count
|
31
|
-
}
|
22
|
+
row
|
32
23
|
end
|
33
24
|
end
|
data/lib/masticate/gsubber.rb
CHANGED
@@ -16,36 +16,13 @@ class Masticate::Gsubber < Masticate::Base
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def gsub(opts)
|
19
|
-
|
20
|
-
@output_count = 0
|
21
|
-
headers = nil
|
22
|
-
with_input do |input|
|
23
|
-
while line = get
|
24
|
-
row = CSV.parse_line(line, csv_options)
|
25
|
-
if !headers
|
26
|
-
headers = row
|
27
|
-
index = headers.index(@field) or raise "Unable to find column '#{@field}' in headers"
|
28
|
-
emit(line)
|
29
|
-
else
|
30
|
-
oldval = row[index]
|
31
|
-
newval = oldval.gsub(@from, @to)
|
32
|
-
row[index] = newval
|
33
|
-
emit(row.to_csv)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
@output.close if opts[:output]
|
38
|
-
|
39
|
-
{
|
40
|
-
:input_count => input_count,
|
41
|
-
:output_count => @output_count
|
42
|
-
}
|
19
|
+
execute(opts)
|
43
20
|
end
|
44
21
|
|
45
22
|
def crunch(row)
|
46
23
|
if !@headers
|
47
24
|
set_headers(row)
|
48
|
-
|
25
|
+
elsif row
|
49
26
|
row[@index] = row[@index].gsub(@from, @to)
|
50
27
|
end
|
51
28
|
row
|
data/lib/masticate/max_rows.rb
CHANGED
@@ -10,44 +10,7 @@ class Masticate::MaxRows < Masticate::Base
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def maxrows(opts)
|
13
|
-
|
14
|
-
|
15
|
-
@output_count = 0
|
16
|
-
headers = nil
|
17
|
-
accum = {}
|
18
|
-
with_input do |input|
|
19
|
-
while line = get
|
20
|
-
row = CSV.parse_line(line, csv_options)
|
21
|
-
if !headers
|
22
|
-
headers = row
|
23
|
-
index_by = headers.index(@groupby) or raise "Unable to find column '#{@groupby}'"
|
24
|
-
index_max = headers.index(@maxon) or raise "Unable to find column '#{@maxon}'"
|
25
|
-
emit(line)
|
26
|
-
else
|
27
|
-
key = row[index_by]
|
28
|
-
if !accum[key]
|
29
|
-
accum[key] = row
|
30
|
-
else
|
31
|
-
oldscore = accum[key][index_max]
|
32
|
-
newscore = row[index_max]
|
33
|
-
if newscore > oldscore
|
34
|
-
accum[key] = row
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
accum.each do |k,row|
|
42
|
-
emit(row.to_csv)
|
43
|
-
end
|
44
|
-
|
45
|
-
@output.close if opts[:output]
|
46
|
-
|
47
|
-
{
|
48
|
-
:input_count => @input_count,
|
49
|
-
:output_count => @output_count
|
50
|
-
}
|
13
|
+
execute(opts)
|
51
14
|
end
|
52
15
|
|
53
16
|
def crunch(row)
|
@@ -60,7 +23,7 @@ class Masticate::MaxRows < Masticate::Base
|
|
60
23
|
elsif row.nil?
|
61
24
|
# output the accumulated results
|
62
25
|
@accum.each do |k,row|
|
63
|
-
|
26
|
+
yield row
|
64
27
|
end
|
65
28
|
else
|
66
29
|
key = row[@index_by]
|
@@ -73,6 +36,7 @@ class Masticate::MaxRows < Masticate::Base
|
|
73
36
|
@accum[key] = row
|
74
37
|
end
|
75
38
|
end
|
39
|
+
nil
|
76
40
|
end
|
77
41
|
end
|
78
42
|
end
|
data/lib/masticate/plucker.rb
CHANGED
@@ -9,44 +9,7 @@ class Masticate::Plucker < Masticate::Base
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def pluck(opts)
|
12
|
-
|
13
|
-
# standard_options(opts)
|
14
|
-
#
|
15
|
-
# fields = opts[:fields] or raise "missing fields to pluck"
|
16
|
-
|
17
|
-
@output_count = 0
|
18
|
-
headers = nil
|
19
|
-
with_input do |input|
|
20
|
-
while line = get
|
21
|
-
row = CSV.parse_line(line, csv_options)
|
22
|
-
if !headers
|
23
|
-
headers = row
|
24
|
-
indexes = @fields.map do |f|
|
25
|
-
case f
|
26
|
-
when String
|
27
|
-
headers.index(f) or raise "Unable to find column '#{f}'"
|
28
|
-
when Fixnum
|
29
|
-
if f > headers.count
|
30
|
-
raise "Cannot pluck column #{f}, there are only #{headers.count} fields"
|
31
|
-
else
|
32
|
-
f-1
|
33
|
-
end
|
34
|
-
else
|
35
|
-
raise "Invalid field descriptor '#{f}'"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
emit(indexes.map {|i| headers[i]}.to_csv)
|
39
|
-
else
|
40
|
-
emit(indexes.map {|i| row[i]}.to_csv) if row
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
@output.close if opts[:output]
|
45
|
-
|
46
|
-
{
|
47
|
-
:input_count => input_count,
|
48
|
-
:output_count => @output_count
|
49
|
-
}
|
12
|
+
execute(opts)
|
50
13
|
end
|
51
14
|
|
52
15
|
def crunch(row)
|
@@ -66,9 +29,10 @@ class Masticate::Plucker < Masticate::Base
|
|
66
29
|
raise "Invalid field descriptor '#{f}'"
|
67
30
|
end
|
68
31
|
end
|
32
|
+
@indexes.map {|i| row[i]}
|
33
|
+
elsif row
|
34
|
+
# output is just the selected columns
|
35
|
+
@indexes.map {|i| row[i]}
|
69
36
|
end
|
70
|
-
|
71
|
-
# output is just the selected columns
|
72
|
-
@indexes.map {|i| row[i]}
|
73
37
|
end
|
74
38
|
end
|
data/lib/masticate/relabel.rb
CHANGED
@@ -10,28 +10,7 @@ class Masticate::Relabel < Masticate::Base
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def relabel(opts)
|
13
|
-
|
14
|
-
|
15
|
-
@output_count = 0
|
16
|
-
headers = nil
|
17
|
-
with_input do |input|
|
18
|
-
while line = get
|
19
|
-
row = CSV.parse_line(line, csv_options)
|
20
|
-
if !headers
|
21
|
-
headers = @fields
|
22
|
-
emit(headers.to_csv)
|
23
|
-
else
|
24
|
-
emit(row.to_csv)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
@output.close if opts[:output]
|
29
|
-
|
30
|
-
# File.unlink(opts[:output]) if opts[:output] && File.exists?(opts[:output])
|
31
|
-
# redirect = ">>#{opts[:output]}" if opts[:output]
|
32
|
-
#
|
33
|
-
# system "/bin/echo -n '#{fields.to_csv}' #{redirect}"
|
34
|
-
# system "tail +2 #{@filename} #{redirect}"
|
13
|
+
execute(opts)
|
35
14
|
end
|
36
15
|
|
37
16
|
def crunch(row)
|
data/lib/masticate/version.rb
CHANGED
data/spec/data/cooking_result.csv
CHANGED
@@ -1,40 +1,18 @@
|
|
1
1
|
two,eight,fourteen
|
2
|
-
20120106003230,201201060140,594756
|
3
|
-
20120106003230,201201060222,594761
|
4
|
-
20120106003230,201201060311,594766
|
5
2
|
20120106003230,201201060514,594787
|
6
|
-
20120106024355,201201060309,594764
|
7
|
-
20120106024355,201201060310,594765
|
8
|
-
20120106024355,201201060520,594789
|
9
3
|
20120106024355,201201060826,594823
|
10
|
-
20120106032719,201201060348,594775
|
11
|
-
20120106032719,201201060445,594785
|
12
4
|
20120106032719,201201060521,594790
|
13
|
-
20120106033235,201201060347,594769
|
14
5
|
20120106033235,201201060429,594780
|
15
6
|
20120106035346,201201060446,594786
|
16
7
|
20120106041426,201201060445,594784
|
17
8
|
20120106043025,201201060515,594788
|
18
|
-
20120106045326,201201060535,594791
|
19
9
|
20120106045326,201201060536,594796
|
20
|
-
20120106052714,201201060544,594797
|
21
10
|
20120106052714,201201060551,594802
|
22
|
-
20120106070243,201201060721,594803
|
23
|
-
20120106070243,201201060806,594820
|
24
|
-
20120106070243,201201060807,594822
|
25
|
-
20120106070243,201201060910,594831
|
26
11
|
20120106070243,201201060928,594834
|
27
12
|
20120106073142,201201060757,594815
|
28
|
-
20120106073757,201201060749,594810
|
29
|
-
20120106073757,201201060911,594832
|
30
|
-
20120106073757,201201060928,594833
|
31
|
-
20120106073757,201201061022,594862
|
32
13
|
20120106073757,201201061131,594896
|
33
14
|
20120106084347,201201060850,594828
|
34
|
-
20120106084720,201201060909,594829
|
35
15
|
20120106084720,201201060910,594830
|
36
|
-
20120106085558,201201060949,594846
|
37
|
-
20120106085558,201201061033,594864
|
38
16
|
20120106085558,201201061228,594961
|
39
17
|
20120106091726,201201060942,594839
|
40
18
|
20120106095129,201201061010,594853
|
data/spec/data/datify_input.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
ChangeDTTM|AuditEventID|AuditEventCode|AuditEventTypeID|AuditEventName|AuditEventDesc|AuditByID
|
2
|
+
12/31/2011 3:41:54PM|57,314,552.00|LOGGEDOUT|30|Logged out|Logged out|4,426.00
|
3
|
+
1/8/2012 9:21:23AM|57,486,988.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
4
|
+
1/8/2012 9:21:24AM|57,486,989.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
5
|
+
2/19/2012 6:08:59PM|58,597,521.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
6
|
+
2/19/2012 7:02:50PM|58,597,816.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
7
|
+
2/19/2012 7:02:50PM|58,597,817.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
8
|
+
2/4/2012 9:29:20AM|58,201,259.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
9
|
+
2/4/2012 12:19:30AM|58,196,973.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
10
|
+
2/4/2012 12:20:53AM|58,196,975.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
11
|
+
2/3/2012 10:44:25PM|58,196,507.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
12
|
+
1/20/2012 8:01:07PM|57,833,496.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
13
|
+
1/22/2012 4:20:48AM|57,845,095.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
14
|
+
1/22/2012 4:20:49AM|57,845,096.00|LOGGEDOUT|30|Logged out|Logged out|4,573.00
|
15
|
+
1/14/2012 7:50:40AM|57,661,613.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
16
|
+
2/26/2012 7:48:07AM|58,776,243.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
17
|
+
2/23/2012 7:47:39AM|58,699,535.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
18
|
+
2/23/2012 7:39:56AM|58,699,387.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
19
|
+
2/23/2012 7:39:56AM|58,699,388.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
20
|
+
2/15/2012 3:15:06PM|58,502,037.00|LOGGEDOUT|30|Logged out|Logged out|5,019.00
|
data/spec/data/recipe.txt
CHANGED
data/spec/lib/csvify_spec.rb
CHANGED
data/spec/lib/datify_spec.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# spec for data translation functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "csvification" do
|
6
|
+
it "should convert pipes to standard commas" do
|
7
|
+
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
8
|
+
tmp = Tempfile.new('csvify')
|
9
|
+
results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
|
10
|
+
output = File.read(tmp)
|
11
|
+
tmp.unlink
|
12
|
+
output.lines.count.should == 5
|
13
|
+
results[:input_count].should == 6
|
14
|
+
results[:output_count].should == 5
|
15
|
+
end
|
16
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-04-23 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2157087600 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2157087600
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: guard-rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2157087060 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.7.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2157087060
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ruby_gntp
|
38
|
-
requirement: &
|
38
|
+
requirement: &2157086580 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 0.3.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2157086580
|
47
47
|
description: Data file crunching
|
48
48
|
email:
|
49
49
|
- jmay@pobox.com
|
@@ -79,6 +79,7 @@ files:
|
|
79
79
|
- spec/data/broken_psv.txt
|
80
80
|
- spec/data/concat_result.txt
|
81
81
|
- spec/data/cooking_result.csv
|
82
|
+
- spec/data/datify_input.txt
|
82
83
|
- spec/data/events.csv
|
83
84
|
- spec/data/events_reduced.csv
|
84
85
|
- spec/data/inlined_headers.csv
|
@@ -96,6 +97,7 @@ files:
|
|
96
97
|
- spec/lib/concat_spec.rb
|
97
98
|
- spec/lib/cook_spec.rb
|
98
99
|
- spec/lib/csvify_spec.rb
|
100
|
+
- spec/lib/datify_spec.rb
|
99
101
|
- spec/lib/gsub_spec.rb
|
100
102
|
- spec/lib/maxrow_spec.rb
|
101
103
|
- spec/lib/mender_spec.rb
|
@@ -133,6 +135,7 @@ test_files:
|
|
133
135
|
- spec/data/broken_psv.txt
|
134
136
|
- spec/data/concat_result.txt
|
135
137
|
- spec/data/cooking_result.csv
|
138
|
+
- spec/data/datify_input.txt
|
136
139
|
- spec/data/events.csv
|
137
140
|
- spec/data/events_reduced.csv
|
138
141
|
- spec/data/inlined_headers.csv
|
@@ -150,6 +153,7 @@ test_files:
|
|
150
153
|
- spec/lib/concat_spec.rb
|
151
154
|
- spec/lib/cook_spec.rb
|
152
155
|
- spec/lib/csvify_spec.rb
|
156
|
+
- spec/lib/datify_spec.rb
|
153
157
|
- spec/lib/gsub_spec.rb
|
154
158
|
- spec/lib/maxrow_spec.rb
|
155
159
|
- spec/lib/mender_spec.rb
|