cascading.jruby 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/HACKING.md +1 -1
- data/History.txt +6 -0
- data/README.md +1 -1
- data/ivy.xml +3 -3
- data/lib/cascading/aggregations.rb +14 -13
- data/lib/cascading/base.rb +27 -9
- data/lib/cascading/cascade.rb +12 -3
- data/lib/cascading/cascading.rb +35 -44
- data/lib/cascading/cascading_exception.rb +5 -5
- data/lib/cascading/flow.rb +23 -32
- data/lib/cascading/mode.rb +78 -0
- data/lib/cascading/operations.rb +10 -4
- data/lib/cascading/scope.rb +8 -2
- data/lib/cascading/sub_assembly.rb +6 -6
- data/lib/cascading/tap.rb +81 -0
- data/lib/cascading.rb +3 -1
- data/samples/branch.rb +2 -2
- data/samples/copy.rb +2 -2
- data/samples/group_by.rb +5 -5
- data/samples/join.rb +2 -2
- data/samples/logwordcount.rb +3 -4
- data/samples/project.rb +2 -2
- data/samples/rename.rb +2 -2
- data/samples/scorenames.rb +2 -2
- data/samples/splitter.rb +2 -2
- data/samples/sub_assembly.rb +2 -2
- data/samples/union.rb +2 -2
- data/spec/cascading_spec.rb +17 -17
- data/spec/spec_util.rb +9 -9
- data/tags +92 -41
- data/tasks/ant.rake +6 -1
- data/test/test_assembly.rb +14 -7
- data/test/test_cascade.rb +55 -0
- data/test/test_cascading.rb +12 -15
- data/test/test_flow.rb +53 -36
- data/test/test_local_execution.rb +7 -7
- data/test/test_operations.rb +61 -0
- metadata +4 -2
data/samples/branch.rb
CHANGED
@@ -4,7 +4,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
4
4
|
|
5
5
|
require 'cascading'
|
6
6
|
|
7
|
-
cascade 'branch' do
|
7
|
+
cascade 'branch', :mode => :local do
|
8
8
|
flow 'branch' do
|
9
9
|
source 'input', tap('samples/data/data2.txt')
|
10
10
|
|
@@ -27,4 +27,4 @@ cascade 'branch' do
|
|
27
27
|
sink 'branch1', tap('output/branch1', :sink_mode => :replace)
|
28
28
|
sink 'branch2', tap('output/branch2', :sink_mode => :replace)
|
29
29
|
end
|
30
|
-
end.complete
|
30
|
+
end.complete
|
data/samples/copy.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'copy' do
|
6
|
+
cascade 'copy', :mode => :local do
|
7
7
|
flow 'copy' do
|
8
8
|
# You don't have to curl and cache inputs: tap can fetch via HTTP
|
9
9
|
#source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')
|
@@ -17,4 +17,4 @@ cascade 'copy' do
|
|
17
17
|
|
18
18
|
sink 'input', tap('output/copy', :sink_mode => :replace)
|
19
19
|
end
|
20
|
-
end.complete
|
20
|
+
end.complete
|
data/samples/group_by.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'group_by' do
|
6
|
+
cascade 'group_by', :mode => :local do
|
7
7
|
flow 'group_by' do
|
8
8
|
source 'input', tap('samples/data/data_group_by.txt')
|
9
9
|
|
@@ -53,9 +53,9 @@ cascade 'group_by' do
|
|
53
53
|
sink 'empty_group_by', tap('output/empty_group_by', :sink_mode => :replace)
|
54
54
|
sink 'blockless_group_by', tap('output/blockless_group_by', :sink_mode => :replace)
|
55
55
|
sink 'aggregate_by', tap('output/aggregate_by', :sink_mode => :replace)
|
56
|
-
sink 'empty_aggregate_by', tap('output/empty_aggregate_by', :sink_mode => :replace)
|
57
|
-
sink 'blockless_aggregate_by', tap('output/blockless_aggregate_by', :sink_mode => :replace)
|
56
|
+
#sink 'empty_aggregate_by', tap('output/empty_aggregate_by', :sink_mode => :replace)
|
57
|
+
#sink 'blockless_aggregate_by', tap('output/blockless_aggregate_by', :sink_mode => :replace)
|
58
58
|
sink 'unique', tap('output/unique', :sink_mode => :replace)
|
59
59
|
end
|
60
|
-
#end.draw(ARGV[0]
|
61
|
-
end.complete
|
60
|
+
#end.draw(ARGV[0])
|
61
|
+
end.complete
|
data/samples/join.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'join' do
|
6
|
+
cascade 'join', :mode => :local do
|
7
7
|
flow 'join' do
|
8
8
|
source 'input1', tap('samples/data/data_join1.txt')
|
9
9
|
source 'input2', tap('samples/data/data_join2.txt')
|
@@ -28,4 +28,4 @@ cascade 'join' do
|
|
28
28
|
|
29
29
|
sink 'join', tap('output/join', :sink_mode => :replace)
|
30
30
|
end
|
31
|
-
end.complete
|
31
|
+
end.complete
|
data/samples/logwordcount.rb
CHANGED
@@ -3,15 +3,14 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'logwordcount' do
|
6
|
+
cascade 'logwordcount', :mode => :local do
|
7
7
|
flow 'logwordcount' do
|
8
8
|
# This works just as well, but will get you blocked by Project Gutenberg
|
9
9
|
#source 'input', tap('http://www.gutenberg.org/files/20417/20417-8.txt')
|
10
10
|
source 'input', tap('samples/data/gutenberg/the_outline_of_science_vol_1')
|
11
11
|
|
12
12
|
assembly 'input' do
|
13
|
-
|
14
|
-
each 'line', :function => regex_split_generator('word', :pattern => /[.,]*\s+/)
|
13
|
+
split_rows 'line', 'word', :pattern => /[.,]*\s+/, :output => 'word'
|
15
14
|
group_by 'word' do
|
16
15
|
count
|
17
16
|
end
|
@@ -20,4 +19,4 @@ cascade 'logwordcount' do
|
|
20
19
|
|
21
20
|
sink 'input', tap('output/logwordcount', :sink_mode => :replace)
|
22
21
|
end
|
23
|
-
end.complete
|
22
|
+
end.complete
|
data/samples/project.rb
CHANGED
@@ -5,7 +5,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
5
5
|
|
6
6
|
require 'cascading'
|
7
7
|
|
8
|
-
cascade 'project' do
|
8
|
+
cascade 'project', :mode => :local do
|
9
9
|
flow 'project' do
|
10
10
|
source 'input', tap('samples/data/data2.txt')
|
11
11
|
|
@@ -20,4 +20,4 @@ cascade 'project' do
|
|
20
20
|
|
21
21
|
sink 'input', tap('output/project', :sink_mode => :replace)
|
22
22
|
end
|
23
|
-
end.complete
|
23
|
+
end.complete
|
data/samples/rename.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'rename' do
|
6
|
+
cascade 'rename', :mode => :local do
|
7
7
|
flow 'rename' do
|
8
8
|
source 'input', tap('samples/data/data2.txt')
|
9
9
|
|
@@ -17,4 +17,4 @@ cascade 'rename' do
|
|
17
17
|
|
18
18
|
sink 'input', tap('output/rename', :sink_mode => :replace)
|
19
19
|
end
|
20
|
-
end.complete
|
20
|
+
end.complete
|
data/samples/scorenames.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'scorenames' do
|
6
|
+
cascade 'scorenames', :mode => :local do
|
7
7
|
flow 'scorenames' do
|
8
8
|
# You don't have to curl and cache inputs: tap can fetch via HTTP
|
9
9
|
#source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')
|
@@ -17,4 +17,4 @@ cascade 'scorenames' do
|
|
17
17
|
|
18
18
|
sink 'input', tap('output/scorenames', :sink_mode => :replace)
|
19
19
|
end
|
20
|
-
end.complete
|
20
|
+
end.complete
|
data/samples/splitter.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'splitter' do
|
6
|
+
cascade 'splitter', :mode => :local do
|
7
7
|
flow 'splitter' do
|
8
8
|
source 'input', tap('samples/data/data2.txt')
|
9
9
|
|
@@ -16,4 +16,4 @@ cascade 'splitter' do
|
|
16
16
|
|
17
17
|
sink 'input', tap('output/splitter', :sink_mode => :replace)
|
18
18
|
end
|
19
|
-
end.complete
|
19
|
+
end.complete
|
data/samples/sub_assembly.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'sub_assembly' do
|
6
|
+
cascade 'sub_assembly', :mode => :local do
|
7
7
|
flow 'sub_assembly' do
|
8
8
|
source 'input', tap('samples/data/data2.txt')
|
9
9
|
|
@@ -27,4 +27,4 @@ cascade 'sub_assembly' do
|
|
27
27
|
|
28
28
|
sink 'input', tap('output/sub_assembly', :sink_mode => :replace)
|
29
29
|
end
|
30
|
-
end.complete
|
30
|
+
end.complete
|
data/samples/union.rb
CHANGED
@@ -3,7 +3,7 @@ $: << File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'cascading'
|
5
5
|
|
6
|
-
cascade 'union' do
|
6
|
+
cascade 'union', :mode => :local do
|
7
7
|
flow 'union' do
|
8
8
|
# You don't have to curl and cache inputs: tap can fetch via HTTP
|
9
9
|
#source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')
|
@@ -33,4 +33,4 @@ cascade 'union' do
|
|
33
33
|
|
34
34
|
sink 'union', tap('output/union', :sink_mode => :replace)
|
35
35
|
end
|
36
|
-
end.complete
|
36
|
+
end.complete
|
data/spec/cascading_spec.rb
CHANGED
@@ -33,56 +33,56 @@ context Cascading do
|
|
33
33
|
end
|
34
34
|
|
35
35
|
it 'should find branches to sink' do
|
36
|
-
cascade 'branched_pass' do
|
36
|
+
cascade 'branched_pass', :mode => :local do
|
37
37
|
flow 'branched_pass' do
|
38
|
-
source 'input', tap('spec/resource/test_input.txt', :
|
38
|
+
source 'input', tap('spec/resource/test_input.txt', :scheme => text_line_scheme)
|
39
39
|
assembly 'input' do
|
40
40
|
branch 'branched_input' do
|
41
41
|
project 'line'
|
42
42
|
end
|
43
43
|
end
|
44
|
-
sink 'branched_input', tap("#{OUTPUT_DIR}/branched_pass_out", :
|
44
|
+
sink 'branched_input', tap("#{OUTPUT_DIR}/branched_pass_out", :sink_mode => :replace)
|
45
45
|
end
|
46
46
|
end.complete
|
47
47
|
|
48
|
-
ilc = `wc -l spec/resource/test_input.txt`.split(/\s+/).first
|
49
|
-
olc = `wc -l #{OUTPUT_DIR}/branched_pass_out
|
50
|
-
|
48
|
+
ilc = `wc -l spec/resource/test_input.txt`.strip.split(/\s+/).first
|
49
|
+
olc = `wc -l #{OUTPUT_DIR}/branched_pass_out`.strip.split(/\s+/).first
|
50
|
+
olc.should == ilc
|
51
51
|
end
|
52
52
|
|
53
53
|
it 'should create an isolated namespace per cascade' do
|
54
|
-
cascade 'double' do
|
54
|
+
cascade 'double', :mode => :local do
|
55
55
|
flow 'double' do
|
56
|
-
source 'input', tap('spec/resource/test_input.txt', :
|
56
|
+
source 'input', tap('spec/resource/test_input.txt', :scheme => text_line_scheme)
|
57
57
|
assembly 'input' do # Dup name
|
58
58
|
insert 'doubled' => expr('line:string + "," + line:string')
|
59
59
|
project 'doubled'
|
60
60
|
end
|
61
|
-
sink 'input', tap("#{OUTPUT_DIR}/double_out", :
|
61
|
+
sink 'input', tap("#{OUTPUT_DIR}/double_out", :sink_mode => :replace)
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
65
|
-
cascade 'pass' do
|
65
|
+
cascade 'pass', :mode => :local do
|
66
66
|
flow 'pass' do
|
67
|
-
source 'input', tap('spec/resource/test_input.txt', :
|
67
|
+
source 'input', tap('spec/resource/test_input.txt', :scheme => text_line_scheme)
|
68
68
|
assembly 'input' do # Dup name
|
69
69
|
project 'line'
|
70
70
|
end
|
71
|
-
sink 'input', tap("#{OUTPUT_DIR}/pass_out", :
|
71
|
+
sink 'input', tap("#{OUTPUT_DIR}/pass_out", :sink_mode => :replace)
|
72
72
|
end
|
73
73
|
end
|
74
74
|
|
75
75
|
Cascade.get('double').complete
|
76
76
|
Cascade.get('pass').complete
|
77
|
-
diff = `diff #{OUTPUT_DIR}/double_out
|
77
|
+
diff = `diff #{OUTPUT_DIR}/double_out #{OUTPUT_DIR}/pass_out`
|
78
78
|
diff.should_not be_empty
|
79
79
|
end
|
80
80
|
|
81
81
|
it 'should support joins in branches' do
|
82
|
-
cascade 'branch_join' do
|
82
|
+
cascade 'branch_join', :mode => :local do
|
83
83
|
flow 'branch_join' do
|
84
|
-
source 'left', tap('spec/resource/join_input.txt', :
|
85
|
-
source 'right', tap('spec/resource/join_input.txt', :
|
84
|
+
source 'left', tap('spec/resource/join_input.txt', :scheme => text_line_scheme)
|
85
|
+
source 'right', tap('spec/resource/join_input.txt', :scheme => text_line_scheme)
|
86
86
|
|
87
87
|
assembly 'left' do
|
88
88
|
split 'line', ['x', 'y', 'z'], :pattern => /,/
|
@@ -98,7 +98,7 @@ context Cascading do
|
|
98
98
|
end
|
99
99
|
end
|
100
100
|
|
101
|
-
sink 'branch_join', tap("#{OUTPUT_DIR}/branch_join_out.txt", :
|
101
|
+
sink 'branch_join', tap("#{OUTPUT_DIR}/branch_join_out.txt", :sink_mode => :replace)
|
102
102
|
end
|
103
103
|
end.complete
|
104
104
|
end
|
data/spec/spec_util.rb
CHANGED
@@ -23,28 +23,28 @@ module Cascading
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def test_flow(&block)
|
26
|
-
cascade = cascade 'test_app' do
|
26
|
+
cascade = cascade 'test_app', :mode => :local do
|
27
27
|
flow 'test', &block
|
28
28
|
end
|
29
|
-
cascade.complete
|
29
|
+
cascade.complete
|
30
30
|
end
|
31
31
|
|
32
32
|
def test_assembly(params = {}, &block)
|
33
33
|
branches = params[:branches] || []
|
34
34
|
|
35
35
|
test_flow do
|
36
|
-
source 'input', tap('spec/resource/test_input.txt', :
|
36
|
+
source 'input', tap('spec/resource/test_input.txt', :scheme => text_line_scheme)
|
37
37
|
|
38
38
|
# Default Fields defined by TextLineScheme
|
39
39
|
check_scope :source => 'input', :values_fields => ['offset', 'line']
|
40
40
|
|
41
41
|
assembly 'input', &block
|
42
42
|
|
43
|
-
sink 'input', tap("#{OUTPUT_DIR}/out.txt", :
|
43
|
+
sink 'input', tap("#{OUTPUT_DIR}/out.txt", :sink_mode => :replace)
|
44
44
|
|
45
45
|
# Branches must be sunk so that they (and their assertions) will be run
|
46
46
|
branches.each do |branch|
|
47
|
-
sink branch, tap("#{OUTPUT_DIR}/#{branch}_out.txt", :
|
47
|
+
sink branch, tap("#{OUTPUT_DIR}/#{branch}_out.txt", :sink_mode => :replace)
|
48
48
|
end
|
49
49
|
end
|
50
50
|
end
|
@@ -54,8 +54,8 @@ def test_join_assembly(params = {}, &block)
|
|
54
54
|
post_join_block = params[:post_join_block]
|
55
55
|
|
56
56
|
test_flow do
|
57
|
-
source 'left', tap('spec/resource/join_input.txt', :
|
58
|
-
source 'right', tap('spec/resource/join_input.txt', :
|
57
|
+
source 'left', tap('spec/resource/join_input.txt', :scheme => text_line_scheme)
|
58
|
+
source 'right', tap('spec/resource/join_input.txt', :scheme => text_line_scheme)
|
59
59
|
|
60
60
|
# Default Fields defined by TextLineScheme
|
61
61
|
check_scope :source => 'left', :values_fields => ['offset', 'line']
|
@@ -82,11 +82,11 @@ def test_join_assembly(params = {}, &block)
|
|
82
82
|
instance_eval &post_join_block if post_join_block
|
83
83
|
end
|
84
84
|
|
85
|
-
sink 'join', tap("#{OUTPUT_DIR}/join_out.txt", :
|
85
|
+
sink 'join', tap("#{OUTPUT_DIR}/join_out.txt", :sink_mode => :replace)
|
86
86
|
|
87
87
|
# Branches must be sunk so that they (and their assertions) will be run
|
88
88
|
branches.each do |branch|
|
89
|
-
sink branch, tap("#{OUTPUT_DIR}/#{branch}_out.txt", :
|
89
|
+
sink branch, tap("#{OUTPUT_DIR}/#{branch}_out.txt", :sink_mode => :replace)
|
90
90
|
end
|
91
91
|
end
|
92
92
|
end
|