cascading.jruby 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +12 -0
- data/HACKING.md +12 -4
- data/History.txt +8 -0
- data/README.md +2 -2
- data/Rakefile +3 -2
- data/TODO +2 -2
- data/ivy.xml +25 -0
- data/ivysettings.xml +7 -0
- data/lib/cascading.rb +1 -1
- data/lib/cascading/aggregations.rb +190 -0
- data/lib/cascading/assembly.rb +138 -194
- data/lib/cascading/cascading.rb +8 -3
- data/lib/cascading/cascading_exception.rb +12 -10
- data/lib/cascading/flow.rb +3 -2
- data/lib/cascading/operations.rb +1 -23
- data/lib/cascading/scope.rb +27 -24
- data/lib/cascading/sub_assembly.rb +93 -0
- data/samples/copy.rb +3 -1
- data/samples/data/data_group_by.txt +7 -0
- data/samples/data/genealogy/names/dist.all.last +88799 -0
- data/samples/data/gutenberg/the_outline_of_science_vol_1 +12761 -0
- data/samples/group_by.rb +61 -0
- data/samples/logwordcount.rb +3 -1
- data/samples/scorenames.rb +2 -1
- data/samples/sub_assembly.rb +30 -0
- data/samples/union.rb +3 -1
- data/spec/scope_spec.rb +47 -66
- data/spec/spec_util.rb +4 -4
- data/tags +69 -44
- data/tasks/ant.rake +9 -5
- data/tasks/samples.rake +6 -0
- data/tasks/test.rake +1 -1
- data/test/mock_assemblies.rb +55 -0
- data/test/test_aggregations.rb +443 -0
- data/test/test_assembly.rb +437 -196
- data/test/test_exceptions.rb +3 -3
- data/test/test_local_execution.rb +168 -0
- data/test/test_operations.rb +0 -7
- metadata +23 -2
data/test/test_assembly.rb
CHANGED
@@ -1,41 +1,29 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'cascading'
|
3
|
-
|
4
|
-
def compare_with_references(test_name)
|
5
|
-
result = compare_files("test/references/#{test_name}.txt", "output/#{test_name}/part-00000")
|
6
|
-
assert_nil(result)
|
7
|
-
end
|
8
|
-
|
9
|
-
# Convenience for basic assembly tests; not valid for applications
|
10
|
-
def assembly(name, &block)
|
11
|
-
assembly = Assembly.new(name, nil)
|
12
|
-
assembly.instance_eval(&block)
|
13
|
-
assembly
|
14
|
-
end
|
3
|
+
require 'test/mock_assemblies'
|
15
4
|
|
16
5
|
class TC_Assembly < Test::Unit::TestCase
|
17
|
-
|
18
|
-
assembly = nil
|
19
|
-
flow 'test' do
|
20
|
-
source 'test', tap('test/data/data1.txt')
|
21
|
-
assembly = assembly 'test', &block
|
22
|
-
end
|
23
|
-
assembly
|
24
|
-
end
|
6
|
+
include MockAssemblies
|
25
7
|
|
26
8
|
def test_create_assembly_simple
|
27
|
-
assembly =
|
28
|
-
|
9
|
+
assembly = nil
|
10
|
+
flow 'test_create_assembly_simple' do
|
11
|
+
assembly = assembly 'assembly1' do
|
12
|
+
# Empty assembly
|
13
|
+
end
|
29
14
|
end
|
30
15
|
|
31
16
|
assert_not_nil assembly
|
17
|
+
assert_equal assembly.name, 'assembly1'
|
18
|
+
assert_equal 0, assembly.children.size
|
19
|
+
|
32
20
|
pipe = assembly.tail_pipe
|
33
|
-
|
21
|
+
assert_equal Java::CascadingPipe::Pipe, pipe.class
|
34
22
|
end
|
35
23
|
|
36
24
|
def test_each_identity
|
37
25
|
assembly = mock_assembly do
|
38
|
-
each 'offset', :
|
26
|
+
each 'offset', :function => identity
|
39
27
|
end
|
40
28
|
|
41
29
|
flow = assembly.parent
|
@@ -47,148 +35,424 @@ class TC_Assembly < Test::Unit::TestCase
|
|
47
35
|
def test_create_each
|
48
36
|
# You can apply an Each to 0 fields
|
49
37
|
assembly = mock_assembly do
|
50
|
-
each(:
|
38
|
+
each(:function => identity)
|
51
39
|
end
|
52
|
-
|
40
|
+
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
|
53
41
|
|
54
42
|
# In which case, it has empty argument and output selectors
|
55
43
|
assert_equal 0, assembly.tail_pipe.argument_selector.size
|
56
44
|
assert_equal 0, assembly.tail_pipe.output_selector.size
|
57
45
|
|
58
46
|
assembly = mock_assembly do
|
59
|
-
each
|
60
|
-
:filter => Java::CascadingOperation::Identity.new(fields('offset_copy')))
|
47
|
+
each 'offset', :output => 'offset_copy', :function => Java::CascadingOperation::Identity.new(fields('offset_copy'))
|
61
48
|
end
|
62
49
|
pipe = assembly.tail_pipe
|
63
50
|
|
64
|
-
|
51
|
+
assert_equal Java::CascadingPipe::Each, pipe.class
|
65
52
|
|
66
|
-
assert_equal 'offset', pipe.argument_selector.
|
67
|
-
assert_equal 'offset_copy', pipe.output_selector.
|
53
|
+
assert_equal ['offset'], pipe.argument_selector.to_a
|
54
|
+
assert_equal ['offset_copy'], pipe.output_selector.to_a
|
68
55
|
end
|
69
56
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
assert_raise CascadingException do
|
57
|
+
def test_every_cannot_follow_tap
|
58
|
+
# Assembly#count is no longer defined; instead, it has moved to
|
59
|
+
# Aggregations#count
|
60
|
+
assert_raise NameError do
|
75
61
|
assembly = mock_assembly do
|
76
|
-
|
62
|
+
count
|
77
63
|
end
|
78
64
|
pipe = assembly.tail_pipe
|
79
|
-
assert
|
65
|
+
assert Java::CascadingPipe::Every, pipe.class
|
80
66
|
end
|
67
|
+
end
|
81
68
|
|
82
|
-
|
69
|
+
def test_create_every
|
83
70
|
assembly = mock_assembly do
|
84
|
-
|
71
|
+
group_by 'line' do
|
72
|
+
count_aggregator = Java::CascadingOperationAggregator::Count.new(fields('count'))
|
73
|
+
every 'line', :aggregator => count_aggregator, :output => 'count'
|
74
|
+
end
|
85
75
|
end
|
86
|
-
assert
|
87
|
-
|
76
|
+
assert Java::CascadingPipe::Every, assembly.tail_pipe.class
|
77
|
+
assert_equal ['line'], assembly.tail_pipe.argument_selector.to_a
|
78
|
+
assert_equal ['count'], assembly.tail_pipe.output_selector.to_a
|
88
79
|
|
89
|
-
assert_raise CascadingException do
|
90
80
|
assembly = mock_assembly do
|
91
|
-
|
81
|
+
group_by 'line' do
|
82
|
+
count
|
83
|
+
end
|
92
84
|
end
|
93
|
-
assert
|
94
|
-
assert_equal "Field1", assembly.tail_pipe.argument_selector.get(0)
|
95
|
-
end
|
85
|
+
assert Java::CascadingPipe::Every, assembly.tail_pipe.class
|
96
86
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
assert_equal 'line', assembly.tail_pipe.argument_selector.get(0)
|
103
|
-
assert_equal 'line_count', assembly.tail_pipe.output_selector.get(0)
|
104
|
-
end
|
87
|
+
# NOTE: this is not valid when we optimize using CountBy
|
88
|
+
#assert_equal last_grouping_fields, assembly.tail_pipe.argument_selector
|
89
|
+
assert_equal fields('count'), assembly.tail_pipe.argument_selector
|
90
|
+
|
91
|
+
assert_equal all_fields, assembly.tail_pipe.output_selector
|
105
92
|
end
|
106
93
|
|
107
94
|
def test_create_group_by
|
108
95
|
assembly = mock_assembly do
|
109
|
-
group_by
|
96
|
+
group_by 'line'
|
110
97
|
end
|
111
98
|
|
112
|
-
|
99
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
113
100
|
grouping_fields = assembly.tail_pipe.key_selectors['test']
|
114
|
-
assert_equal 'line', grouping_fields.
|
101
|
+
assert_equal ['line'], grouping_fields.to_a
|
102
|
+
|
103
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
104
|
+
assert_equal ['line'], assembly.scope.grouping_fields.to_a
|
115
105
|
|
116
106
|
assembly = mock_assembly do
|
117
|
-
group_by
|
107
|
+
group_by 'offset'
|
118
108
|
end
|
119
109
|
|
120
|
-
|
110
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
121
111
|
grouping_fields = assembly.tail_pipe.key_selectors['test']
|
122
|
-
assert_equal '
|
112
|
+
assert_equal ['offset'], grouping_fields.to_a
|
113
|
+
|
114
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
115
|
+
assert_equal ['offset'], assembly.scope.grouping_fields.to_a
|
123
116
|
end
|
124
117
|
|
125
118
|
def test_create_group_by_many_fields
|
126
119
|
assembly = mock_assembly do
|
127
|
-
group_by
|
120
|
+
group_by 'offset', 'line'
|
128
121
|
end
|
129
122
|
|
130
|
-
|
123
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
131
124
|
grouping_fields = assembly.tail_pipe.key_selectors['test']
|
132
|
-
assert_equal 'offset', grouping_fields.
|
133
|
-
|
125
|
+
assert_equal ['offset', 'line'], grouping_fields.to_a
|
126
|
+
|
127
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
128
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
134
129
|
end
|
135
130
|
|
136
131
|
def test_create_group_by_with_sort
|
137
132
|
assembly = mock_assembly do
|
138
|
-
group_by
|
133
|
+
group_by 'offset', 'line', :sort_by => 'line'
|
139
134
|
end
|
140
135
|
|
141
|
-
|
136
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
142
137
|
grouping_fields = assembly.tail_pipe.key_selectors['test']
|
143
138
|
sorting_fields = assembly.tail_pipe.sorting_selectors['test']
|
144
139
|
|
145
|
-
|
146
|
-
|
140
|
+
assert assembly.tail_pipe.is_sorted
|
141
|
+
assert !assembly.tail_pipe.is_sort_reversed
|
147
142
|
|
148
|
-
assert_equal 'offset', grouping_fields.
|
149
|
-
assert_equal 'line',
|
150
|
-
|
151
|
-
|
152
|
-
assert_equal 'line',
|
143
|
+
assert_equal ['offset', 'line'], grouping_fields.to_a
|
144
|
+
assert_equal ['line'], sorting_fields.to_a
|
145
|
+
|
146
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
147
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
153
148
|
end
|
154
149
|
|
155
150
|
def test_create_group_by_with_sort_reverse
|
156
151
|
assembly = mock_assembly do
|
157
|
-
group_by
|
152
|
+
group_by 'offset', 'line', :sort_by => 'line', :reverse => true
|
158
153
|
end
|
159
154
|
|
160
|
-
|
155
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
161
156
|
grouping_fields = assembly.tail_pipe.key_selectors['test']
|
162
157
|
sorting_fields = assembly.tail_pipe.sorting_selectors['test']
|
163
158
|
|
164
|
-
|
165
|
-
|
159
|
+
assert assembly.tail_pipe.is_sorted
|
160
|
+
assert assembly.tail_pipe.is_sort_reversed
|
161
|
+
|
162
|
+
assert_equal ['offset', 'line'], grouping_fields.to_a
|
163
|
+
assert_equal ['line'], sorting_fields.to_a
|
166
164
|
|
167
|
-
assert_equal 'offset',
|
168
|
-
assert_equal 'line', grouping_fields.
|
169
|
-
assert assembly.tail_pipe.isSorted()
|
170
|
-
assert assembly.tail_pipe.isSortReversed()
|
171
|
-
assert_equal 'line', sorting_fields.get(0)
|
165
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
166
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
172
167
|
end
|
173
168
|
|
174
169
|
def test_create_group_by_reverse
|
175
170
|
assembly = mock_assembly do
|
176
|
-
group_by
|
171
|
+
group_by 'offset', 'line', :reverse => true
|
177
172
|
end
|
178
173
|
|
179
|
-
|
174
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
180
175
|
grouping_fields = assembly.tail_pipe.key_selectors['test']
|
181
176
|
sorting_fields = assembly.tail_pipe.sorting_selectors['test']
|
182
177
|
|
183
|
-
|
184
|
-
|
178
|
+
assert !assembly.tail_pipe.is_sorted
|
179
|
+
assert assembly.tail_pipe.is_sort_reversed
|
180
|
+
|
181
|
+
assert_equal ['offset', 'line'], grouping_fields.to_a
|
182
|
+
assert_nil sorting_fields
|
183
|
+
|
184
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
185
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
186
|
+
end
|
187
|
+
|
188
|
+
def test_create_union
|
189
|
+
assembly = mock_branched_assembly do
|
190
|
+
union 'test1', 'test2', :on => 'line'
|
191
|
+
end
|
192
|
+
|
193
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
194
|
+
|
195
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
196
|
+
assert_equal ['line'], left_grouping_fields.to_a
|
197
|
+
|
198
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
199
|
+
assert_equal ['line'], right_grouping_fields.to_a
|
200
|
+
|
201
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
202
|
+
assert_equal ['line'], assembly.scope.grouping_fields.to_a
|
203
|
+
|
204
|
+
assembly = mock_branched_assembly do
|
205
|
+
union 'test1', 'test2', :on => 'offset'
|
206
|
+
end
|
207
|
+
|
208
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
209
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
210
|
+
assert_equal ['offset'], left_grouping_fields.to_a
|
211
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
212
|
+
assert_equal ['offset'], right_grouping_fields.to_a
|
213
|
+
|
214
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
215
|
+
assert_equal ['offset'], assembly.scope.grouping_fields.to_a
|
216
|
+
|
217
|
+
assembly = mock_branched_assembly do
|
218
|
+
union 'test1', 'test2'
|
219
|
+
end
|
220
|
+
|
221
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
222
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
223
|
+
assert_equal ['offset'], left_grouping_fields.to_a
|
224
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
225
|
+
assert_equal ['offset'], right_grouping_fields.to_a
|
226
|
+
|
227
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
228
|
+
assert_equal ['offset'], assembly.scope.grouping_fields.to_a
|
229
|
+
end
|
230
|
+
|
231
|
+
def test_create_union_many_fields
|
232
|
+
assembly = mock_branched_assembly do
|
233
|
+
union 'test1', 'test2', :on => ['offset', 'line']
|
234
|
+
end
|
235
|
+
|
236
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
237
|
+
|
238
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
239
|
+
assert_equal ['offset', 'line'], left_grouping_fields.to_a
|
240
|
+
|
241
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
242
|
+
assert_equal ['offset', 'line'], right_grouping_fields.to_a
|
243
|
+
|
244
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
245
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
246
|
+
end
|
247
|
+
|
248
|
+
def test_create_union_with_sort
|
249
|
+
assembly = mock_branched_assembly do
|
250
|
+
union 'test1', 'test2', :on => ['offset', 'line'], :sort_by => 'line'
|
251
|
+
end
|
252
|
+
|
253
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
254
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
255
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
256
|
+
left_sorting_fields = assembly.tail_pipe.sorting_selectors['test1']
|
257
|
+
right_sorting_fields = assembly.tail_pipe.sorting_selectors['test2']
|
258
|
+
|
259
|
+
assert assembly.tail_pipe.is_sorted
|
260
|
+
assert !assembly.tail_pipe.is_sort_reversed
|
261
|
+
|
262
|
+
assert_equal ['offset', 'line'], left_grouping_fields.to_a
|
263
|
+
assert_equal ['offset', 'line'], right_grouping_fields.to_a
|
264
|
+
assert_equal ['line'], left_sorting_fields.to_a
|
265
|
+
assert_equal ['line'], right_sorting_fields.to_a
|
266
|
+
|
267
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
268
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
269
|
+
end
|
270
|
+
|
271
|
+
def test_create_union_with_sort_reverse
|
272
|
+
assembly = mock_branched_assembly do
|
273
|
+
union 'test1', 'test2', :on => ['offset', 'line'], :sort_by => 'line', :reverse => true
|
274
|
+
end
|
275
|
+
|
276
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
277
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
278
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
279
|
+
left_sorting_fields = assembly.tail_pipe.sorting_selectors['test1']
|
280
|
+
right_sorting_fields = assembly.tail_pipe.sorting_selectors['test2']
|
281
|
+
|
282
|
+
assert assembly.tail_pipe.is_sorted
|
283
|
+
assert assembly.tail_pipe.is_sort_reversed
|
284
|
+
|
285
|
+
assert_equal ['offset', 'line'], left_grouping_fields.to_a
|
286
|
+
assert_equal ['offset', 'line'], right_grouping_fields.to_a
|
287
|
+
assert_equal ['line'], left_sorting_fields.to_a
|
288
|
+
assert_equal ['line'], right_sorting_fields.to_a
|
289
|
+
|
290
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
291
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
292
|
+
end
|
293
|
+
|
294
|
+
def test_create_union_reverse
|
295
|
+
assembly = mock_branched_assembly do
|
296
|
+
union 'test1', 'test2', :on => ['offset', 'line'], :reverse => true
|
297
|
+
end
|
298
|
+
|
299
|
+
assert_equal Java::CascadingPipe::GroupBy, assembly.tail_pipe.class
|
300
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
301
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
302
|
+
left_sorting_fields = assembly.tail_pipe.sorting_selectors['test1']
|
303
|
+
right_sorting_fields = assembly.tail_pipe.sorting_selectors['test2']
|
304
|
+
|
305
|
+
assert assembly.tail_pipe.is_sorted # FIXME: Missing constructor in wip-255
|
306
|
+
assert assembly.tail_pipe.is_sort_reversed
|
307
|
+
|
308
|
+
assert_equal ['offset', 'line'], left_grouping_fields.to_a
|
309
|
+
assert_equal ['offset', 'line'], right_grouping_fields.to_a
|
310
|
+
assert_equal ['offset', 'line'], left_sorting_fields.to_a # FIXME: Missing constructor in wip-255
|
311
|
+
assert_equal ['offset', 'line'], right_sorting_fields.to_a # FIXME: Missing constructor in wip-255
|
312
|
+
|
313
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
314
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
315
|
+
end
|
316
|
+
|
317
|
+
def test_union_undefined_inputs
|
318
|
+
assert_raise RuntimeError, "Could not find assembly 'doesnotexist' in union" do
|
319
|
+
flow 'test_union_undefined_inputs' do
|
320
|
+
source 'data1', tap('test/data/data1.txt')
|
321
|
+
|
322
|
+
assembly 'data1' do
|
323
|
+
pass
|
324
|
+
end
|
325
|
+
|
326
|
+
assembly 'union' do
|
327
|
+
union 'doesnotexist', 'data1'
|
328
|
+
end
|
329
|
+
|
330
|
+
sink 'union', tap('output/test_union_undefined_inputs')
|
331
|
+
end
|
332
|
+
end
|
333
|
+
end
|
334
|
+
|
335
|
+
def test_create_join
|
336
|
+
assembly = mock_two_input_assembly do
|
337
|
+
join 'test1', 'test2', :on => 'name'
|
338
|
+
end
|
339
|
+
|
340
|
+
assert_equal Java::CascadingPipe::CoGroup, assembly.tail_pipe.class
|
341
|
+
|
342
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
343
|
+
assert_equal ['name'], left_grouping_fields.to_a
|
344
|
+
|
345
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
346
|
+
assert_equal ['name'], right_grouping_fields.to_a
|
347
|
+
|
348
|
+
assert_equal ['name', 'score1', 'score2', 'id', 'name_', 'id_', 'town'], assembly.scope.values_fields.to_a
|
349
|
+
assert_equal ['name', 'name_'], assembly.scope.grouping_fields.to_a
|
350
|
+
|
351
|
+
assembly = mock_two_input_assembly do
|
352
|
+
join 'test1', 'test2', :on => 'id'
|
353
|
+
end
|
354
|
+
|
355
|
+
assert_equal Java::CascadingPipe::CoGroup, assembly.tail_pipe.class
|
356
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
357
|
+
assert_equal ['id'], left_grouping_fields.to_a
|
358
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
359
|
+
assert_equal ['id'], right_grouping_fields.to_a
|
360
|
+
|
361
|
+
assert_equal ['name', 'score1', 'score2', 'id', 'name_', 'id_', 'town'], assembly.scope.values_fields.to_a
|
362
|
+
assert_equal ['id', 'id_'], assembly.scope.grouping_fields.to_a
|
363
|
+
end
|
364
|
+
|
365
|
+
def test_create_join_many_fields
|
366
|
+
assembly = mock_two_input_assembly do
|
367
|
+
join 'test1', 'test2', :on => ['name', 'id']
|
368
|
+
end
|
369
|
+
|
370
|
+
assert_equal Java::CascadingPipe::CoGroup, assembly.tail_pipe.class
|
371
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
372
|
+
assert_equal ['name', 'id'], left_grouping_fields.to_a
|
373
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
374
|
+
assert_equal ['name', 'id'], right_grouping_fields.to_a
|
185
375
|
|
186
|
-
assert_equal '
|
187
|
-
assert_equal '
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
376
|
+
assert_equal ['name', 'score1', 'score2', 'id', 'name_', 'id_', 'town'], assembly.scope.values_fields.to_a
|
377
|
+
assert_equal ['name', 'id', 'name_', 'id_'], assembly.scope.grouping_fields.to_a
|
378
|
+
end
|
379
|
+
|
380
|
+
def test_create_join_with_declared_fields
|
381
|
+
assembly = mock_two_input_assembly do
|
382
|
+
join 'test1', 'test2', :on => 'name', :declared_fields => ['a', 'b', 'c', 'd', 'e', 'f', 'g']
|
383
|
+
end
|
384
|
+
|
385
|
+
assert_equal Java::CascadingPipe::CoGroup, assembly.tail_pipe.class
|
386
|
+
|
387
|
+
left_grouping_fields = assembly.tail_pipe.key_selectors['test1']
|
388
|
+
assert_equal ['name'], left_grouping_fields.to_a
|
389
|
+
|
390
|
+
right_grouping_fields = assembly.tail_pipe.key_selectors['test2']
|
391
|
+
assert_equal ['name'], right_grouping_fields.to_a
|
392
|
+
|
393
|
+
assert_equal ['a', 'b', 'c', 'd', 'e', 'f', 'g'], assembly.scope.values_fields.to_a
|
394
|
+
assert_equal ['name', 'name_'], assembly.scope.grouping_fields.to_a
|
395
|
+
end
|
396
|
+
|
397
|
+
def test_join_with_block
|
398
|
+
assembly = mock_two_input_assembly do
|
399
|
+
join 'test1', 'test2', :on => 'name' do
|
400
|
+
count
|
401
|
+
end
|
402
|
+
end
|
403
|
+
|
404
|
+
assert_equal Java::CascadingPipe::Every, assembly.tail_pipe.class
|
405
|
+
|
406
|
+
assert_equal ['name', 'name_', 'count'], assembly.scope.values_fields.to_a
|
407
|
+
assert_equal ['name', 'name_', 'count'], assembly.scope.grouping_fields.to_a
|
408
|
+
end
|
409
|
+
|
410
|
+
def test_join_undefined_inputs
|
411
|
+
assert_raise RuntimeError, "Could not find assembly 'doesnotexist' in join" do
|
412
|
+
flow 'test_join_undefined_inputs' do
|
413
|
+
source 'data1', tap('test/data/data1.txt')
|
414
|
+
|
415
|
+
assembly 'data1' do
|
416
|
+
pass
|
417
|
+
end
|
418
|
+
|
419
|
+
assembly 'join' do
|
420
|
+
join 'doesnotexist', 'data1', :on => 'name'
|
421
|
+
end
|
422
|
+
|
423
|
+
sink 'join', tap('output/test_join_undefined_inputs')
|
424
|
+
end
|
425
|
+
end
|
426
|
+
end
|
427
|
+
|
428
|
+
def test_join_without_on
|
429
|
+
assert_raise RuntimeError, 'join requires :on parameter' do
|
430
|
+
mock_two_input_assembly do
|
431
|
+
join 'test1', 'test2'
|
432
|
+
end
|
433
|
+
end
|
434
|
+
end
|
435
|
+
|
436
|
+
def test_join_invalid_on
|
437
|
+
assert_raise RuntimeError, "Unsupported data type for :on in join: 'Fixnum'" do
|
438
|
+
mock_two_input_assembly do
|
439
|
+
join 'test1', 'test2', :on => 1
|
440
|
+
end
|
441
|
+
end
|
442
|
+
end
|
443
|
+
|
444
|
+
def test_join_empty_on
|
445
|
+
assert_raise RuntimeError, 'join requres non-empty :on parameter' do
|
446
|
+
mock_two_input_assembly do
|
447
|
+
join 'test1', 'test2', :on => []
|
448
|
+
end
|
449
|
+
end
|
450
|
+
|
451
|
+
assert_raise RuntimeError, 'join requres non-empty :on parameter' do
|
452
|
+
mock_two_input_assembly do
|
453
|
+
join 'test1', 'test2', :on => {}
|
454
|
+
end
|
455
|
+
end
|
192
456
|
end
|
193
457
|
|
194
458
|
def test_branch_unique
|
@@ -198,7 +462,6 @@ class TC_Assembly < Test::Unit::TestCase
|
|
198
462
|
end
|
199
463
|
|
200
464
|
assert_equal 1, assembly.children.size
|
201
|
-
|
202
465
|
end
|
203
466
|
|
204
467
|
def test_branch_empty
|
@@ -231,20 +494,74 @@ class TC_Assembly < Test::Unit::TestCase
|
|
231
494
|
assert_equal 0, assembly.children['branch1'].children['branch2'].children.size
|
232
495
|
end
|
233
496
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
each('offset', :output => 'offset_copy',
|
240
|
-
:filter => Java::CascadingOperation::Identity.new(fields('offset_copy')))
|
241
|
-
every(:aggregator => count_function)
|
242
|
-
end
|
497
|
+
def test_sub_assembly
|
498
|
+
assembly = mock_assembly do
|
499
|
+
sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('offset'))
|
500
|
+
end
|
501
|
+
assert_equal ['line'], assembly.scope.values_fields.to_a
|
243
502
|
|
244
|
-
|
503
|
+
assembly = mock_assembly do
|
504
|
+
sub_assembly Java::CascadingPipeAssembly::Retain.new(tail_pipe, fields('offset'))
|
505
|
+
end
|
506
|
+
assert_equal ['offset'], assembly.scope.values_fields.to_a
|
245
507
|
|
246
|
-
|
508
|
+
assembly = mock_assembly do
|
509
|
+
sub_assembly Java::CascadingPipeAssembly::Rename.new(tail_pipe, fields(['offset', 'line']), fields(['byte', 'line']))
|
247
510
|
end
|
511
|
+
assert_equal ['byte', 'line'], assembly.scope.values_fields.to_a
|
512
|
+
|
513
|
+
assembly = mock_assembly do
|
514
|
+
sub_assembly Java::CascadingPipeAssembly::Unique.new(tail_pipe, fields('line'))
|
515
|
+
end
|
516
|
+
assert_equal ['offset', 'line'], assembly.scope.values_fields.to_a
|
517
|
+
assert_equal ['offset', 'line'], assembly.scope.grouping_fields.to_a
|
518
|
+
end
|
519
|
+
|
520
|
+
def test_count_by_sub_assembly
|
521
|
+
assembly = mock_branched_assembly do
|
522
|
+
pipes, _ = populate_incoming_scopes(['test1', 'test2'])
|
523
|
+
|
524
|
+
aggregate_by = Java::CascadingPipeAssembly::AggregateBy.new(
|
525
|
+
name,
|
526
|
+
pipes.to_java(Java::CascadingPipe::Pipe),
|
527
|
+
fields('line'),
|
528
|
+
[Java::CascadingPipeAssembly::CountBy.new(fields('count'))].to_java(Java::CascadingPipeAssembly::AggregateBy)
|
529
|
+
)
|
530
|
+
|
531
|
+
sub_assembly aggregate_by, pipes, @incoming_scopes
|
532
|
+
end
|
533
|
+
assert_equal ['line', 'count'], assembly.scope.values_fields.to_a
|
534
|
+
assert_equal ['line', 'count'], assembly.scope.grouping_fields.to_a
|
535
|
+
end
|
536
|
+
|
537
|
+
def test_average_by_sub_assembly
|
538
|
+
assembly = mock_assembly do
|
539
|
+
aggregate_by = Java::CascadingPipeAssembly::AggregateBy.new(
|
540
|
+
name,
|
541
|
+
[tail_pipe].to_java(Java::CascadingPipe::Pipe),
|
542
|
+
fields('line'),
|
543
|
+
[Java::CascadingPipeAssembly::AverageBy.new(fields('offset'), fields('average'))].to_java(Java::CascadingPipeAssembly::AggregateBy)
|
544
|
+
)
|
545
|
+
|
546
|
+
sub_assembly aggregate_by
|
547
|
+
end
|
548
|
+
assert_equal ['line', 'average'], assembly.scope.values_fields.to_a
|
549
|
+
assert_equal ['line', 'average'], assembly.scope.grouping_fields.to_a
|
550
|
+
end
|
551
|
+
|
552
|
+
def test_sum_by_sub_assembly
|
553
|
+
assembly = mock_assembly do
|
554
|
+
aggregate_by = Java::CascadingPipeAssembly::AggregateBy.new(
|
555
|
+
name,
|
556
|
+
[tail_pipe].to_java(Java::CascadingPipe::Pipe),
|
557
|
+
fields('line'),
|
558
|
+
[Java::CascadingPipeAssembly::SumBy.new(fields('offset'), fields('sum'), Java::double.java_class)].to_java(Java::CascadingPipeAssembly::AggregateBy)
|
559
|
+
)
|
560
|
+
|
561
|
+
sub_assembly aggregate_by
|
562
|
+
end
|
563
|
+
assert_equal ['line', 'sum'], assembly.scope.values_fields.to_a
|
564
|
+
assert_equal ['line', 'sum'], assembly.scope.grouping_fields.to_a
|
248
565
|
end
|
249
566
|
|
250
567
|
def test_empty_where
|
@@ -252,10 +569,10 @@ class TC_Assembly < Test::Unit::TestCase
|
|
252
569
|
split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id']
|
253
570
|
where
|
254
571
|
end
|
255
|
-
|
572
|
+
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
|
256
573
|
|
257
574
|
# Empty where compiles away
|
258
|
-
|
575
|
+
assert_equal Java::CascadingOperationRegex::RegexSplitter, assembly.tail_pipe.operation.class
|
259
576
|
end
|
260
577
|
|
261
578
|
def test_where
|
@@ -263,8 +580,8 @@ class TC_Assembly < Test::Unit::TestCase
|
|
263
580
|
split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id']
|
264
581
|
where 'score1:double < score2:double'
|
265
582
|
end
|
266
|
-
|
267
|
-
|
583
|
+
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
|
584
|
+
assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class
|
268
585
|
end
|
269
586
|
|
270
587
|
def test_where_with_expression
|
@@ -272,8 +589,8 @@ class TC_Assembly < Test::Unit::TestCase
|
|
272
589
|
split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id']
|
273
590
|
where :expression => 'score1:double < score2:double'
|
274
591
|
end
|
275
|
-
|
276
|
-
|
592
|
+
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
|
593
|
+
assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class
|
277
594
|
end
|
278
595
|
|
279
596
|
def test_where_with_import
|
@@ -282,8 +599,8 @@ class TC_Assembly < Test::Unit::TestCase
|
|
282
599
|
names = ['SMITH', 'JONES', 'BROWN']
|
283
600
|
where "import java.util.Arrays;\nArrays.asList(new String[] { \"#{names.join('", "')}\" }).contains(name:string)"
|
284
601
|
end
|
285
|
-
|
286
|
-
|
602
|
+
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
|
603
|
+
assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class
|
287
604
|
end
|
288
605
|
|
289
606
|
def test_smoke_test_debug_scope
|
@@ -291,92 +608,16 @@ class TC_Assembly < Test::Unit::TestCase
|
|
291
608
|
flow 'smoke' do
|
292
609
|
source 'input', tap('test/data/data1.txt')
|
293
610
|
assembly 'input' do
|
294
|
-
|
611
|
+
debug_scope
|
612
|
+
group_by 'line' do
|
613
|
+
count
|
614
|
+
sum 'offset', :type => :long
|
615
|
+
debug_scope
|
616
|
+
end
|
295
617
|
debug_scope
|
296
618
|
end
|
297
|
-
sink 'input', tap('output/
|
619
|
+
sink 'input', tap('output/test_smoke_test_debug_scope')
|
298
620
|
end
|
299
621
|
end
|
300
622
|
end
|
301
623
|
end
|
302
|
-
|
303
|
-
class TC_AssemblyScenarii < Test::Unit::TestCase
|
304
|
-
def test_smoke_test_sequence_file_scheme
|
305
|
-
cascade 'smoke' do
|
306
|
-
flow 'smoke' do
|
307
|
-
source 'input', tap('test/data/data1.txt')
|
308
|
-
assembly 'input' do
|
309
|
-
pass
|
310
|
-
end
|
311
|
-
compress_output :default, :block
|
312
|
-
sink 'input', tap('output/smoke_test_sequence_file_scheme', :scheme => sequence_file_scheme)
|
313
|
-
end
|
314
|
-
end.complete
|
315
|
-
end
|
316
|
-
|
317
|
-
def test_splitter
|
318
|
-
flow = flow "splitter" do
|
319
|
-
source "copy", tap("test/data/data1.txt")
|
320
|
-
sink "copy", tap('output/splitter', :sink_mode => :replace)
|
321
|
-
|
322
|
-
assembly "copy" do
|
323
|
-
split "line", :pattern => /[.,]*\s+/, :into=>["name", "score1", "score2", "id"], :output => ["name", "score1", "score2", "id"]
|
324
|
-
assert_size_equals 4
|
325
|
-
assert_not_null
|
326
|
-
debug :print_fields => true
|
327
|
-
end
|
328
|
-
end.complete
|
329
|
-
end
|
330
|
-
|
331
|
-
def test_join1
|
332
|
-
cascade 'splitter' do
|
333
|
-
flow 'splitter' do
|
334
|
-
source "data1", tap("test/data/data1.txt")
|
335
|
-
source "data2", tap("test/data/data2.txt")
|
336
|
-
sink "joined", tap('output/joined', :sink_mode => :replace)
|
337
|
-
|
338
|
-
assembly1 = assembly "data1" do
|
339
|
-
split "line", :pattern => /[.,]*\s+/, :into=>["name", "score1", "score2", "id"], :output => ["name", "score1", "score2", "id"]
|
340
|
-
assert_size_equals 4
|
341
|
-
assert_not_null
|
342
|
-
debug :print_fields => true
|
343
|
-
end
|
344
|
-
|
345
|
-
assembly2 = assembly "data2" do
|
346
|
-
split "line", :pattern => /[.,]*\s+/, :into=>["name", "id", "town"], :output => ["name", "id", "town"]
|
347
|
-
assert_size_equals 3
|
348
|
-
assert_not_null
|
349
|
-
debug :print_fields => true
|
350
|
-
end
|
351
|
-
|
352
|
-
assembly "joined" do
|
353
|
-
join assembly1.name, assembly2.name, :on => ["name", "id"], :declared_fields => ["name", "score1", "score2", "id", "name2", "id2", "town"]
|
354
|
-
assert_size_equals 7
|
355
|
-
assert_not_null
|
356
|
-
end
|
357
|
-
end
|
358
|
-
end.complete
|
359
|
-
end
|
360
|
-
|
361
|
-
def test_join2
|
362
|
-
flow = flow "splitter" do
|
363
|
-
source "data1", tap("test/data/data1.txt")
|
364
|
-
source "data2", tap("test/data/data2.txt")
|
365
|
-
sink "joined", tap('output/joined', :sink_mode => :replace)
|
366
|
-
|
367
|
-
assembly "data1" do
|
368
|
-
split "line", :pattern => /[.,]*\s+/, :into=>["name", "score1", "score2", "id"], :output => ["name", "score1", "score2", "id"]
|
369
|
-
debug :print_fields => true
|
370
|
-
end
|
371
|
-
|
372
|
-
assembly "data2" do
|
373
|
-
split "line", :pattern => /[.,]*\s+/, :into=>["name", "code", "town"], :output => ["name", "code", "town"]
|
374
|
-
debug :print_fields => true
|
375
|
-
end
|
376
|
-
|
377
|
-
assembly "joined" do
|
378
|
-
join :on => {"data1"=>["name", "id"], "data2"=>["name", "code"]}, :declared_fields => ["name", "score1", "score2", "id", "name2", "code", "town"]
|
379
|
-
end
|
380
|
-
end.complete
|
381
|
-
end
|
382
|
-
end
|