cascading.jruby 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/HACKING.md +15 -0
- data/History.txt +0 -0
- data/LICENSE.txt +165 -0
- data/README.md +7 -0
- data/Rakefile +45 -0
- data/bin/make_job +81 -0
- data/lib/cascading/assembly.rb +726 -0
- data/lib/cascading/base.rb +63 -0
- data/lib/cascading/cascade.rb +63 -0
- data/lib/cascading/cascading.rb +134 -0
- data/lib/cascading/cascading_exception.rb +30 -0
- data/lib/cascading/expr_stub.rb +33 -0
- data/lib/cascading/ext/array.rb +15 -0
- data/lib/cascading/flow.rb +168 -0
- data/lib/cascading/operations.rb +204 -0
- data/lib/cascading/scope.rb +160 -0
- data/lib/cascading.rb +63 -0
- data/samples/branch.rb +31 -0
- data/samples/cascading.rb +41 -0
- data/samples/copy.rb +18 -0
- data/samples/data/data2.txt +88799 -0
- data/samples/data/data_join1.txt +3 -0
- data/samples/data/data_join2.txt +3 -0
- data/samples/data/data_join3.txt +3 -0
- data/samples/join.rb +32 -0
- data/samples/logwordcount.rb +22 -0
- data/samples/project.rb +24 -0
- data/samples/rename.rb +21 -0
- data/samples/scorenames.rb +20 -0
- data/samples/splitter.rb +20 -0
- data/samples/union.rb +35 -0
- data/spec/cascading_spec.rb +100 -0
- data/spec/expr_spec.rb +10 -0
- data/spec/primary_key_spec.rb +119 -0
- data/spec/resource/join_input.txt +3 -0
- data/spec/resource/test_input.txt +4 -0
- data/spec/scope_spec.rb +174 -0
- data/spec/spec.opts +6 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/spec_util.rb +188 -0
- data/src/cascading/jruby/Main.java +38 -0
- data/src/cascading/jruby/runner.rb +6 -0
- data/tags +238 -0
- data/tasks/ann.rake +80 -0
- data/tasks/ant.rake +11 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +206 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +50 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/samples.rake +13 -0
- data/tasks/setup.rb +300 -0
- data/tasks/spec.rake +59 -0
- data/tasks/svn.rake +47 -0
- data/tasks/test.rake +42 -0
- data/test/data/data1.txt +14 -0
- data/test/data/data2.txt +14 -0
- data/test/test_assembly.rb +321 -0
- data/test/test_cascading.rb +49 -0
- data/test/test_flow.rb +15 -0
- metadata +137 -0
@@ -0,0 +1,726 @@
|
|
1
|
+
# Copyright 2009, Grégoire Marabout. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# This is free software. Please see the LICENSE and COPYING files for details.
|
4
|
+
|
5
|
+
require 'cascading/base'
|
6
|
+
require 'cascading/operations'
|
7
|
+
require 'cascading/ext/array'
|
8
|
+
|
9
|
+
module Cascading
|
10
|
+
class Assembly < Cascading::Node
|
11
|
+
include Operations
|
12
|
+
|
13
|
+
attr_accessor :tail_pipe, :head_pipe, :outgoing_scopes
|
14
|
+
|
15
|
+
# Creates an assembly called +name+ under +parent+.
#
# +outgoing_scopes+ is the hash (assembly name => scope) this assembly
# records its own scope in; the same hash is handed on to branches.
#
# When +parent+ is another Assembly the head pipe continues from the
# parent's tail pipe and the scope starts as a copy of the parent's scope.
# Otherwise the parent is treated as a Flow: the head pipe is standalone and
# an empty scope is registered unless one already exists under +name+.
def initialize(name, parent, outgoing_scopes = {})
  super(name, parent)

  @outgoing_scopes = outgoing_scopes
  @every_applied = false

  nested = parent.kind_of?(Assembly)
  @head_pipe = if nested
    Java::CascadingPipe::Pipe.new(name, parent.tail_pipe)
  else
    Java::CascadingPipe::Pipe.new(name)
  end

  if nested
    # Work on a copy so that renaming it does not touch the parent's scope.
    @outgoing_scopes[name] = parent.scope.copy
    scope.scope.name = name
  else
    @outgoing_scopes[name] ||= Scope.empty_scope(name)
  end

  # A new assembly has no operations yet, so head and tail coincide.
  @tail_pipe = @head_pipe
end
|
31
|
+
|
32
|
+
# Walks up the parent chain and returns the first ancestor that is a Flow.
def parent_flow
  parent.kind_of?(Flow) ? parent : parent.parent_flow
end
|
36
|
+
|
37
|
+
# Returns the scope currently registered for this assembly, looked up by
# the assembly's name in the outgoing-scopes hash.
def scope
  scopes_by_name = @outgoing_scopes
  scopes_by_name[name]
end
|
40
|
+
|
41
|
+
# Prints this assembly's name and current scope to standard output.
def debug_scope
  report = "Current scope for '#{name}':\n #{scope}\n----------\n"
  puts report
end
|
44
|
+
|
45
|
+
# Declares the primary key of the current scope.
#
# The unnamed arguments are the key field names; a trailing options hash is
# accepted and discarded. With no arguments, or with nil as the first
# argument, the primary key is cleared. The grouping primary key is then
# set from the scope's resulting primary key.
def primary(*args)
  args.extract_options! # drop a trailing options hash; its contents are unused

  cleared = args.empty? || args[0] == nil
  scope.primary_key_fields = cleared ? nil : fields(args)
  scope.grouping_primary_key_fields = scope.primary_key_fields
end
|
54
|
+
|
55
|
+
# Appends a pipe of class +type+ built from +parameters+ and marks the
# assembly as no longer being inside an every sequence.
def make_each(type, *parameters)
  # The flag is reset only after make_pipe, which still reads the old value.
  make_pipe(type, parameters, [], [scope])
  @every_applied = false
end
|
59
|
+
|
60
|
+
# Appends a pipe of class +type+ built from +parameters+, passing along the
# current grouping key fields, and marks an every as applied.
def make_every(type, *parameters)
  # The flag is raised only after make_pipe, which still reads the old value.
  make_pipe(type, parameters, scope.grouping_key_fields, [scope])
  @every_applied = true
end
|
64
|
+
|
65
|
+
# Reports the flag maintained by make_each (sets it to false) and
# make_every (sets it to true).
def every_applied?
  applied = @every_applied
  applied
end
|
68
|
+
|
69
|
+
# Evaluates the block given to a grouping operation (join / group_by) in the
# context of this assembly, then post-processes the result. Does nothing
# when no block is given.
#
# +group_fields+ is accepted but not used by this method.
# +incoming_scopes+ are the scopes of the pipes that fed the grouping.
#
# After the block has run:
# * for every incoming scope whose primary key is contained in its grouping
#   primary key, the non-primary-key value fields are collected, fields the
#   current scope already lists as grouping fields are removed, and `first`
#   is applied to whatever remains;
# * if an every was applied, bind_names is called with the scope's
#   grouping fields.
def do_every_block_and_rename_fields(group_fields, incoming_scopes, &block)
  return if block.nil?

  # TODO: this should really be instance evaled on an object
  # that only allows aggregation and buffer operations.
  instance_eval(&block)

  carried = []
  incoming_scopes.each do |incoming|
    next unless incoming.primary_key_fields

    key = incoming.primary_key_fields.to_a
    grouping_key = incoming.grouping_primary_key_fields.to_a
    # Intersection equal to the key itself means the key is a subset.
    next unless (key & grouping_key) == key

    carried << difference_fields(incoming.values_fields, incoming.primary_key_fields).to_a
  end
  # assert first_fields == first_fields.uniq

  # Do not "first" any field the scope already lists as a grouping field.
  first_fields = carried.flatten - scope.grouping_fields.to_a
  unless first_fields.empty?
    first(*first_fields)
    puts "Firsting: #{first_fields.inspect} in assembly: #{@name}"
  end

  bind_names scope.grouping_fields.to_a if every_applied?
end
|
98
|
+
|
99
|
+
# Instantiates +type+ with +parameters+, makes it the new tail pipe and
# replaces this assembly's scope with the one Scope.outgoing_scope derives
# from the new pipe, the incoming scopes, the grouping key fields and the
# current every_applied? flag.
def make_pipe(type, parameters, grouping_key_fields = [], incoming_scopes = [scope])
  new_tail = type.new(*parameters)
  @tail_pipe = new_tail

  derived = Scope.outgoing_scope(new_tail, incoming_scopes, grouping_key_fields, every_applied?)
  @outgoing_scopes[name] = derived
end
|
103
|
+
|
104
|
+
def to_s
|
105
|
+
"#{@name} : head pipe : #{@head_pipe} - tail pipe: #{@tail_pipe}"
|
106
|
+
end
|
107
|
+
|
108
|
+
# Builds a join (CoGroup) pipe from a list of assembly names.
#
# Unnamed arguments are the names of the assemblies to join; each is looked
# up in the parent flow and a RuntimeError is raised if it cannot be found.
#
# Options:
# * <tt>:on</tt> the join key. A String or Array is used as the key for every
#   pipe. A Hash maps assembly name => key fields; in that case the joined
#   assemblies are taken from the hash keys (sorted) instead of the unnamed
#   arguments.
# * <tt>:declared_fields</tt> the fields declared by the join; defaults to
#   dedup_fields applied to the value fields of the incoming scopes.
# * <tt>:joiner</tt> :inner (also used when nil), :left, :right, :outer (or the
#   same as strings), or an Array of true/1/:inner and false/0/:outer entries
#   that builds a MixedJoin. Any other value is passed through unchanged.
#
# A block, if given, is evaluated afterwards through
# do_every_block_and_rename_fields.
def join(*args, &block)
  options = args.extract_options!

  pipes = []
  incoming_scopes = []
  args.each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    raise "Could not find assembly '#{assembly_name}' in join" unless assembly

    pipes.push(assembly.tail_pipe)
    incoming_scopes.push(@outgoing_scopes[assembly.name])
  end

  group_fields_args = options.delete(:on)
  group_fields_args = [group_fields_args] if group_fields_args.kind_of?(String)
  group_fields_names = group_fields_args.to_a
  group_fields = []

  case group_fields_args
  when Array
    # Same key on every side: one Fields instance per joined pipe.
    group_fields = pipes.map { fields(group_fields_args) }
  when Hash
    # The hash decides which assemblies are joined; start over.
    pipes = []
    incoming_scopes = []
    keys = group_fields_args.keys.sort
    keys.each do |assembly_name|
      key_spec = group_fields_args[assembly_name]
      assembly = parent_flow.find_child(assembly_name)
      raise "Could not find assembly '#{assembly_name}' in join" unless assembly

      pipes.push(assembly.tail_pipe)
      incoming_scopes.push(@outgoing_scopes[assembly.name])
      group_fields.push(fields(key_spec))
      # The first (sorted) assembly provides the grouping field names.
      group_fields_names = group_fields_args[keys.first].to_a
    end
  end

  group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
  incoming_fields = incoming_scopes.map { |incoming| incoming.values_fields }
  declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
  joiner = options.delete(:joiner)

  if declared_fields
    joiner = case joiner
    when :inner, "inner", nil
      Java::CascadingPipeCogroup::InnerJoin.new
    when :left, "left"
      Java::CascadingPipeCogroup::LeftJoin.new
    when :right, "right"
      Java::CascadingPipeCogroup::RightJoin.new
    when :outer, "outer"
      Java::CascadingPipeCogroup::OuterJoin.new
    when Array
      flags = joiner.map do |t|
        case t
        when true, 1, :inner then true
        when false, 0, :outer then false
        else fail "invalid mixed joiner entry: #{t}"
        end
      end
      Java::CascadingPipeCogroup::MixedJoin.new(flags.to_java(:boolean))
    else
      joiner
    end
  end

  # nil entries (e.g. no joiner) are dropped from the constructor arguments.
  parameters = [pipes.to_java(Java::CascadingPipe::Pipe), group_fields, declared_fields, joiner].compact
  grouping_key_fields = group_fields[0] # Left key group wins
  make_pipe(Java::CascadingPipe::CoGroup, parameters, grouping_key_fields, incoming_scopes)
  do_every_block_and_rename_fields(group_fields_names, incoming_scopes, &block)
end
alias co_group join
|
179
|
+
|
180
|
+
# Same as join, with the <tt>:joiner</tt> option forced to :inner.
def inner_join(*args, &block)
  join_options = args.extract_options!
  join_options.store(:joiner, :inner)
  join(*args.push(join_options), &block)
end
|
186
|
+
|
187
|
+
# Same as join, with the <tt>:joiner</tt> option forced to :left.
def left_join(*args, &block)
  join_options = args.extract_options!
  join_options.store(:joiner, :left)
  join(*args.push(join_options), &block)
end
|
193
|
+
|
194
|
+
# Same as join, with the <tt>:joiner</tt> option forced to :right.
def right_join(*args, &block)
  join_options = args.extract_options!
  join_options.store(:joiner, :right)
  join(*args.push(join_options), &block)
end
|
200
|
+
|
201
|
+
# Same as join, with the <tt>:joiner</tt> option forced to :outer.
def outer_join(*args, &block)
  join_options = args.extract_options!
  join_options.store(:joiner, :outer)
  join(*args.push(join_options), &block)
end
|
207
|
+
|
208
|
+
# Builds a new branch: a child assembly named +name+ that continues from
# this assembly and shares its outgoing-scopes hash.
#
# The block is required (RuntimeError otherwise) and is evaluated in the
# context of the new assembly, which is returned.
def branch(name, &block)
  raise "Could not build branch '#{name}'; block required" if block.nil?

  child = Assembly.new(name, self, @outgoing_scopes)
  add_child(child)
  child.instance_eval(&block)
  child
end
|
216
|
+
|
217
|
+
# Builds a new _group_by_ pipe. The unnamed arguments are the fields to
# group on.
#
# Options:
# * <tt>:sort_by</tt> fields to sort on; defaults to the grouping fields.
# * <tt>:reverse</tt> passed on to the GroupBy constructor when not nil.
#
# A block, if given, is evaluated afterwards through
# do_every_block_and_rename_fields.
def group_by(*args, &block)
  options = args.extract_options!

  grouping = fields(args)
  sorting = fields(options[:sort_by] || args)

  # Only nil entries are dropped; an explicit false for :reverse is kept.
  ctor_args = [@tail_pipe, grouping, sorting, options[:reverse]].compact
  make_pipe(Java::CascadingPipe::GroupBy, ctor_args, grouping)
  do_every_block_and_rename_fields(args, [scope], &block)
end
|
231
|
+
|
232
|
+
# Unifies several pipes sharing the same field structure.
# This actually creates a GroupBy pipe.
#
# The first argument is the list of assembly names to unify. Each name is
# looked up in the parent flow.
#
# Raises a RuntimeError when the list is missing or empty, or when one of
# the names does not match an assembly (consistent with join).
def union_pipes(*args)
  assembly_names = args[0]
  if assembly_names.nil? || assembly_names.empty?
    raise "Could not build union; at least one assembly name required"
  end

  pipes, incoming_scopes = [], []
  assembly_names.each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    raise "Could not find assembly '#{assembly_name}' in union" unless assembly

    pipes << assembly.tail_pipe
    incoming_scopes << @outgoing_scopes[assembly.name]
  end

  # Groups only on the 1st field (see line 186 of GroupBy.java)
  grouping_key_fields = fields(incoming_scopes.first.values_fields.get(0))
  make_pipe(Java::CascadingPipe::GroupBy, [pipes.to_java(Java::CascadingPipe::Pipe)], grouping_key_fields, incoming_scopes)
  # TODO: Shouldn't union_pipes accept an every block?
  #do_every_block_and_rename_fields(args, incoming_scopes, &block)
end
|
249
|
+
|
250
|
+
# Builds a basic _every_ pipe and adds it to the current assembly.
#
# Unnamed arguments are the input fields.
#
# Options:
# * <tt>:aggregator</tt> or <tt>:buffer</tt> the operation to apply
#   (<tt>:aggregator</tt> wins when both are given).
# * <tt>:output</tt> the output fields.
#
# nil entries are left out of the Every constructor arguments.
def every(*args)
  options = args.extract_options!

  input = fields(args)
  output = fields(options[:output])
  operation = options[:aggregator] || options[:buffer]

  ctor_args = [@tail_pipe, input, operation, output].compact
  make_every(Java::CascadingPipe::Every, *ctor_args)
end
|
261
|
+
|
262
|
+
# Builds a basic _each_ pipe and adds it to the current assembly.
#
# Unnamed arguments are the input fields.
#
# Options:
# * <tt>:filter</tt> or <tt>:function</tt> the operation to apply
#   (<tt>:filter</tt> wins when both are given).
# * <tt>:output</tt> the output fields.
#
# nil entries are left out of the Each constructor arguments.
# --
# Example:
#     each "line", :filter=>regex_splitter(["name", "val1", "val2", "id"],
#                                          :pattern => /[.,]*\s+/),
#                  :output=>["id", "name", "val1", "val2"]
def each(*args)
  options = args.extract_options!

  input = fields(args)
  output = fields(options[:output])
  operation = options[:filter] || options[:function]

  ctor_args = [@tail_pipe, input, operation, output].compact
  make_each(Java::CascadingPipe::Each, *ctor_args)
end
|
278
|
+
|
279
|
+
# Restricts the current assembly to the specified fields by applying an
# Identity operation to just those fields.
# --
# Example:
#     project "field1", "field2"
def project(*args)
  kept = fields(args)
  make_each(Java::CascadingPipe::Each, @tail_pipe, kept, Java::CascadingOperation::Identity.new)
end
|
288
|
+
|
289
|
+
# Removes the specified fields from the current assembly by projecting on
# the scope's value fields minus the given ones.
# --
# Example:
#     discard "field1", "field2"
def discard(*args)
  unwanted = fields(args)
  remaining = difference_fields(scope.values_fields, unwanted)
  project(*remaining.to_a)
end
|
298
|
+
|
299
|
+
# Assigns new names to the current fields in positional order, using an
# Identity operation that declares the new names over all fields.
# --
# Example:
#     bind_names "field1", "field2"
def bind_names(*new_names)
  renamer = Java::CascadingOperation::Identity.new(fields(new_names))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, renamer)
end
|
308
|
+
|
309
|
+
# Renames fields according to the mapping provided; fields not mentioned
# keep their name. The primary key is re-declared under the new names.
#
# Raises a RuntimeError listing any mapping keys that are not current
# field names.
# --
# Example:
#     rename "old_name" => "new_name"
def rename(name_map)
  translate = lambda { |field| name_map[field] || field }

  current = scope.values_fields.to_a
  renamed = current.map(&translate)
  unknown = name_map.keys.sort - current
  raise "invalid names: #{unknown.inspect}" unless unknown.empty?

  # Capture the key before the pipe is added, then translate it too.
  renamed_key = scope.primary_key_fields.to_a.map(&translate)

  renamer = Java::CascadingOperation::Identity.new(fields(renamed))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, renamer)
  primary(*renamed_key)
end
|
327
|
+
|
328
|
+
# Applies an Identity operation with explicit types to the given fields.
#
# +type_map+ maps field name => type key; each type key is looked up in
# JAVA_TYPE_MAP. Fields are processed in sorted name order.
def cast(type_map)
  names = type_map.keys.sort
  java_types = names.map { |field| JAVA_TYPE_MAP[type_map[field]] }

  cast_fields = fields(names)
  caster = Java::CascadingOperation::Identity.new(cast_fields, java_types.to_java(java.lang.Class))
  make_each(Java::CascadingPipe::Each, @tail_pipe, cast_fields, caster)
end
|
336
|
+
|
337
|
+
# Copies fields under new names while keeping all fields in the output.
#
# First unnamed argument: source fields (defaults to all fields).
# Second unnamed argument or <tt>:into</tt> option: target field names
# (defaults to all fields); the unnamed argument wins when both are given.
def copy(*args)
  options = args.extract_options!

  source = args[0] || all_fields
  target = args[1] || options[:into] || all_fields

  copier = Java::CascadingOperation::Identity.new(fields(target))
  make_each(Java::CascadingPipe::Each, @tail_pipe, fields(source), copier, Java::CascadingTuple::Fields::ALL)
end
|
344
|
+
|
345
|
+
# A pipe that does nothing: an Identity operation over all fields.
# Any arguments are ignored.
def pass(*args)
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, Java::CascadingOperation::Identity.new)
end
|
350
|
+
|
351
|
+
# Adds an _each_ pipe carrying the given assertion.
#
# First unnamed argument: the assertion operation.
# Option <tt>:level</tt>: the assertion level (defaults to AssertionLevel::STRICT).
def assert(*args)
  options = args.extract_options!
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_each(Java::CascadingPipe::Each, @tail_pipe, level, args[0])
end
|
357
|
+
|
358
|
+
# Adds an _every_ pipe carrying the given group assertion.
#
# First unnamed argument: the assertion operation.
# Option <tt>:level</tt>: the assertion level (defaults to AssertionLevel::STRICT).
def assert_group(*args)
  options = args.extract_options!
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_every(Java::CascadingPipe::Every, @tail_pipe, level, args[0])
end
|
364
|
+
|
365
|
+
# Builds a debugging pipe.
#
# Without arguments, it generates a simple debug pipe that is applied to all
# fields.
#
# The named options are:
# * <tt>:print_fields</tt> a boolean handed to the Debug constructor.
#   Defaults to true; an explicit false is now honoured (it used to be
#   silently replaced by true).
# * <tt>:tuple_interval</tt> value for Debug#print_tuple_every (defaults to 1).
# * <tt>:fields_interval</tt> value for Debug#print_fields_every (defaults to 10).
def debug(*args)
  options = args.extract_options!

  # `options[:print_fields] || true` can never yield false, so only fall
  # back to the default when the option is absent (nil).
  print_fields = options[:print_fields]
  print_fields = true if print_fields.nil?

  operation = Java::CascadingOperation::Debug.new(print_fields)
  operation.print_tuple_every = options[:tuple_interval] || 1
  operation.print_fields_every = options[:fields_interval] || 10
  each(all_fields, :filter => operation)
end
|
382
|
+
|
383
|
+
# Builds a pipe asserting that the tuple has the size given as parameter.
#
# The single unnamed argument is the expected size; a trailing options hash
# is forwarded to assert.
def assert_size_equals(*args)
  options = args.extract_options!
  expected_size = args[0]
  assert(Java::CascadingOperationAssertion::AssertSizeEquals.new(expected_size), options)
end
|
391
|
+
|
392
|
+
# Builds a pipe asserting that none of the fields in the tuple are null.
# A trailing options hash is forwarded to assert.
def assert_not_null(*args)
  options = args.extract_options!
  assert(Java::CascadingOperationAssertion::AssertNotNull.new, options)
end
|
398
|
+
|
399
|
+
# Builds a group assertion with AssertGroupSizeEquals.
#
# The single unnamed argument is handed to the AssertGroupSizeEquals
# constructor; a trailing options hash is forwarded to assert_group.
def assert_group_size_equals(*args)
  options = args.extract_options!
  expected = args[0]
  assert_group(Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(expected), options)
end
|
404
|
+
|
405
|
+
# Builds a series of every pipes for aggregation, one per field.
#
# +args+ is either a list of fields to aggregate followed by an options
# hash, or a hash mapping input field name to output field name (similar to
# insert) followed by an options hash. A hash in first position is always
# taken as the field mapping, and is processed in sorted order.
#
# Options are handed unchanged to the aggregator factory, e.g.:
# * <tt>:sql</tt> a boolean indicating whether the operation should act like the SQL equivalent
#
# <tt>function</tt> is a symbol naming the method to call to construct the
# Cascading Aggregator. A warning is printed when there is nothing to
# aggregate.
def composite_aggregator(args, function)
  mapping_given = !args.empty? && args.first.kind_of?(Hash)

  field_map = mapping_given ? args.shift.sort : nil
  options = args.extract_options!
  # Plain field list: every field aggregates into itself.
  field_map ||= args.zip(args)

  field_map.each do |in_field, out_field|
    aggregator = send(function, out_field, options)
    every(in_field, :aggregator => aggregator, :output => all_fields)
  end

  if field_map.empty?
    puts "WARNING: composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields; will be ignored"
  end
end
|
429
|
+
|
430
|
+
# Aggregation shortcuts. Each one forwards its arguments to
# composite_aggregator together with the name of the factory method that
# builds the matching aggregator.
def min(*args)
  composite_aggregator(args, :min_function)
end

def max(*args)
  composite_aggregator(args, :max_function)
end

def first(*args)
  composite_aggregator(args, :first_function)
end

def last(*args)
  composite_aggregator(args, :last_function)
end

def average(*args)
  composite_aggregator(args, :average_function)
end
|
435
|
+
|
436
|
+
# Counts elements of a group. The first unnamed parameter is the name of the
# output count field (defaults to 'count' if it is not provided). A trailing
# options hash is forwarded to count_function.
def count(*args)
  options = args.extract_options!
  count_field = args[0] || 'count'
  counter = count_function(count_field, options)
  every(last_grouping_fields, :aggregator => counter, :output => all_fields)
end
|
443
|
+
|
444
|
+
# Sums fields, one every pipe per field.
#
# Fields may be given as unnamed arguments, in which case each is summed
# into a field of the same name in the given order, or through the
# <tt>:mapping</tt> option (a hash of input field => output field), which
# is processed in sorted order.
#
# The <tt>:type</tt> option is mandatory: it is looked up in JAVA_TYPE_MAP
# and a RuntimeError is raised when that lookup yields nothing.
def sum(*args)
  options = args.extract_options!

  java_type = JAVA_TYPE_MAP[options[:type]]
  raise "No type specified for sum" if !java_type

  explicit = options[:mapping]
  pairs = explicit ? explicit.sort : args.zip(args)
  pairs.each do |in_field, out_field|
    summer = sum_function(out_field, :type => java_type)
    every(in_field, :aggregator => summer, :output => all_fields)
  end
end
|
458
|
+
|
459
|
+
# Builds a _parse_ pipe. This pipe parses the input fields (the unnamed
# arguments) using a specified regex pattern.
#
# If provided, the unnamed arguments must be the fields to be parsed. If
# none are provided, all incoming fields are used. (Previously an empty
# argument list was passed on as-is, because an Array is always truthy.)
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
#
# The whole options hash is also handed to regex_parser.
def parse(*args)
  options = args.extract_options!

  in_fields = args.empty? ? all_fields : args
  pattern = options[:pattern]
  output = options[:output] || all_fields

  each(in_fields, :filter => regex_parser(pattern, options), :output => output)
end
|
475
|
+
|
476
|
+
# Builds a pipe that splits a field into other fields, using a regular
# expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is an array of strings naming the fields that
# receive the result of the split.
#
# The named options are:
# * <tt>:into</tt> the receiving fields; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex used for splitting (defaults to /[.,]*\s+/).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split(*args)
  options = args.extract_options!

  targets = options[:into] || args[1]
  separator = options[:pattern] || /[.,]*\s+/
  output = options[:output] || all_fields

  splitter = regex_splitter(targets, :pattern => separator)
  each(args[0], :function => splitter, :output => output)
end
|
491
|
+
|
492
|
+
# Builds a pipe that splits a field into new rows, using a regular
# expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is the field receiving the result of the split.
#
# The named options are:
# * <tt>:into</tt> the receiving field; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex used for splitting (defaults to /[.,]*\s+/).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split_rows(*args)
  options = args.extract_options!

  target = options[:into] || args[1]
  separator = options[:pattern] || /[.,]*\s+/
  output = options[:output] || all_fields

  generator = regex_split_generator(target, :pattern => separator)
  each(args[0], :function => generator, :output => output)
end
|
507
|
+
|
508
|
+
# Builds a pipe that emits a new row for each regex group matched in a
# field, using a regular expression.
#
# The first unnamed argument is the field to be matched against.
# The second unnamed argument is the field receiving the result of the match.
#
# The named options are:
# * <tt>:into</tt> the receiving field; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex used for matching (defaults to /[\w]+/).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def match_rows(*args)
  options = args.extract_options!

  target = options[:into] || args[1]
  matcher = options[:pattern] || /[\w]+/
  output = options[:output] || all_fields

  generator = regex_generator(target, :pattern => matcher)
  each(args[0], :function => generator, :output => output)
end
|
523
|
+
|
524
|
+
# Builds a pipe that parses the specified field as a date using the provided
# format string.
# The unnamed argument specifies the field to parse.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it
#   is the input argument with "_parsed" appended.
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def parse_date(*args)
  options = args.extract_options!
  source = args[0]

  target = options[:into] || "#{source}_parsed"
  output = options[:output] || all_fields
  date_format = options[:pattern] || "yyyy/MM/dd"

  each(source, :function => date_parser(target, date_format), :output => output)
end
|
540
|
+
|
541
|
+
# Builds a pipe that formats a date using a specified format pattern.
#
# The unnamed argument specifies the field to format.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it
#   is the input argument with "_formatted" appended.
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:timezone</tt> a string. Specifies the timezone; passed to
#   date_formatter as given (nil when absent).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def format_date(*args)
  options = args.extract_options!
  source = args[0]

  target = options[:into] || "#{source}_formatted"
  date_format = options[:pattern] || "yyyy/MM/dd"
  output = options[:output] || all_fields

  formatter = date_formatter(target, date_format, options[:timezone])
  each(source, :function => formatter, :output => output)
end
|
559
|
+
|
560
|
+
# Builds a pipe that performs a query/replace based on a regular expression.
#
# The first unnamed argument specifies the input field.
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the pattern to look for in the input field. This non-optional argument
#   can also be specified as a second _unnamed_ argument.
# * <tt>:replacement</tt> a string. Specifies the replacement; can also be given as a third unnamed argument.
# * <tt>:into</tt> a string. The receiving field; defaults to the input field with "_replaced" appended.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def replace(*args)
  options = args.extract_options!
  source, positional_pattern, positional_replacement = args[0], args[1], args[2]

  pattern = options[:pattern] || positional_pattern
  replacement = options[:replacement] || positional_replacement
  target = options[:into] || "#{source}_replaced"
  output = options[:output] || all_fields

  each(source, :function => regex_replace(target, pattern, replacement), :output => output)
end
|
579
|
+
|
580
|
+
# Builds pipes that insert values into the current tuple, one pipe per
# field, in sorted field-name order.
#
# The method takes a hash as parameter. Its keys are the names of the fields
# to insert and its values are what those fields must contain. For example:
#
#       insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
#
# inserts two new fields: a field _who_ containing the string "Grégoire", and
# a field _when_ containing the formatted current date.
#
# A value that is an ExprStub is inserted through expression_function
# (using the stub's expression and types); any other value is inserted as a
# constant through insert_function.
# The method outputs all fields.
def insert(args)
  args.keys.sort.each do |field_name|
    value = args[field_name]

    function = case value
    when ExprStub
      expression_function(field_name, :expression => value.expression,
                                      :parameters => value.types)
    else
      insert_function([field_name], :values => [value])
    end

    each(all_fields, :function => function, :output => all_fields)
  end
end
|
604
|
+
|
605
|
+
# Builds a pipe that filters the tuples based on an expression or a pattern
# (but not both!). When an expression is present it takes precedence; when
# neither is given no pipe is added and nil is returned.
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:from</tt> the fields the filter is applied to (defaults to all fields).
# * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
#   option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
#   The remaining options are handed to regex_filter.
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based. This is incompatible with the _pattern_ option.
def filter(*args)
  options = args.extract_options!

  from = options.delete(:from) || all_fields
  expression = options.delete(:expression) || args.shift
  regex = options.delete(:pattern)

  if expression
    # ExprStub supplies the expression text and its parameter types.
    stub = ExprStub.new(expression)
    janino_filter = expression_filter(:parameters => stub.types,
                                      :expression => stub.expression)
    return each(from, :filter => janino_filter)
  end

  return each(from, :filter => regex_filter(regex, options)) if regex

  nil
end
|
632
|
+
|
633
|
+
# Applies Cascading's FilterNull filter to the given fields. A trailing
# options hash is accepted and ignored. Also available as reject_null.
def filter_null(*args)
  args.extract_options! # strip the options hash so only field names remain
  null_filter = Java::CascadingOperationFilter::FilterNull.new
  each(args, :filter => null_filter)
end
alias reject_null filter_null
|
638
|
+
|
639
|
+
# Applies Cascading's FilterNotNull filter to the given fields. A trailing
# options hash is accepted and ignored. Also available as where_null.
def filter_not_null(*args)
  args.extract_options! # strip the options hash so only field names remain
  not_null_filter = Java::CascadingOperationFilter::FilterNotNull.new
  each(args, :filter => not_null_filter)
end
alias where_null filter_not_null
|
644
|
+
|
645
|
+
# Builds a pipe that rejects the tuples based on an expression. The
# arguments are handed unchanged to filter.
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based.
#
# Raises a RuntimeError when a <tt>:pattern</tt> option is given.
def reject(*args)
  # Uses extract_options (no bang), unlike most methods here.
  opts = args.extract_options
  pattern_given = opts && opts[:pattern]
  raise "Regex not allowed" if pattern_given

  filter(*args)
end
|
659
|
+
|
660
|
+
# Builds a pipe that includes just the tuples matching an expression. The
# expression is wrapped in "!( ... )" and then handed to filter.
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based.
#
# Raises a RuntimeError when a <tt>:pattern</tt> option is given.
def where(*args)
  # Uses extract_options (no bang), unlike most methods here.
  opts = args.extract_options
  raise "Regex not allowed" if opts && opts[:pattern]

  named = opts[:expression]
  if named
    opts[:expression] = "!(#{named})"
  elsif args[0]
    args[0] = "!(#{args[0]})"
  end

  filter(*args)
end
|
680
|
+
|
681
|
+
# Builds a pipe that evaluates the specified Janino expression and inserts
# the result in a new field of the tuple.
#
# The expression and its parameter types may also be given, in that order,
# as unnamed arguments; an unnamed argument is only consumed for a value
# the options do not already provide.
#
# The named options are:
# * <tt>:from</tt> a string or array of strings. Specifies the input fields (all fields by default).
# * <tt>:expression</tt> a string. The janino expression.
# * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
# * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
# * <tt>:output</tt> the outgoing fields (all fields by default).
def eval_expression(*args)
  options = args.extract_options!

  target = options.delete(:into)
  source = options.delete(:from) || all_fields
  output = options.delete(:output) || all_fields

  options[:expression] = args.shift unless options[:expression]
  options[:parameters] = args.shift unless options[:parameters]

  each(source, :function => expression_function(target, options), :output => output)
end
|
699
|
+
|
700
|
+
# Intended to build a pipe that returns distinct tuples based on the
# provided fields (all fields, by default).
#
# Currently disabled: the method always raises a RuntimeError. The
# implementation that used to follow the raise was unreachable and is kept
# here for reference only:
#
#     fields = args[0] || all_fields
#     group_by *fields
#     pass
def distinct(*args)
  raise "Distinct is badly broken"
end
|
710
|
+
|
711
|
+
# Builds a pipe that will unify (merge) pipes. The unnamed arguments are the
# names of the assemblies to unify; a trailing options hash is accepted and
# ignored. Tuples unified must share the same fields.
def union(*args)
  args.extract_options! # only the assembly names are forwarded
  union_pipes(args)
end
|
718
|
+
|
719
|
+
# Applies the function built by field_joiner to the given fields.
#
# Unnamed arguments are the input fields. The options hash is handed to
# field_joiner as-is; its <tt>:output</tt> entry selects the outgoing
# fields (all fields by default).
def join_fields(*args)
  options = args.extract_options!
  outgoing = options[:output] || all_fields
  each(args, :function => field_joiner(options), :output => outgoing)
end
|
725
|
+
end
|
726
|
+
end
|