cascading.jruby 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HACKING.md +15 -0
- data/History.txt +0 -0
- data/LICENSE.txt +165 -0
- data/README.md +7 -0
- data/Rakefile +45 -0
- data/bin/make_job +81 -0
- data/lib/cascading/assembly.rb +726 -0
- data/lib/cascading/base.rb +63 -0
- data/lib/cascading/cascade.rb +63 -0
- data/lib/cascading/cascading.rb +134 -0
- data/lib/cascading/cascading_exception.rb +30 -0
- data/lib/cascading/expr_stub.rb +33 -0
- data/lib/cascading/ext/array.rb +15 -0
- data/lib/cascading/flow.rb +168 -0
- data/lib/cascading/operations.rb +204 -0
- data/lib/cascading/scope.rb +160 -0
- data/lib/cascading.rb +63 -0
- data/samples/branch.rb +31 -0
- data/samples/cascading.rb +41 -0
- data/samples/copy.rb +18 -0
- data/samples/data/data2.txt +88799 -0
- data/samples/data/data_join1.txt +3 -0
- data/samples/data/data_join2.txt +3 -0
- data/samples/data/data_join3.txt +3 -0
- data/samples/join.rb +32 -0
- data/samples/logwordcount.rb +22 -0
- data/samples/project.rb +24 -0
- data/samples/rename.rb +21 -0
- data/samples/scorenames.rb +20 -0
- data/samples/splitter.rb +20 -0
- data/samples/union.rb +35 -0
- data/spec/cascading_spec.rb +100 -0
- data/spec/expr_spec.rb +10 -0
- data/spec/primary_key_spec.rb +119 -0
- data/spec/resource/join_input.txt +3 -0
- data/spec/resource/test_input.txt +4 -0
- data/spec/scope_spec.rb +174 -0
- data/spec/spec.opts +6 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/spec_util.rb +188 -0
- data/src/cascading/jruby/Main.java +38 -0
- data/src/cascading/jruby/runner.rb +6 -0
- data/tags +238 -0
- data/tasks/ann.rake +80 -0
- data/tasks/ant.rake +11 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +206 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +50 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/samples.rake +13 -0
- data/tasks/setup.rb +300 -0
- data/tasks/spec.rake +59 -0
- data/tasks/svn.rake +47 -0
- data/tasks/test.rake +42 -0
- data/test/data/data1.txt +14 -0
- data/test/data/data2.txt +14 -0
- data/test/test_assembly.rb +321 -0
- data/test/test_cascading.rb +49 -0
- data/test/test_flow.rb +15 -0
- metadata +137 -0
@@ -0,0 +1,726 @@
|
|
1
|
+
# Copyright 2009, Grégoire Marabout. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# This is free software. Please see the LICENSE and COPYING files for details.
|
4
|
+
|
5
|
+
require 'cascading/base'
|
6
|
+
require 'cascading/operations'
|
7
|
+
require 'cascading/ext/array'
|
8
|
+
|
9
|
+
module Cascading
|
10
|
+
class Assembly < Cascading::Node
|
11
|
+
include Operations
|
12
|
+
|
13
|
+
attr_accessor :tail_pipe, :head_pipe, :outgoing_scopes
|
14
|
+
|
15
|
+
# Creates an assembly called +name+ under +parent+.
#
# When +parent+ is another Assembly, the head pipe is chained onto the
# parent's tail pipe and the assembly starts from a copy of the parent's
# scope, renamed to +name+. Otherwise (the parent is a Flow) the head pipe
# stands alone and an empty scope is registered unless +outgoing_scopes+
# already holds an entry for +name+.
#
# +outgoing_scopes+ maps assembly names to scopes; the hash is stored as
# given, not copied.
def initialize(name, parent, outgoing_scopes = {})
  super(name, parent)

  @every_applied = false
  @outgoing_scopes = outgoing_scopes

  branching = parent.kind_of?(Assembly)
  @head_pipe = if branching
    Java::CascadingPipe::Pipe.new(name, parent.tail_pipe)
  else
    Java::CascadingPipe::Pipe.new(name)
  end

  if branching
    # Work on a copy so the destructive rename leaves the parent's scope alone.
    @outgoing_scopes[name] = parent.scope.copy
    scope.scope.name = name
  else
    @outgoing_scopes[name] ||= Scope.empty_scope(name)
  end

  @tail_pipe = @head_pipe
end
|
31
|
+
|
32
|
+
# Walks up the parent chain and returns the first ancestor that is a Flow.
def parent_flow
  parent.kind_of?(Flow) ? parent : parent.parent_flow
end
|
36
|
+
|
37
|
+
# Returns the scope stored under this assembly's own name in the outgoing
# scopes table.
def scope
  scopes = @outgoing_scopes
  scopes[name]
end
|
40
|
+
|
41
|
+
# Prints this assembly's name and current scope to standard output.
def debug_scope
  report = "Current scope for '#{name}':\n #{scope}\n----------\n"
  puts report
end
|
44
|
+
|
45
|
+
# Declares the primary key of the current scope.
#
# The unnamed arguments name the key fields; a trailing options hash is
# stripped and ignored. With no arguments, or with +nil+ as the first
# argument, the primary key is cleared. The grouping primary key is then
# set to the same value.
def primary(*args)
  args.extract_options! # strip a trailing options hash; nothing in it is used
  key_given = !args.empty? && !args[0].nil?
  scope.primary_key_fields = key_given ? fields(args) : nil
  scope.grouping_primary_key_fields = scope.primary_key_fields
end
|
54
|
+
|
55
|
+
# Appends a pipe of class +type+ built from +parameters+ (see make_pipe)
# and clears the every_applied? flag. Returns false.
def make_each(type, *parameters)
  make_pipe(type, parameters)
  @every_applied = false
end
|
59
|
+
|
60
|
+
# Appends a pipe of class +type+ built from +parameters+, passing the
# current scope's grouping key fields on to make_pipe, and sets the
# every_applied? flag. Returns true.
def make_every(type, *parameters)
  grouping_key = scope.grouping_key_fields
  make_pipe(type, parameters, grouping_key)
  @every_applied = true
end
|
64
|
+
|
65
|
+
# Reports the flag that make_every sets to true and make_each resets to
# false, i.e. whether the pipe added last was built through make_every.
def every_applied?
  @every_applied
end
|
68
|
+
|
69
|
+
# Evaluates the aggregation +block+ that follows a grouping pipe, then
# tidies up the resulting fields. Does nothing when no block is given.
#
# After the block has been evaluated against this assembly:
# * for each incoming scope whose primary key is contained in its grouping
#   primary key, the value fields outside the primary key are aggregated
#   with +first+, except those that already are grouping fields of the
#   current scope;
# * if the last pipe added was an Every, the fields are re-bound to the
#   names of the current grouping fields.
#
# +group_fields+ is accepted from the callers but is not read here.
def do_every_block_and_rename_fields(group_fields, incoming_scopes, &block)
  return if block.nil?

  # TODO: this should really be instance evaled on an object
  # that only allows aggregation and buffer operations.
  instance_eval(&block)

  carried = []
  incoming_scopes.each do |incoming|
    key_fields = incoming.primary_key_fields
    next unless key_fields

    primary_key = key_fields.to_a
    grouping_primary_key = incoming.grouping_primary_key_fields.to_a
    # Only scopes whose whole primary key takes part in the grouping qualify.
    next unless (primary_key & grouping_primary_key) == primary_key

    carried << difference_fields(incoming.values_fields, incoming.primary_key_fields).to_a
  end
  # assert carried.flatten == carried.flatten.uniq

  # Do not "first" any field that is already one of the grouping fields.
  first_fields = carried.flatten - scope.grouping_fields.to_a
  unless first_fields.empty?
    first(*first_fields)
    puts "Firsting: #{first_fields.inspect} in assembly: #{@name}"
  end

  bind_names(scope.grouping_fields.to_a) if every_applied?
end
|
98
|
+
|
99
|
+
# Instantiates +type+ with +parameters+, makes the result the new tail pipe
# and recomputes this assembly's outgoing scope from the new pipe,
# +incoming_scopes+, +grouping_key_fields+ and the every_applied? flag.
def make_pipe(type, parameters, grouping_key_fields = [], incoming_scopes = [scope])
  pipe = type.new(*parameters)
  @tail_pipe = pipe
  @outgoing_scopes[name] = Scope.outgoing_scope(pipe, incoming_scopes, grouping_key_fields, every_applied?)
end
|
103
|
+
|
104
|
+
def to_s
|
105
|
+
"#{@name} : head pipe : #{@head_pipe} - tail pipe: #{@tail_pipe}"
|
106
|
+
end
|
107
|
+
|
108
|
+
# Builds a join (CoGroup) pipe over assemblies of the parent flow.
#
# The unnamed arguments are the names of the assemblies to join.
#
# The named options are:
# * <tt>:on</tt> the join key. Either a field name or an array of field
#   names used for every joined assembly, or a hash mapping each assembly
#   name to its own key field(s). With a hash, the joined assemblies are the
#   hash keys in sorted order; the unnamed arguments are still checked for
#   existence but are otherwise unused.
# * <tt>:declared_fields</tt> the names of the output fields. Defaults to
#   +dedup_fields+ applied to the incoming value fields.
# * <tt>:joiner</tt> one of <tt>:inner</tt> (default), <tt>:left</tt>,
#   <tt>:right</tt>, <tt>:outer</tt>, or an array with one entry per
#   assembly (+true+, 1 or <tt>:inner</tt> / +false+, 0 or <tt>:outer</tt>)
#   to build a mixed join. Any other value is handed to CoGroup unchanged.
#
# An optional block of aggregations is evaluated after the join.
#
# Raises if a named assembly cannot be found, if no key fields can be
# derived from the arguments, or if a mixed joiner entry is invalid.
def join(*args, &block)
  options = args.extract_options!

  pipes, incoming_scopes = [], []
  args.each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    raise "Could not find assembly '#{assembly_name}' in join" unless assembly

    pipes << assembly.tail_pipe
    incoming_scopes << @outgoing_scopes[assembly.name]
  end

  group_fields_args = options.delete(:on)
  group_fields_args = [group_fields_args] if group_fields_args.kind_of?(String)
  # Array() rather than #to_a: String#to_a only exists on Ruby 1.8.
  group_fields_names = Array(group_fields_args)
  group_fields = []
  if group_fields_args.kind_of?(Array)
    pipes.size.times { group_fields << fields(group_fields_args) }
  elsif group_fields_args.kind_of?(Hash)
    # The hash decides which assemblies are joined, in sorted key order.
    pipes, incoming_scopes = [], []
    keys = group_fields_args.keys.sort
    group_fields_names = Array(group_fields_args[keys.first])
    keys.each do |assembly_name|
      assembly = parent_flow.find_child(assembly_name)
      raise "Could not find assembly '#{assembly_name}' in join" unless assembly

      pipes << assembly.tail_pipe
      incoming_scopes << @outgoing_scopes[assembly.name]
      group_fields << fields(group_fields_args[assembly_name])
    end
  end

  # Without key fields the CoGroup below cannot be built; fail early and clearly.
  if group_fields.empty?
    raise "Could not determine the key fields of the join; check the assembly names and the :on option"
  end

  group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
  incoming_fields = incoming_scopes.map { |s| s.values_fields }
  declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
  joiner = options.delete(:joiner)

  if declared_fields
    case joiner
    when :inner, "inner", nil
      joiner = Java::CascadingPipeCogroup::InnerJoin.new
    when :left, "left"
      joiner = Java::CascadingPipeCogroup::LeftJoin.new
    when :right, "right"
      joiner = Java::CascadingPipeCogroup::RightJoin.new
    when :outer, "outer"
      joiner = Java::CascadingPipeCogroup::OuterJoin.new
    when Array
      joiner = joiner.map do |t|
        case t
        when true, 1, :inner then true
        when false, 0, :outer then false
        else fail "invalid mixed joiner entry: #{t}"
        end
      end
      joiner = Java::CascadingPipeCogroup::MixedJoin.new(joiner.to_java(:boolean))
    end
  end

  parameters = [pipes.to_java(Java::CascadingPipe::Pipe), group_fields, declared_fields, joiner].compact
  grouping_key_fields = group_fields[0] # Left key group wins
  make_pipe(Java::CascadingPipe::CoGroup, parameters, grouping_key_fields, incoming_scopes)
  do_every_block_and_rename_fields(group_fields_names, incoming_scopes, &block)
end
alias co_group join
|
179
|
+
|
180
|
+
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:inner</tt>.
def inner_join(*args, &block)
  options = args.extract_options!
  options[:joiner] = :inner
  join(*args.push(options), &block)
end
|
186
|
+
|
187
|
+
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:left</tt>.
def left_join(*args, &block)
  options = args.extract_options!
  options[:joiner] = :left
  join(*args.push(options), &block)
end
|
193
|
+
|
194
|
+
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:right</tt>.
def right_join(*args, &block)
  options = args.extract_options!
  options[:joiner] = :right
  join(*args.push(options), &block)
end
|
200
|
+
|
201
|
+
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:outer</tt>.
def outer_join(*args, &block)
  options = args.extract_options!
  options[:joiner] = :outer
  join(*args.push(options), &block)
end
|
207
|
+
|
208
|
+
# Builds a new branch: a child assembly called +name+ that has this assembly
# as parent and shares its outgoing scopes table. The block is evaluated in
# the context of the new assembly, which is returned.
#
# Raises if no block is given.
def branch(name, &block)
  raise "Could not build branch '#{name}'; block required" if block.nil?

  child = Assembly.new(name, self, @outgoing_scopes)
  add_child(child)
  child.instance_eval(&block)
  child
end
|
216
|
+
|
217
|
+
# Builds a new _group_by_ pipe. The unnamed arguments are the fields to
# group on.
#
# The named options are:
# * <tt>:sort_by</tt> the fields to sort on (defaults to the grouping fields).
# * <tt>:reverse</tt> passed on to GroupBy when given.
#
# An optional block of aggregations is evaluated after the grouping.
def group_by(*args, &block)
  options = args.extract_options!

  grouping = fields(args)
  sorting = fields(options[:sort_by] || args)
  pipe_args = [@tail_pipe, grouping, sorting, options[:reverse]].compact

  make_pipe(Java::CascadingPipe::GroupBy, pipe_args, grouping)
  do_every_block_and_rename_fields(args, [scope], &block)
end
|
231
|
+
|
232
|
+
# Unifies several pipes sharing the same field structure.
# This actually creates a GroupBy pipe.
# It expects an array of assembly names as its first argument.
#
# Raises if one of the named assemblies cannot be found in the parent flow.
def union_pipes(*args)
  pipes, incoming_scopes = [], []
  args[0].each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    # Fail with the same kind of message as join instead of a NoMethodError on nil.
    raise "Could not find assembly '#{assembly_name}' in union" unless assembly

    pipes << assembly.tail_pipe
    incoming_scopes << @outgoing_scopes[assembly.name]
  end

  # Groups only on the 1st field (see line 186 of GroupBy.java)
  grouping_key_fields = fields(incoming_scopes.first.values_fields.get(0))
  make_pipe(Java::CascadingPipe::GroupBy, [pipes.to_java(Java::CascadingPipe::Pipe)], grouping_key_fields, incoming_scopes)
  # TODO: Shouldn't union_pipes accept an every block?
  #do_every_block_and_rename_fields(args, incoming_scopes, &block)
end
|
249
|
+
|
250
|
+
# Builds a basic _every_ pipe, and adds it to the current assembly.
#
# The unnamed arguments are the input fields.
#
# The named options are:
# * <tt>:aggregator</tt> or <tt>:buffer</tt> the operation to apply
#   (<tt>:aggregator</tt> wins when both are given).
# * <tt>:output</tt> the outgoing fields.
def every(*args)
  options = args.extract_options!
  operation = options[:aggregator] || options[:buffer]
  pipe_args = [@tail_pipe, fields(args), operation, fields(options[:output])].compact
  make_every(Java::CascadingPipe::Every, *pipe_args)
end
|
261
|
+
|
262
|
+
# Builds a basic _each_ pipe, and adds it to the current assembly.
#
# The unnamed arguments are the input fields.
#
# The named options are:
# * <tt>:filter</tt> or <tt>:function</tt> the operation to apply
#   (<tt>:filter</tt> wins when both are given).
# * <tt>:output</tt> the outgoing fields.
# --
# Example:
#   each "line", :filter=>regex_splitter(["name", "val1", "val2", "id"],
#                                        :pattern => /[.,]*\s+/),
#        :output=>["id", "name", "val1", "val2"]
def each(*args)
  options = args.extract_options!
  operation = options[:filter] || options[:function]
  pipe_args = [@tail_pipe, fields(args), operation, fields(options[:output])].compact
  make_each(Java::CascadingPipe::Each, *pipe_args)
end
|
278
|
+
|
279
|
+
# Restricts the current assembly to the specified fields by applying an
# Identity operation to them.
# --
# Example:
#   project "field1", "field2"
def project(*args)
  kept = fields(args)
  identity = Java::CascadingOperation::Identity.new
  make_each(Java::CascadingPipe::Each, @tail_pipe, kept, identity)
end
|
288
|
+
|
289
|
+
# Removes the specified fields from the current assembly by projecting onto
# the remaining value fields of the current scope.
# --
# Example:
#   discard "field1", "field2"
def discard(*args)
  unwanted = fields(args)
  remaining = difference_fields(scope.values_fields, unwanted)
  project(*remaining.to_a)
end
|
298
|
+
|
299
|
+
# Assigns new names to the incoming fields in positional order.
# --
# Example:
#   bind_names "field1", "field2"
def bind_names(*new_names)
  renamer = Java::CascadingOperation::Identity.new(fields(new_names))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, renamer)
end
|
308
|
+
|
309
|
+
# Renames fields according to the mapping provided; fields that are not
# mentioned keep their name. The primary key of the scope is re-declared
# under the new names afterwards.
#
# Raises if the mapping mentions a name that is not among the current
# value fields.
# --
# Example:
#   rename "old_name" => "new_name"
def rename(name_map)
  translate = lambda { |field| name_map[field] || field }

  current_names = scope.values_fields.to_a
  unknown = name_map.keys.sort - current_names
  raise "invalid names: #{unknown.inspect}" unless unknown.empty?

  # Read the key before the new pipe replaces the current scope.
  renamed_key = scope.primary_key_fields.to_a.map(&translate)

  renamer = Java::CascadingOperation::Identity.new(fields(current_names.map(&translate)))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, renamer)
  primary(*renamed_key)
end
|
327
|
+
|
328
|
+
# Casts fields to new types. +type_map+ maps field names to keys of
# JAVA_TYPE_MAP; the fields are handled in sorted name order.
def cast(type_map)
  names = type_map.keys.sort
  java_types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names))
  cast_fields = fields(names)
  caster = Java::CascadingOperation::Identity.new(cast_fields, java_types.to_java(java.lang.Class))
  make_each(Java::CascadingPipe::Each, @tail_pipe, cast_fields, caster)
end
|
336
|
+
|
337
|
+
# Copies fields under new names with an Identity operation and outputs all
# fields.
#
# The first unnamed argument names the source fields (all fields by
# default). The target names come from the second unnamed argument, else
# from the <tt>:into</tt> option, else default to all fields.
def copy(*args)
  options = args.extract_options!
  source = args[0] || all_fields
  target = args[1] || options[:into] || all_fields
  copier = Java::CascadingOperation::Identity.new(fields(target))
  make_each(Java::CascadingPipe::Each, @tail_pipe, fields(source), copier, Java::CascadingTuple::Fields::ALL)
end
|
344
|
+
|
345
|
+
# A pipe that does nothing: applies an Identity operation to all fields.
# Any arguments are ignored.
def pass(*args)
  identity = Java::CascadingOperation::Identity.new
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, identity)
end
|
350
|
+
|
351
|
+
# Adds an Each pipe carrying the assertion given as first unnamed argument.
#
# The named options are:
# * <tt>:level</tt> the assertion level (defaults to AssertionLevel::STRICT).
def assert(*args)
  options = args.extract_options!
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_each(Java::CascadingPipe::Each, @tail_pipe, level, args[0])
end
|
357
|
+
|
358
|
+
# Adds an Every pipe carrying the group assertion given as first unnamed
# argument.
#
# The named options are:
# * <tt>:level</tt> the assertion level (defaults to AssertionLevel::STRICT).
def assert_group(*args)
  options = args.extract_options!
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_every(Java::CascadingPipe::Every, @tail_pipe, level, args[0])
end
|
364
|
+
|
365
|
+
# Builds a debugging pipe.
#
# Without arguments, it generates a simple debug pipe applied to all fields.
#
# The named options are:
# * <tt>:print_fields</tt> a boolean handed to the Debug operation's
#   constructor (defaults to true).
# * <tt>:tuple_interval</tt> value for the operation's +print_tuple_every+
#   setting (defaults to 1).
# * <tt>:fields_interval</tt> value for the operation's
#   +print_fields_every+ setting (defaults to 10).
def debug(*args)
  options = args.extract_options!

  # `options[:print_fields] || true` could never be false; only fall back to
  # the default when the option is absent.
  print_fields = options[:print_fields]
  print_fields = true if print_fields.nil?

  debug_operation = Java::CascadingOperation::Debug.new(print_fields)
  debug_operation.print_tuple_every = options[:tuple_interval] || 1
  debug_operation.print_fields_every = options[:fields_interval] || 10
  each(all_fields, :filter => debug_operation)
end
|
382
|
+
|
383
|
+
# Builds a pipe that asserts the size of the tuple is the size given.
#
# The single unnamed argument is the expected size; a trailing options hash
# is passed on to assert.
def assert_size_equals(*args)
  options = args.extract_options!
  size_check = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0])
  assert(size_check, options)
end
|
391
|
+
|
392
|
+
# Builds a pipe that asserts that none of the fields in the tuple are null.
# A trailing options hash is passed on to assert.
def assert_not_null(*args)
  options = args.extract_options!
  null_check = Java::CascadingOperationAssertion::AssertNotNull.new
  assert(null_check, options)
end
|
398
|
+
|
399
|
+
# Builds a pipe carrying an AssertGroupSizeEquals assertion constructed from
# the first unnamed argument. A trailing options hash is passed on to
# assert_group.
def assert_group_size_equals(*args)
  options = args.extract_options!
  size_check = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(args[0])
  assert_group(size_check, options)
end
|
404
|
+
|
405
|
+
# Builds a series of every pipes for aggregation, one per field.
#
# +args+ is either a list of fields to aggregate followed by an options
# hash, or a hash that maps input field name to output field name (similar
# to insert, processed in sorted order) followed by an options hash. In the
# list form each field is aggregated into a field of the same name.
#
# The options hash is handed to the aggregator factory; options include:
# * <tt>:sql</tt> a boolean indicating whether the operation should act like the SQL equivalent
#
# <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
#
# Prints a warning when there is no field to aggregate.
def composite_aggregator(args, function)
  hash_form = !args.empty? && args.first.kind_of?(Hash)
  field_map = hash_form ? args.shift.sort : nil
  options = args.extract_options!
  field_map ||= args.zip(args)

  field_map.each do |in_field, out_field|
    aggregator = send(function, out_field, options)
    every(in_field, :aggregator => aggregator, :output => all_fields)
  end

  if field_map.empty?
    puts "WARNING: composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields; will be ignored"
  end
end
|
429
|
+
|
430
|
+
# Aggregation shortcuts. Each one hands its arguments to
# composite_aggregator together with the name of the matching aggregator
# factory method.
def min(*args)
  composite_aggregator(args, :min_function)
end

def max(*args)
  composite_aggregator(args, :max_function)
end

def first(*args)
  composite_aggregator(args, :first_function)
end

def last(*args)
  composite_aggregator(args, :last_function)
end

def average(*args)
  composite_aggregator(args, :average_function)
end
|
435
|
+
|
436
|
+
# Counts elements of a group. The first unnamed argument is the name of the
# output count field (defaults to 'count' if it is not provided). The
# options hash is handed to count_function.
def count(*args)
  options = args.extract_options!
  output_name = args[0] || 'count'
  every(last_grouping_fields, :aggregator => count_function(output_name, options), :output => all_fields)
end
|
443
|
+
|
444
|
+
# Sums fields, one every pipe per field.
#
# Fields to be summed may either be provided as unnamed arguments, in which
# case each is aggregated into a field of the same name in the given order,
# or through the <tt>:mapping</tt> option as a hash, in which case they are
# aggregated from the field named by the key into the field named by the
# value, in sorted order.
#
# The named options are:
# * <tt>:type</tt> a key of JAVA_TYPE_MAP (required; raises when it does not
#   resolve to a type).
# * <tt>:mapping</tt> see above.
def sum(*args)
  options = args.extract_options!
  type = JAVA_TYPE_MAP[options[:type]]
  raise "No type specified for sum" unless type

  pairs = options[:mapping] ? options[:mapping].sort : args.zip(args)
  pairs.each do |in_field, out_field|
    summer = sum_function(out_field, :type => type)
    every(in_field, :aggregator => summer, :output => all_fields)
  end
end
|
458
|
+
|
459
|
+
# Builds a _parse_ pipe. This pipe will parse the fields specified in input
# (the unnamed arguments), using a specified regex pattern.
#
# If provided, the unnamed arguments must be the fields to be parsed. If not
# provided, then all incoming fields are used.
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
#
# The whole options hash is also handed to regex_parser.
def parse(*args)
  options = args.extract_options!
  # args is always an Array, so `args || all_fields` never fell back;
  # test for emptiness to honour the documented default.
  fields = args.empty? ? all_fields : args
  pattern = options[:pattern]
  output = options[:output] || all_fields
  each(fields, :filter => regex_parser(pattern, options), :output => output)
end
|
475
|
+
|
476
|
+
# Builds a pipe that splits a field into other fields, using a specified regular expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is an array of strings indicating the fields
# receiving the result of the split.
#
# The named options are:
# * <tt>:into</tt> the receiving fields; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields (defaults to <tt>/[.,]*\s+/</tt>).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split(*args)
  options = args.extract_options!
  source, positional_targets = args
  targets = options[:into] || positional_targets
  pattern = options[:pattern] || /[.,]*\s+/
  output = options[:output] || all_fields
  splitter = regex_splitter(targets, :pattern => pattern)
  each(source, :function => splitter, :output => output)
end
|
491
|
+
|
492
|
+
# Builds a pipe that splits a field into new rows, using a specified regular expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is the field receiving the result of the split.
#
# The named options are:
# * <tt>:into</tt> the receiving field; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields (defaults to <tt>/[.,]*\s+/</tt>).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split_rows(*args)
  options = args.extract_options!
  source, positional_target = args
  target = options[:into] || positional_target
  pattern = options[:pattern] || /[.,]*\s+/
  output = options[:output] || all_fields
  generator = regex_split_generator(target, :pattern => pattern)
  each(source, :function => generator, :output => output)
end
|
507
|
+
|
508
|
+
# Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
#
# The first unnamed argument is the field to be matched against.
# The second unnamed argument is the field receiving the result of the match.
#
# The named options are:
# * <tt>:into</tt> the receiving field; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for matching the argument fields (defaults to <tt>/[\w]+/</tt>).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def match_rows(*args)
  options = args.extract_options!
  source, positional_target = args
  target = options[:into] || positional_target
  pattern = options[:pattern] || /[\w]+/
  output = options[:output] || all_fields
  generator = regex_generator(target, :pattern => pattern)
  each(source, :function => generator, :output => output)
end
|
523
|
+
|
524
|
+
# Builds a pipe that parses the specified field as a date using the provided format string.
# The unnamed argument specifies the field to parse.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it is the name of
#   the input field followed by "_parsed".
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def parse_date(*args)
  options = args.extract_options!
  source = args[0]
  target = options[:into] || "#{source}_parsed"
  output = options[:output] || all_fields
  pattern = options[:pattern] || "yyyy/MM/dd"

  each(source, :function => date_parser(target, pattern), :output => output)
end
|
540
|
+
|
541
|
+
# Builds a pipe that formats a date using a specified format pattern.
#
# The unnamed argument specifies the field to format.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it is the name of
#   the input field followed by "_formatted".
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:timezone</tt> a string. Specifies the timezone; handed to date_formatter as given.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def format_date(*args)
  options = args.extract_options!
  source = args[0]
  target = options[:into] || "#{source}_formatted"
  pattern = options[:pattern] || "yyyy/MM/dd"
  output = options[:output] || all_fields

  each(source, :function => date_formatter(target, pattern, options[:timezone]), :output => output)
end
|
559
|
+
|
560
|
+
# Builds a pipe that performs a query/replace based on a regular expression.
#
# The first unnamed argument specifies the input field.
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the pattern to look for in the input field.
#   It can also be given as the second unnamed argument.
# * <tt>:replacement</tt> a string. Specifies the replacement. It can also be given as the
#   third unnamed argument.
# * <tt>:into</tt> a string. The receiving field; by default the name of the input field
#   followed by "_replaced".
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def replace(*args)
  options = args.extract_options!
  source, positional_pattern, positional_replacement = args

  pattern = options[:pattern] || positional_pattern
  replacement = options[:replacement] || positional_replacement
  into = options[:into] || "#{source}_replaced"
  output = options[:output] || all_fields

  each(source, :function => regex_replace(into, pattern, replacement), :output => output)
end
|
579
|
+
|
580
|
+
# Builds a pipe that inserts values into the current tuple.
#
# The method takes a hash as parameter. This hash contains as keys the names
# of the fields to insert and as values, the values they must contain. For
# example:
#
#   insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
#
# will insert two new fields: a field _who_ containing the string
# "Grégoire", and a field _when_ containing the formatted current date.
#
# Fields are inserted in sorted name order, one pipe per field. A value that
# is an ExprStub is inserted through an expression function built from its
# expression and types; any other value is inserted as a constant.
# The method outputs all fields.
def insert(args)
  args.keys.sort.each do |field_name|
    value = args[field_name]

    inserter = if value.kind_of?(ExprStub)
      expression_function(field_name, :expression => value.expression,
                                      :parameters => value.types)
    else
      insert_function([field_name], :values => [value])
    end

    each(all_fields, :function => inserter, :output => all_fields)
  end
end
|
604
|
+
|
605
|
+
# Builds a pipe that filters the tuples based on an expression or a pattern (but not both !).
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:from</tt> the input fields (all fields by default).
# * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
#   option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based. This is incompatible with the _pattern_ option.
#
# When both an expression and a pattern are given the expression wins; when
# neither is given no pipe is added. Remaining options are handed to
# regex_filter in the pattern case.
def filter(*args)
  options = args.extract_options!
  from = options.delete(:from) || all_fields
  expression = options.delete(:expression) || args.shift
  regex = options.delete(:pattern)

  if expression
    stub = ExprStub.new(expression)
    expression_based = expression_filter(:parameters => stub.types, :expression => stub.expression)
    each(from, :filter => expression_based)
  elsif regex
    each(from, :filter => regex_filter(regex, options))
  end
end
|
632
|
+
|
633
|
+
# Adds an each pipe applying Cascading's FilterNull filter to the fields
# given as unnamed arguments. Also available as reject_null.
def filter_null(*args)
  args.extract_options! # strip a trailing options hash; nothing in it is used
  null_filter = Java::CascadingOperationFilter::FilterNull.new
  each(args, :filter => null_filter)
end
alias reject_null filter_null
|
638
|
+
|
639
|
+
# Adds an each pipe applying Cascading's FilterNotNull filter to the fields
# given as unnamed arguments. Also available as where_null.
def filter_not_null(*args)
  args.extract_options! # strip a trailing options hash; nothing in it is used
  not_null_filter = Java::CascadingOperationFilter::FilterNotNull.new
  each(args, :filter => not_null_filter)
end
alias where_null filter_not_null
|
644
|
+
|
645
|
+
# Builds a pipe that rejects the tuples based on an expression. The
# arguments are handed to filter unchanged.
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based.
#
# Raises if a <tt>:pattern</tt> option is given.
def reject(*args)
  options = args.extract_options
  pattern_given = options && options[:pattern]
  raise "Regex not allowed" if pattern_given

  filter(*args)
end
|
659
|
+
|
660
|
+
# Builds a pipe that includes just the tuples matching an expression. The
# expression is negated and then handed to filter.
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based.
#
# Raises if a <tt>:pattern</tt> option is given.
def where(*args)
  options = args.extract_options
  raise "Regex not allowed" if options && options[:pattern]

  # Guard against a nil options value here too, as the check above does.
  if options && options[:expression]
    options[:expression] = "!(#{options[:expression]})"
  elsif args[0] && !args[0].kind_of?(Hash)
    # Only a real positional expression is negated; a lone options hash
    # must not be turned into an expression string.
    args[0] = "!(#{args[0]})"
  end

  filter(*args)
end
|
680
|
+
|
681
|
+
# Builds a pipe that evaluates the specified Janino expression and inserts the result in a new field in the tuple.
#
# The expression and its parameters may also be given, in that order, as
# unnamed arguments; an unnamed argument is only consumed when the matching
# option is absent.
#
# The named options are:
# * <tt>:from</tt> a string or array of strings. Specifies the input fields (all fields by default).
# * <tt>:expression</tt> a string. The Janino expression.
# * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
# * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields by default).
def eval_expression(*args)
  options = args.extract_options!

  target = options.delete(:into)
  source = options.delete(:from) || all_fields
  output = options.delete(:output) || all_fields
  options[:expression] = args.shift unless options[:expression]
  options[:parameters] = args.shift unless options[:parameters]

  each(source, :function => expression_function(target, options), :output => output)
end
|
699
|
+
|
700
|
+
# Meant to build a pipe that returns distinct tuples based on the provided
# fields (all fields, by default). The operation is currently disabled: it
# always raises.
def distinct(*args)
  # Intended implementation, kept for reference until it is repaired:
  #   fields = args[0] || all_fields
  #   group_by *fields
  #   pass
  raise "Distinct is badly broken"
end
|
710
|
+
|
711
|
+
# Builds a pipe that will unify (merge) pipes. The method accepts the names
# of the assemblies to unify as unnamed arguments; a trailing options hash
# is stripped and ignored.
# Tuples unified must share the same fields.
def union(*args)
  args.extract_options! # strip a trailing options hash; nothing in it is used
  union_pipes(args)
end
|
718
|
+
|
719
|
+
# Adds an each pipe applying a field_joiner function to the fields given as
# unnamed arguments. The whole options hash is handed to field_joiner.
#
# The named options are:
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def join_fields(*args)
  options = args.extract_options!
  output = options[:output] || all_fields

  each(args, :function => field_joiner(options), :output => output)
end
|
725
|
+
end
|
726
|
+
end
|