cascading.jruby 0.0.10 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,15 +1,50 @@
1
1
  require 'cascading/base'
2
2
  require 'cascading/operations'
3
+ require 'cascading/identity_operations'
4
+ require 'cascading/filter_operations'
5
+ require 'cascading/regex_operations'
6
+ require 'cascading/text_operations'
3
7
  require 'cascading/aggregations'
4
8
  require 'cascading/sub_assembly'
5
9
  require 'cascading/ext/array'
6
10
 
7
11
  module Cascading
12
+ # An Assembly is a sequence of Cascading pipes (Each, GroupBy, CoGroup,
13
+ # Every, and SubAssembly). This class will serve as your primary mechanism
14
+ # for doing work within a flow and contains all the functions and filters you
15
+ # will apply to a pipe (Eaches), as well as group_by, union, and join. For
16
+ # aggregators and buffers, please see Aggregations.
17
+ #
18
+ # Function and filter DSL rules:
19
+ # * Use positional arguments for required parameters
20
+ # * Use options = {} for optional parameters
21
+ # * Use *args sparingly, specifically when you need to accept a varying length list of fields
22
+ # * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension
23
+ # * If you choose to name a required parameter, add it to options = {} and throw an exception if the caller does not provide it
24
+ # * If you have a required parameter that is provided by one of a set of option names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example)
25
+ #
26
+ # Function and filter DSL standard optional parameter names:
27
+ # [input] c.p.Each argument selector
28
+ # [into] c.o.Operation field declaration
29
+ # [output] c.p.Each output selector
30
+ #
31
+ # A note on aliases: when a DSL method uniquely wraps a single Cascading
32
+ # operation, we attempt to provide an alias that matches the Cascading
33
+ # operation. However, Cascading operations are often nouns rather than verbs,
34
+ # and the latter are preferable for a dataflow DSL.
8
35
  class Assembly < Cascading::Node
9
- include Operations
10
-
11
36
  attr_reader :head_pipe, :tail_pipe
12
37
 
38
+ # Do not use this constructor directly; instead, use Flow#assembly or
39
+ # Assembly#branch to build assemblies.
40
+ #
41
+ # Builds an Assembly given a name, parent, and optional outgoing_scopes
42
+ # (necessary only for branching).
43
+ #
44
+ # An assembly's name is quite important as it will determine:
45
+ # * The sources from which it will read, if any
46
+ # * The name to be used in joins or unions downstream
47
+ # * The name to be used to sink the output of the assembly downstream
13
48
  def initialize(name, parent, outgoing_scopes = {})
14
49
  super(name, parent)
15
50
 
@@ -27,6 +62,11 @@ module Cascading
27
62
  @incoming_scopes = [scope]
28
63
  end
29
64
 
65
+ # Produces a textual description of this Assembly. The description details
66
+ # the structure of the Assembly, its input and output fields and any
67
+ # children (branches). The offset parameter allows for this describe to be
68
+ # nested within a calling context, which lets us indent the structural
69
+ # hierarchy of a job.
30
70
  def describe(offset = '')
31
71
  incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
32
72
  incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1
@@ -35,199 +75,231 @@ module Cascading
35
75
  description
36
76
  end
37
77
 
78
+ # Rather than the immediate parent, this method returns the parent flow of
79
+ # this Assembly. If this is a branch, we must traverse the parents of
80
+ # parent assemblies.
38
81
  def parent_flow
39
82
  return parent if parent.kind_of?(Flow)
40
83
  parent.parent_flow
41
84
  end
42
85
 
86
+ # Accesses the outgoing scope of this Assembly at the point at which it is
87
+ # called. This is useful for grabbing the values_fields at any point in
88
+ # the construction of the Assembly. See Scope for details.
43
89
  def scope
44
90
  @outgoing_scopes[name]
45
91
  end
46
92
 
93
+ # Prints information about the scope of this Assembly at the point at which
94
+ # it is called. This allows you to trace the propagation of field names
95
+ # through your job and is handy for debugging. See Scope for details.
47
96
  def debug_scope
48
97
  puts "Current scope for '#{name}':\n #{scope}\n----------\n"
49
98
  end
50
99
 
51
- def make_pipe(type, parameters)
52
- @tail_pipe = type.new(*parameters)
53
- @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
54
-
55
- tail_pipe
56
- end
57
- private :make_pipe
58
-
59
- def populate_incoming_scopes(assembly_names, group_fields_args = {})
60
- # NOTE: this overrides the existing incoming_scopes, which changes the
61
- # way describe will function on this assembly
62
- pipes, @incoming_scopes, group_fields = [], [], []
63
- assembly_names.each do |assembly_name|
64
- assembly = parent_flow.find_child(assembly_name)
65
- raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
66
-
67
- pipes << assembly.tail_pipe
68
- @incoming_scopes << assembly.scope
69
- group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
70
- end
71
- [pipes, group_fields]
72
- end
73
- private :populate_incoming_scopes
74
-
75
- def apply_aggregations(group, incoming_scopes, &block)
76
- aggregations = Aggregations.new(self, group, incoming_scopes)
77
- aggregations.instance_eval(&block) if block_given?
78
-
79
- # Sorting of any type means that we cannot use the AggregateBy optimization
80
- if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
81
- grouping_fields = group.key_selectors.values.first
82
- group.key_selectors.values.each do |key_fields|
83
- raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
84
- end
85
-
86
- aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
87
- name,
88
- group.previous,
89
- grouping_fields,
90
- aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
91
- ), group.previous, incoming_scopes)
92
-
93
- aggregate_by
94
- else
95
- aggregations.finalize if block_given?
96
- @tail_pipe = aggregations.tail_pipe
97
- @outgoing_scopes[name] = aggregations.scope
98
-
99
- group
100
- end
101
- end
102
- private :apply_aggregations
103
-
100
+ # Prints detail about this Assembly including its name, head pipe, and tail
101
+ # pipe.
104
102
  def to_s
105
103
  "#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}"
106
104
  end
107
105
 
108
- def prepare_join(*args, &block)
109
- options = args.extract_options!
110
-
111
- pipes, _ = populate_incoming_scopes(args)
112
-
113
- group_fields_args = options[:on]
114
- raise 'join requires :on parameter' unless group_fields_args
115
-
116
- if group_fields_args.kind_of?(String)
117
- group_fields_args = [group_fields_args]
118
- end
119
-
120
- group_fields = []
121
- if group_fields_args.kind_of?(Array)
122
- pipes.size.times do
123
- group_fields << fields(group_fields_args)
124
- end
125
- elsif group_fields_args.kind_of?(Hash)
126
- pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
127
- else
128
- raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
129
- end
130
-
131
- raise 'join requires non-empty :on parameter' if group_fields_args.empty?
132
- group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
133
- incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
134
- declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
135
- joiner = options[:joiner]
136
- is_hash_join = options[:hash] || false
137
-
138
- case joiner
139
- when :inner, 'inner', nil
140
- joiner = Java::CascadingPipeJoiner::InnerJoin.new
141
- when :left, 'left'
142
- joiner = Java::CascadingPipeJoiner::LeftJoin.new
143
- when :right, 'right'
144
- joiner = Java::CascadingPipeJoiner::RightJoin.new
145
- when :outer, 'outer'
146
- joiner = Java::CascadingPipeJoiner::OuterJoin.new
147
- when Array
148
- joiner = joiner.map do |t|
149
- case t
150
- when true, 1, :inner then true
151
- when false, 0, :outer then false
152
- else fail "invalid mixed joiner entry: #{t}"
153
- end
154
- end
155
- joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
156
- end
157
-
158
- if is_hash_join
159
- raise ArgumentError, "hash joins don't support aggregations" if block_given?
160
- parameters = [
161
- pipes.to_java(Java::CascadingPipe::Pipe),
162
- group_fields,
163
- declared_fields,
164
- joiner
165
- ]
166
- group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
167
- else
168
- result_group_fields = dedup_fields(*group_fields)
169
- parameters = [
170
- pipes.to_java(Java::CascadingPipe::Pipe),
171
- group_fields,
172
- declared_fields,
173
- result_group_fields,
174
- joiner
175
- ]
176
- group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
177
- end
178
- apply_aggregations(group_assembly, @incoming_scopes, &block)
179
- end
180
- private :prepare_join
181
-
182
106
  # Builds a HashJoin pipe. This should be used carefully, as the right side
183
- # of the join is accumulated entirely in memory. Requires a list of assembly
184
- # names to join and :on to specify the join_fields.
185
- def hash_join(*args, &block)
186
- options = args.extract_options!
107
+ # of the join is accumulated entirely in memory. Requires a list of
108
+ # assembly names to join and :on to specify the join_fields. Note that a
109
+ # hash_join "takes over" the Assembly in which it is built, so it is
110
+ # typically the first statement within the block of the assembly or branch.
111
+ # Additionally, a hash join does not accept a block for aggregations like
112
+ # other joins; this restriction is enforced here, but comes directly from
113
+ # Cascading.
114
+ #
115
+ # The named options are:
116
+ # [on] The keys of the join, an array of strings if they are the same in
117
+ # all inputs, or a hash mapping assembly names to key names if they
118
+ # differ across inputs.
119
+ # [declared_fields] By default, a deduplicated array of incoming field
120
+ # names (see Cascading::dedup_fields). Specifies the
121
+ # names of the fields that will be available to
122
+ # aggregations or post-join if no aggregations are
123
+ # specified.
124
+ # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
125
+ # and 'inner', :right and 'right' are accepted, as well as an
126
+ # array specifying mixed joins. Typically, this is not provided,
127
+ # but one of the higher level join methods on Assembly is used
128
+ # directly (like Assembly#inner_join or Assembly#right_join).
129
+ #
130
+ # Example:
131
+ # assembly 'join_left_right' do
132
+ # hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner
133
+ # end
134
+ def hash_join(*args_with_options)
135
+ raise ArgumentError, "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored" if block_given?
136
+
137
+ options, assembly_names = args_with_options.extract_options!, args_with_options
187
138
  options[:hash] = true
188
- args << options
189
- prepare_join(*args, &block)
139
+ prepare_join(assembly_names, options)
190
140
  end
191
141
 
192
142
  # Builds a join (CoGroup) pipe. Requires a list of assembly names to join
193
- # and :on to specify the group_fields.
194
- def join(*args, &block)
195
- options = args.extract_options!
143
+ # and :on to specify the group_fields. Note that a join "takes over" the
144
+ # Assembly in which it is built, so it is typically the first statement
145
+ # within the block of the assembly or branch. The block passed to this
146
+ # method will be evaluated in the context of Aggregations, not Assembly.
147
+ #
148
+ # The named options are:
149
+ # [on] The keys of the join, an array of strings if they are the same in
150
+ # all inputs, or a hash mapping assembly names to key names if they
151
+ # differ across inputs.
152
+ # [declared_fields] By default, a deduplicated array of incoming field
153
+ # names (see Cascading::dedup_fields). Specifies the
154
+ # names of the fields that will be available to
155
+ # aggregations or post-join if no aggregations are
156
+ # specified.
157
+ # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
158
+ # and 'inner', :right and 'right' are accepted, as well as an
159
+ # array specifying mixed joins. Typically, this is not provided,
160
+ # but one of the higher level join methods on Assembly is used
161
+ # directly (like Assembly#inner_join or Assembly#right_join).
162
+ #
163
+ # Example:
164
+ # assembly 'join_left_right' do
165
+ # join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do
166
+ # sum 'val1', 'val2', :type => :long
167
+ # end
168
+ # end
169
+ def join(*args_with_options, &block)
170
+ options, assembly_names = args_with_options.extract_options!, args_with_options
196
171
  options[:hash] = false
197
- args << options
198
- prepare_join(*args, &block)
172
+ prepare_join(assembly_names, options, &block)
199
173
  end
200
174
  alias co_group join
201
175
 
202
- def inner_join(*args, &block)
203
- options = args.extract_options!
176
+ # Builds an inner join (CoGroup) pipe. Requires a list of assembly names to
177
+ # join and :on to specify the group_fields.
178
+ #
179
+ # The named options are:
180
+ # [on] The keys of the join, an array of strings if they are the same in
181
+ # all inputs, or a hash mapping assembly names to key names if they
182
+ # differ across inputs.
183
+ # [declared_fields] By default, a deduplicated array of incoming field
184
+ # names (see Cascading::dedup_fields). Specifies the
185
+ # names of the fields that will be available to
186
+ # aggregations or post-join if no aggregations are
187
+ # specified.
188
+ #
189
+ # Example:
190
+ # assembly 'join_left_right' do
191
+ # inner_join 'left', 'right', :on => ['key1', 'key2'] do
192
+ # sum 'val1', 'val2', :type => :long
193
+ # end
194
+ # end
195
+ def inner_join(*args_with_options, &block)
196
+ options = args_with_options.extract_options!
204
197
  options[:joiner] = :inner
205
- args << options
206
- join(*args, &block)
198
+ args_with_options << options
199
+ join(*args_with_options, &block)
207
200
  end
208
201
 
209
- def left_join(*args, &block)
210
- options = args.extract_options!
202
+ # Builds a left join (CoGroup) pipe. Requires a list of assembly names to
203
+ # join and :on to specify the group_fields.
204
+ #
205
+ # The named options are:
206
+ # [on] The keys of the join, an array of strings if they are the same in
207
+ # all inputs, or a hash mapping assembly names to key names if they
208
+ # differ across inputs.
209
+ # [declared_fields] By default, a deduplicated array of incoming field
210
+ # names (see Cascading::dedup_fields). Specifies the
211
+ # names of the fields that will be available to
212
+ # aggregations or post-join if no aggregations are
213
+ # specified.
214
+ #
215
+ # Example:
216
+ # assembly 'join_left_right' do
217
+ # left_join 'left', 'right', :on => ['key1', 'key2'] do
218
+ # sum 'val1', 'val2', :type => :long
219
+ # end
220
+ # end
221
+ def left_join(*args_with_options, &block)
222
+ options = args_with_options.extract_options!
211
223
  options[:joiner] = :left
212
- args << options
213
- join(*args, &block)
224
+ args_with_options << options
225
+ join(*args_with_options, &block)
214
226
  end
215
227
 
216
- def right_join(*args, &block)
217
- options = args.extract_options!
228
+ # Builds a right join (CoGroup) pipe. Requires a list of assembly names to
229
+ # join and :on to specify the group_fields.
230
+ #
231
+ # The named options are:
232
+ # [on] The keys of the join, an array of strings if they are the same in
233
+ # all inputs, or a hash mapping assembly names to key names if they
234
+ # differ across inputs.
235
+ # [declared_fields] By default, a deduplicated array of incoming field
236
+ # names (see Cascading::dedup_fields). Specifies the
237
+ # names of the fields that will be available to
238
+ # aggregations or post-join if no aggregations are
239
+ # specified.
240
+ #
241
+ # Example:
242
+ # assembly 'join_left_right' do
243
+ # right_join 'left', 'right', :on => ['key1', 'key2'] do
244
+ # sum 'val1', 'val2', :type => :long
245
+ # end
246
+ # end
247
+ def right_join(*args_with_options, &block)
248
+ options = args_with_options.extract_options!
218
249
  options[:joiner] = :right
219
- args << options
220
- join(*args, &block)
250
+ args_with_options << options
251
+ join(*args_with_options, &block)
221
252
  end
222
253
 
223
- def outer_join(*args, &block)
224
- options = args.extract_options!
254
+ # Builds an outer join (CoGroup) pipe. Requires a list of assembly names to
255
+ # join and :on to specify the group_fields.
256
+ #
257
+ # The named options are:
258
+ # [on] The keys of the join, an array of strings if they are the same in
259
+ # all inputs, or a hash mapping assembly names to key names if they
260
+ # differ across inputs.
261
+ # [declared_fields] By default, a deduplicated array of incoming field
262
+ # names (see Cascading::dedup_fields). Specifies the
263
+ # names of the fields that will be available to
264
+ # aggregations or post-join if no aggregations are
265
+ # specified.
266
+ #
267
+ # Example:
268
+ # assembly 'join_left_right' do
269
+ # outer_join 'left', 'right', :on => ['key1', 'key2'] do
270
+ # sum 'val1', 'val2', :type => :long
271
+ # end
272
+ # end
273
+ def outer_join(*args_with_options, &block)
274
+ options = args_with_options.extract_options!
225
275
  options[:joiner] = :outer
226
- args << options
227
- join(*args, &block)
276
+ args_with_options << options
277
+ join(*args_with_options, &block)
228
278
  end
229
279
 
230
- # Builds a new branch.
280
+ # Builds a child Assembly that branches this Assembly given a name and
281
+ # block.
282
+ #
283
+ # An assembly's name is quite important as it will determine:
284
+ # * The sources from which it will read, if any
285
+ # * The name to be used in joins or unions downstream
286
+ # * The name to be used to sink the output of the assembly downstream
287
+ #
288
+ # Many branches may be built within an assembly. The result of a branch is
289
+ # the same as the Flow#assembly constructor, an Assembly object.
290
+ #
291
+ # Example:
292
+ # assembly 'some_work' do
293
+ # ...
294
+ #
295
+ # branch 'more_work' do
296
+ # ...
297
+ # end
298
+ #
299
+ # branch 'yet_more_work' do
300
+ # ...
301
+ # end
302
+ # end
231
303
  def branch(name, &block)
232
304
  raise "Could not build branch '#{name}'; block required" unless block_given?
233
305
  assembly = Assembly.new(name, self, @outgoing_scopes)
@@ -236,11 +308,27 @@ module Cascading
236
308
  assembly
237
309
  end
238
310
 
239
- # Builds a new GroupBy pipe that groups on the fields given in args.
240
- # Any block passed to this method should contain only Everies.
241
- def group_by(*args, &block)
242
- options = args.extract_options!
243
- group_fields = fields(args)
311
+ # Builds a new GroupBy pipe that groups on the fields given in
312
+ # args_with_options. The block passed to this method will be evaluated in
313
+ # the context of Aggregations, not Assembly.
314
+ #
315
+ # The named options are:
316
+ # [sort_by] Optional keys for within-group sort.
317
+ # [reverse] Boolean that can reverse the order of within-group sorting
318
+ # (only makes sense given :sort_by keys).
319
+ #
320
+ # Example:
321
+ # assembly 'total' do
322
+ # ...
323
+ # insert 'const' => 1
324
+ # group_by 'const' do
325
+ # count
326
+ # sum 'val1', 'val2', :type => :long
327
+ # end
328
+ # discard 'const'
329
+ # end
330
+ def group_by(*args_with_options, &block)
331
+ options, group_fields = args_with_options.extract_options!, fields(args_with_options)
244
332
  sort_fields = fields(options[:sort_by])
245
333
  reverse = options[:reverse]
246
334
 
@@ -251,16 +339,31 @@ module Cascading
251
339
  # Unifies multiple incoming pipes sharing the same field structure using a
252
340
  # GroupBy. Accepts :on like join and :sort_by and :reverse like group_by,
253
341
  # as well as a block which may be used for a sequence of Every
254
- # aggregations.
342
+ # aggregations. The block passed to this method will be evaluated in the
343
+ # context of Aggregations, not Assembly.
255
344
  #
256
345
  # By default, groups only on the first field (see line 189 of GroupBy.java)
257
- def union(*args, &block)
258
- options = args.extract_options!
346
+ #
347
+ # The named options are:
348
+ # [on] The keys of the union, which defaults to the first field in the
349
+ # first input assembly.
350
+ # [sort_by] Optional keys for sorting.
351
+ # [reverse] Boolean that can reverse the order of sorting
352
+ # (only makes sense given :sort_by keys).
353
+ #
354
+ # Example:
355
+ # assembly 'union_left_right' do
356
+ # union 'left', 'right' do
357
+ # sum 'val1', 'val2', :type => :long
358
+ # end
359
+ # end
360
+ def union(*args_with_options, &block)
361
+ options, assembly_names = args_with_options.extract_options!, args_with_options
259
362
  group_fields = fields(options[:on])
260
363
  sort_fields = fields(options[:sort_by])
261
364
  reverse = options[:reverse]
262
365
 
263
- pipes, _ = populate_incoming_scopes(args)
366
+ pipes, _ = populate_incoming_scopes(assembly_names)
264
367
 
265
368
  # Must provide group_fields to ensure field name propagation
266
369
  group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields
@@ -273,10 +376,15 @@ module Cascading
273
376
  end
274
377
  alias :union_pipes :union
275
378
 
276
- # Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly
277
- # under certain assumptions. Note the default is to extend the tail pipe
278
- # of this Assembly using a linear SubAssembly. See SubAssembly class for
279
- # details.
379
+ # Allows you to plug c.p.SubAssemblies into an Assembly under certain
380
+ # assumptions. Note the default is to extend the tail pipe of this
381
+ # Assembly using a linear SubAssembly. See SubAssembly class for details.
382
+ #
383
+ # Example:
384
+ # assembly 'id_rows' do
385
+ # ...
386
+ # sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id'))
387
+ # end
280
388
  def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
281
389
  sub_assembly = SubAssembly.new(self, sub_assembly)
282
390
  sub_assembly.finalize(pipes, incoming_scopes)
@@ -287,17 +395,24 @@ module Cascading
287
395
  sub_assembly
288
396
  end
289
397
 
290
- # Builds a basic _each_ pipe, and adds it to the current assembly.
291
- # --
398
+ # Builds a basic each pipe and adds it to the current Assembly. Default
399
+ # arguments are all_fields, a default inherited from c.p.Each. Exactly one
400
+ # of :function and :filter must be specified and filters do not support an
401
+ # :output selector.
402
+ #
403
+ # The named options are:
404
+ # [filter] A Cascading Filter, mutually exclusive with :function.
405
+ # [function] A Cascading Function, mutually exclusive with :filter.
406
+ # [output] c.p.Each output selector, only valid with :function.
407
+ #
292
408
  # Example:
293
- # each 'line', :function => regex_splitter(['name', 'val1', 'val2', 'id'], :pattern => /[.,]*\s+/), :output => ['id', 'name', 'val1', 'val2']
294
- def each(*args)
295
- options = args.extract_options!
296
-
297
- in_fields = fields(args)
298
- out_fields = fields(options[:output])
299
-
409
+ # each fields(input_fields), :function => Java::CascadingOperation::Identity.new
410
+ # each 'field1', 'field2', :function => Java::CascadingOperation::Identity.new
411
+ def each(*args_with_options)
412
+ options, in_fields = args_with_options.extract_options!, fields(args_with_options)
413
+ out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each
300
414
  operation = options[:filter] || options[:function]
415
+ raise 'each requires either :filter or :function' unless operation
301
416
  raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output]
302
417
 
303
418
  parameters = [tail_pipe, in_fields, operation, out_fields].compact
@@ -308,468 +423,156 @@ module Cascading
308
423
  each
309
424
  end
310
425
 
311
- # Restricts the current assembly to the specified fields.
312
- # --
313
- # Example:
314
- # project "field1", "field2"
315
- def project(*args)
316
- each fields(args), :function => Java::CascadingOperation::Identity.new
317
- end
318
-
319
- # Removes the specified fields from the current assembly.
320
- # --
321
- # Example:
322
- # discard "field1", "field2"
323
- def discard(*args)
324
- discard_fields = fields(args)
325
- keep_fields = difference_fields(scope.values_fields, discard_fields)
326
- project(*keep_fields.to_a)
327
- end
328
-
329
- # Renames fields according to the mapping provided.
330
- # --
331
- # Example:
332
- # rename "old_name" => "new_name"
333
- def rename(name_map)
334
- old_names = scope.values_fields.to_a
335
- new_names = old_names.map{ |name| name_map[name] || name }
336
- invalid = name_map.keys.sort - old_names
337
- raise "invalid names: #{invalid.inspect}" unless invalid.empty?
338
-
339
- each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
340
- end
341
-
342
- def cast(type_map)
343
- names = type_map.keys.sort
344
- types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names))
345
- fields = fields(names)
346
- types = types.to_java(java.lang.Class)
347
- each fields, :function => Java::CascadingOperation::Identity.new(fields, types)
348
- end
349
-
350
- def copy(*args)
351
- options = args.extract_options!
352
- from = args[0] || all_fields
353
- into = args[1] || options[:into] || all_fields
354
- each fields(from), :function => Java::CascadingOperation::Identity.new(fields(into)), :output => all_fields
355
- end
356
-
357
- # A pipe that does nothing.
358
- def pass(*args)
359
- each all_fields, :function => Java::CascadingOperation::Identity.new
360
- end
426
+ include Operations
427
+ include IdentityOperations
428
+ include FilterOperations
429
+ include RegexOperations
430
+ include TextOperations
361
431
 
362
- def assert(*args)
363
- options = args.extract_options!
364
- assertion = args[0]
432
+ # Builds an each assertion pipe given a c.o.a.Assertion and adds it to the
433
+ # current Assembly.
434
+ #
435
+ # The named options are:
436
+ # [level] The assertion level; defaults to strict.
437
+ def assert(assertion, options = {})
365
438
  assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
366
439
 
367
440
  parameters = [tail_pipe, assertion_level, assertion]
368
441
  make_pipe(Java::CascadingPipe::Each, parameters)
369
442
  end
370
443
 
371
- # Builds a debugging pipe.
372
- #
373
- # Without arguments, it generate a simple debug pipe, that prints all tuple to the standard
374
- # output.
375
- #
376
- # The other named options are:
377
- # * <tt>:print_fields</tt> a boolean. If is set to true, then it prints every 10 tuples.
378
- #
379
- def debug(*args)
380
- options = args.extract_options!
381
- print_fields = options[:print_fields] || true
382
- parameters = [print_fields].compact
383
- debug = Java::CascadingOperation::Debug.new(*parameters)
384
- debug.print_tuple_every = options[:tuple_interval] || 1
385
- debug.print_fields_every = options[:fields_interval] || 10
386
- each(all_fields, :filter => debug)
387
- end
388
-
389
- # Builds a pipe that assert the size of the tuple is the size specified in parameter.
390
- #
391
- # The method accept an unique uname argument : a number indicating the size expected.
392
- def assert_size_equals(*args)
393
- options = args.extract_options!
394
- assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0])
444
+ # Builds a pipe that asserts the size of the tuple is the specified size.
445
+ def assert_size_equals(size, options = {})
446
+ assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size)
395
447
  assert(assertion, options)
396
448
  end
397
449
 
398
- # Builds a pipe that assert the none of the fields in the tuple are null.
399
- def assert_not_null(*args)
400
- options = args.extract_options!
450
+ # Builds a pipe that asserts none of the fields in the tuple are null.
451
+ def assert_not_null(options = {})
401
452
  assertion = Java::CascadingOperationAssertion::AssertNotNull.new
402
453
  assert(assertion, options)
403
454
  end
404
455
 
405
- # Builds a _parse_ pipe. This pipe will parse the fields specified in input (first unamed arguments),
406
- # using a specified regex pattern.
407
- #
408
- # If provided, the unamed arguments must be the fields to be parsed. If not provided, then all incoming
409
- # fields are used.
410
- #
411
- # The named options are:
412
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
413
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
414
- def parse(*args)
415
- options = args.extract_options!
416
- fields = args || all_fields
417
- pattern = options[:pattern]
418
- output = options[:output] || all_fields
419
- each(fields, :function => regex_parser(pattern, options), :output => output)
420
- end
456
+ private
421
457
 
422
- # Builds a pipe that splits a field into other fields, using a specified regular expression.
423
- #
424
- # The first unnamed argument is the field to be split.
425
- # The second unnamed argument is an array of strings indicating the fields receiving the result of the split.
426
- #
427
- # The named options are:
428
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
429
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
430
- def split(*args)
431
- options = args.extract_options!
432
- fields = options[:into] || args[1]
433
- pattern = options[:pattern] || /[.,]*\s+/
434
- output = options[:output] || all_fields
435
- each(args[0], :function => regex_splitter(fields, :pattern => pattern), :output=>output)
436
- end
437
-
438
- # Builds a pipe that splits a field into new rows, using a specified regular expression.
439
- #
440
- # The first unnamed argument is the field to be split.
441
- # The second unnamed argument is the field receiving the result of the split.
442
- #
443
- # The named options are:
444
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
445
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
446
- def split_rows(*args)
447
- options = args.extract_options!
448
- fields = options[:into] || args[1]
449
- pattern = options[:pattern] || /[.,]*\s+/
450
- output = options[:output] || all_fields
451
- each(args[0], :function => regex_split_generator(fields, :pattern => pattern), :output=>output)
452
- end
453
-
454
- # Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
455
- #
456
- # The first unnamed argument is the field to be matched against.
457
- # The second unnamed argument is the field receiving the result of the match.
458
- #
459
- # The named options are:
460
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for matching the argument fields.
461
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
462
- def match_rows(*args)
463
- options = args.extract_options!
464
- fields = options[:into] || args[1]
465
- pattern = options[:pattern] || /[\w]+/
466
- output = options[:output] || all_fields
467
- each(args[0], :function => regex_generator(fields, :pattern => pattern), :output=>output)
468
- end
469
-
470
- # Builds a pipe that parses the specified field as a date using the provided format string.
471
- # The unnamed argument specifies the field to parse.
472
- #
473
- # The named options are:
474
- # * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
475
- # the input argument.
476
- # * <tt>:pattern</tt> a string. Specifies the date format.
477
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
478
- def parse_date(*args)
479
- options = args.extract_options!
480
- field = options[:into] || "#{args[0]}_parsed"
481
- output = options[:output] || all_fields
482
- pattern = options[:pattern] || "yyyy/MM/dd"
483
-
484
- each args[0], :function => date_parser(field, pattern), :output => output
485
- end
458
+ def make_pipe(type, parameters)
459
+ @tail_pipe = type.new(*parameters)
460
+ @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
486
461
 
487
- # Builds a pipe that formats a date using a specified format pattern.
488
- #
489
- # The unnamed argument specifies the field to format.
490
- #
491
- # The named options are:
492
- # * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
493
- # the input argument.
494
- # * <tt>:pattern</tt> a string. Specifies the date format.
495
- # * <tt>:timezone</tt> a string. Specifies the timezone (defaults to UTC).
496
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
497
- def format_date(*args)
498
- options = args.extract_options!
499
- field = options[:into] || "#{args[0]}_formatted"
500
- pattern = options[:pattern] || "yyyy/MM/dd"
501
- output = options[:output] || all_fields
502
-
503
- each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output
462
+ tail_pipe
504
463
  end
505
464
 
506
- # Builds a pipe that performs a search and replace based on a regular expression.
507
- #
508
- # The first unnamed argument specifies the input field.
509
- #
510
- # The named options are:
511
- # * <tt>:pattern</tt> a string or regex. Specifies the pattern to look for in the input field. This non-optional argument
512
- # can also be specified as a second _unnamed_ argument.
513
- # * <tt>:replacement</tt> a string. Specifies the replacement.
514
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
515
- def replace(*args)
516
- options = args.extract_options!
517
-
518
- pattern = options[:pattern] || args[1]
519
- replacement = options[:replacement] || args[2]
520
- into = options[:into] || "#{args[0]}_replaced"
521
- output = options[:output] || all_fields
522
-
523
- each args[0], :function => regex_replace(into, pattern, replacement), :output => output
524
- end
465
+ def populate_incoming_scopes(assembly_names, group_fields_args = {})
466
+ # NOTE: this overrides the existing incoming_scopes, which changes the
467
+ # way describe will function on this assembly
468
+ pipes, @incoming_scopes, group_fields = [], [], []
469
+ assembly_names.each do |assembly_name|
470
+ assembly = parent_flow.find_child(assembly_name)
471
+ raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
525
472
 
526
- # Builds a pipe that inserts values into the current tuple.
527
- #
528
- # The method takes a hash as parameter. This hash contains as keys the names of the fields to insert
529
- # and as values, the values they must contain. For example:
530
- #
531
- # insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
532
- #
533
- # will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing
534
- # the formatted current date.
535
- # The method outputs all fields.
536
- # The named options are:
537
- def insert(args)
538
- args.keys.sort.each do |field_name|
539
- value = args[field_name]
540
-
541
- if value.kind_of?(ExprStub)
542
- value.validate_scope(scope)
543
- each all_fields, :function => expression_function(field_name, :expression => value.expression, :parameters => value.types), :output => all_fields
544
- else
545
- each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields
546
- end
473
+ pipes << assembly.tail_pipe
474
+ @incoming_scopes << assembly.scope
475
+ group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
547
476
  end
477
+ [pipes, group_fields]
548
478
  end
549
479
 
550
- # Builds a pipe that filters the tuples based on an expression or a pattern (but not both !).
551
- #
552
- # The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
553
- #
554
- # The named options are:
555
- # * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
556
- # option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
557
- # * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
558
- # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
559
- # expression-based. This is incompatible with the _pattern_ option.
560
- # * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
561
- # expression validation. Defaults to true.
562
- # * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
563
- # expression validation. Defaults to {}.
564
- def filter(*args)
565
- options = args.extract_options!
566
- from = options.delete(:from) || all_fields
567
- expression = options.delete(:expression) || args.shift
568
- regex = options.delete(:pattern)
569
- validate = options.has_key?(:validate) ? options.delete(:validate) : true
570
- validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {}
571
-
572
- if expression
573
- stub = expr(expression, { :validate => validate, :validate_with => validate_with })
574
- types, expression = stub.types, stub.expression
575
-
576
- stub.validate_scope(scope)
577
- each from, :filter => expression_filter(
578
- :parameters => types,
579
- :expression => expression
580
- )
581
- elsif regex
582
- each from, :filter => regex_filter(regex, options)
583
- end
584
- end
480
+ def apply_aggregations(group, incoming_scopes, &block)
481
+ aggregations = Aggregations.new(self, group, incoming_scopes)
482
+ aggregations.instance_eval(&block) if block_given?
585
483
 
586
- def filter_null(*args)
587
- options = args.extract_options!
588
- each(args, :filter => Java::CascadingOperationFilter::FilterNull.new)
589
- end
590
- alias reject_null filter_null
484
+ # Sorting of any type means that we cannot use the AggregateBy optimization
485
+ if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
486
+ grouping_fields = group.key_selectors.values.first
487
+ group.key_selectors.values.each do |key_fields|
488
+ raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
489
+ end
591
490
 
592
- def filter_not_null(*args)
593
- options = args.extract_options!
594
- each(args, :filter => Java::CascadingOperationFilter::FilterNotNull.new)
595
- end
596
- alias where_null filter_not_null
491
+ aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
492
+ name,
493
+ group.previous,
494
+ grouping_fields,
495
+ aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
496
+ ), group.previous, incoming_scopes)
597
497
 
598
- # Builds a pipe that rejects the tuples based on an expression.
599
- #
600
- # The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
601
- #
602
- # The named options are:
603
- # * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
604
- # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
605
- # expression-based.
606
- # * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
607
- # expression validation. Defaults to true.
608
- # * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
609
- # expression validation. Defaults to {}.
610
- def reject(*args)
611
- options = args.extract_options
612
- raise "Regex not allowed" if options && options[:pattern]
613
-
614
- filter(*args)
615
- end
498
+ aggregate_by
499
+ else
500
+ aggregations.finalize if block_given?
501
+ @tail_pipe = aggregations.tail_pipe
502
+ @outgoing_scopes[name] = aggregations.scope
616
503
 
617
- # Builds a pipe that includes just the tuples matching an expression.
618
- #
619
- # The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
620
- #
621
- # The named options are:
622
- # * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
623
- # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
624
- # expression-based.
625
- # * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
626
- # expression validation. Defaults to true.
627
- # * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
628
- # expression validation. Defaults to {}.
629
- def where(*args)
630
- options = args.extract_options
631
- raise "Regex not allowed" if options && options[:pattern]
632
-
633
- if options[:expression]
634
- _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
635
- options[:expression] = "#{imports}!(#{expr})"
636
- elsif args[0]
637
- _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
638
- args[0] = "#{imports}!(#{expr})"
504
+ group
639
505
  end
640
-
641
- filter(*args)
642
506
  end
643
507
 
644
- # Builds a pipe that evaluates the specified Janino expression and inserts the result into a new field in the tuple.
645
- #
646
- # The named options are:
647
- # * <tt>:from</tt> a string or array of strings. Specifies the input fields.
648
- # * <tt>:expression</tt> a string. The Janino expression.
649
- # * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
650
- # * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
651
- def eval_expression(*args)
652
- options = args.extract_options!
653
-
654
- into = options.delete(:into)
655
- from = options.delete(:from) || all_fields
656
- output = options.delete(:output) || all_fields
657
- options[:expression] ||= args.shift
658
- options[:parameters] ||= args.shift
659
-
660
- each from, :function => expression_function(into, options), :output=>output
661
- end
508
+ def prepare_join(assembly_names, options, &block)
509
+ pipes, _ = populate_incoming_scopes(assembly_names)
662
510
 
663
- # Builds a pipe that returns distinct tuples based on the provided fields.
664
- #
665
- # The method accepts an optional unnamed argument specifying the fields to base the distinct on
666
- # (all fields, by default).
667
- def distinct(*args)
668
- raise "Distinct is badly broken"
669
- fields = args[0] || all_fields
670
- group_by *fields
671
- pass
672
- end
673
-
674
- def join_fields(*args)
675
- options = args.extract_options!
676
- output = options[:output] || all_fields
511
+ group_fields_args = options[:on]
512
+ raise 'join requires :on parameter' unless group_fields_args
677
513
 
678
- each args, :function => field_joiner(options), :output => output
679
- end
514
+ if group_fields_args.kind_of?(String)
515
+ group_fields_args = [group_fields_args]
516
+ end
680
517
 
681
- # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html).
682
- #
683
- # You must provide :key and you must provide only one of :value_selectors
684
- # and :num_values.
685
- #
686
- # The named options are:
687
- # * <tt>:key</tt> required array of field names to replicate on every
688
- # output row in an ungrouped group.
689
- # * <tt>:value_selectors</tt> an array of field names to ungroup. Each
690
- # field will be ungrouped into an output tuple along with the key fields
691
- # in the order provided.
692
- # * <tt>:num_values</tt> an integer specifying the number of fields to
693
- # ungroup into each output tuple (excluding the key fields). All input
694
- # fields will be ungrouped.
695
- # * <tt>:input</tt> an array of field names that specifies the fields to
696
- # input to UnGroup. Defaults to all_fields.
697
- # * <tt>:into</tt> an array of field names. Default set by UnGroup.
698
- # * <tt>:output</tt> an array of field names that specifies the fields to
699
- # produce as output of UnGroup. Defaults to all_fields.
700
- def ungroup(*args)
701
- options = args.extract_options!
702
- input = options[:input] || all_fields
703
- into = fields(options[:into])
704
- output = options[:output] || all_fields
705
- key = fields(options[:key])
706
-
707
- raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
708
- value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
709
- num_values = options[:num_values] if options.has_key?(:num_values)
710
-
711
- parameters = [into, key, value_selectors, num_values].compact
712
- each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
713
- end
518
+ group_fields = []
519
+ if group_fields_args.kind_of?(Array)
520
+ pipes.size.times do
521
+ group_fields << fields(group_fields_args)
522
+ end
523
+ elsif group_fields_args.kind_of?(Hash)
524
+ pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
525
+ else
526
+ raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
527
+ end
714
528
 
715
- # Inserts one of two values into the dataflow based upon the result of the
716
- # supplied filter on the input fields. This is primarily useful for
717
- # creating indicators from filters.
718
- #
719
- # Parameters:
720
- # * <tt>input</tt> name of field to apply the filter.
721
- # * <tt>filter</tt> Cascading Filter to apply.
722
- # * <tt>keep_value</tt> Java value to produce when the filter would keep
723
- # the given input.
724
- # * <tt>remove_value</tt> Java value to produce when the filter would
725
- # remove the given input.
726
- #
727
- # The named options are:
728
- # * <tt>:into</tt> an output field name, defaulting to 'filter_value'.
729
- # * <tt>:output</tt> an array of field names that specifies the fields to
730
- # retain in the output tuple. Defaults to all_fields.
731
- def set_value(input, filter, keep_value, remove_value, params = {})
732
- into = fields(params[:into] || 'filter_value')
733
- output = params[:output] || all_fields
734
- each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output
735
- end
529
+ raise 'join requires non-empty :on parameter' if group_fields_args.empty?
530
+ group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
531
+ incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
532
+ declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
533
+ joiner = options[:joiner]
534
+ is_hash_join = options[:hash] || false
736
535
 
737
- # Efficient way of inserting a null indicator for any field, even one that
738
- # cannot be coerced to a string. This is accomplished using Cascading's
739
- # FilterNull and SetValue operators rather than Janino. 1 is produced if
740
- # the field is null and 0 otherwise.
741
- #
742
- # Parameters:
743
- # * <tt>input</tt> name of field to check for null.
744
- #
745
- # The named options are:
746
- # * <tt>:into</tt> an output field name, defaulting to 'is_null'.
747
- # * <tt>:output</tt> an array of field names that specifies the fields to
748
- # retain in the output tuple. Defaults to all_fields.
749
- def null_indicator(input, params = {})
750
- into = fields(params[:into] || 'is_null')
751
- output = params[:output] || all_fields
752
- set_value input, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, :into => into, :output => output
753
- end
536
+ case joiner
537
+ when :inner, 'inner', nil
538
+ joiner = Java::CascadingPipeJoiner::InnerJoin.new
539
+ when :left, 'left'
540
+ joiner = Java::CascadingPipeJoiner::LeftJoin.new
541
+ when :right, 'right'
542
+ joiner = Java::CascadingPipeJoiner::RightJoin.new
543
+ when :outer, 'outer'
544
+ joiner = Java::CascadingPipeJoiner::OuterJoin.new
545
+ when Array
546
+ joiner = joiner.map do |t|
547
+ case t
548
+ when true, 1, :inner then true
549
+ when false, 0, :outer then false
550
+ else fail "invalid mixed joiner entry: #{t}"
551
+ end
552
+ end
553
+ joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
554
+ end
754
555
 
755
- # Given a field and a regex, returns an indicator that is 1 if the string
756
- # contains at least 1 match and 0 otherwise.
757
- #
758
- # Parameters:
759
- # * <tt>input</tt> field name or names that specifies the fields over which
760
- # to perform the match.
761
- # * <tt>pattern</tt> regex to apply to the input.
762
- #
763
- # The named options are:
764
- # * <tt>:into</tt> an output field name, defaulting to 'regex_contains'.
765
- # * <tt>:output</tt> an array of field names that specifies the fields to
766
- # retain in the output tuple. Defaults to all_fields.
767
- def regex_contains(input, pattern, params = {})
768
- input = fields(input)
769
- pattern = pattern.to_s # Supports JRuby regexes
770
- into = fields(params[:into] || 'regex_contains')
771
- output = params[:output] || all_fields
772
- set_value input, Java::CascadingOperationRegex::RegexFilter.new(pattern), 1.to_java, 0.to_java, :into => into, :output => output
556
+ if is_hash_join
557
+ parameters = [
558
+ pipes.to_java(Java::CascadingPipe::Pipe),
559
+ group_fields,
560
+ declared_fields,
561
+ joiner
562
+ ]
563
+ group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
564
+ else
565
+ result_group_fields = dedup_fields(*group_fields)
566
+ parameters = [
567
+ pipes.to_java(Java::CascadingPipe::Pipe),
568
+ group_fields,
569
+ declared_fields,
570
+ result_group_fields,
571
+ joiner
572
+ ]
573
+ group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
574
+ end
575
+ apply_aggregations(group_assembly, @incoming_scopes, &block)
773
576
  end
774
577
  end
775
578
  end