cascading.jruby 0.0.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,50 @@
1
1
  require 'cascading/base'
2
2
  require 'cascading/operations'
3
+ require 'cascading/identity_operations'
4
+ require 'cascading/filter_operations'
5
+ require 'cascading/regex_operations'
6
+ require 'cascading/text_operations'
3
7
  require 'cascading/aggregations'
4
8
  require 'cascading/sub_assembly'
5
9
  require 'cascading/ext/array'
6
10
 
7
11
  module Cascading
12
+ # An Assembly is a sequence of Cascading pipes (Each, GroupBy, CoGroup,
13
+ # Every, and SubAssembly). This class will serve as your primary mechanism
14
+ # for doing work within a flow and contains all the functions and filters you
15
+ # will apply to a pipe (Eaches), as well as group_by, union, and join. For
16
+ # aggregators and buffers, please see Aggregations.
17
+ #
18
+ # Function and filter DSL rules:
19
+ # * Use positional arguments for required parameters
20
+ # * Use options = {} for optional parameters
21
+ # * Use *args sparingly, specifically when you need to accept a varying length list of fields
22
+ # * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension
23
+ # * If you choose to name a required parameter, add it to options = {} and throw an exception if the caller does not provide it
24
+ # * If you have a required parameter that is provided by one of a set of option names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example)
25
+ #
26
+ # Function and filter DSL standard optional parameter names:
27
+ # [input] c.p.Each argument selector
28
+ # [into] c.o.Operation field declaration
29
+ # [output] c.p.Each output selector
30
+ #
31
+ # A note on aliases: when a DSL method uniquely wraps a single Cascading
32
+ # operation, we attempt to provide an alias that matches the Cascading
33
+ # operation. However, Cascading operations are often nouns rather than verbs,
34
+ # and the latter are preferable for a dataflow DSL.
8
35
  class Assembly < Cascading::Node
9
- include Operations
10
-
11
36
  attr_reader :head_pipe, :tail_pipe
12
37
 
38
+ # Do not use this constructor directly; instead, use Flow#assembly or
39
+ # Assembly#branch to build assemblies.
40
+ #
41
+ # Builds an Assembly given a name, parent, and optional outgoing_scopes
42
+ # (necessary only for branching).
43
+ #
44
+ # An assembly's name is quite important as it will determine:
45
+ # * The sources from which it will read, if any
46
+ # * The name to be used in joins or unions downstream
47
+ # * The name to be used to sink the output of the assembly downstream
13
48
  def initialize(name, parent, outgoing_scopes = {})
14
49
  super(name, parent)
15
50
 
@@ -27,6 +62,11 @@ module Cascading
27
62
  @incoming_scopes = [scope]
28
63
  end
29
64
 
65
+ # Produces a textual description of this Assembly. The description details
66
+ # the structure of the Assembly, its input and output fields and any
67
+ # children (branches). The offset parameter allows for this describe to be
68
+ # nested within a calling context, which lets us indent the structural
69
+ # hierarchy of a job.
30
70
  def describe(offset = '')
31
71
  incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
32
72
  incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1
@@ -35,199 +75,231 @@ module Cascading
35
75
  description
36
76
  end
37
77
 
78
+ # Rather than the immediate parent, this method returns the parent flow of
79
+ # this Assembly. If this is a branch, we must traverse the parents of
80
+ # parent assemblies.
38
81
  def parent_flow
39
82
  return parent if parent.kind_of?(Flow)
40
83
  parent.parent_flow
41
84
  end
42
85
 
86
+ # Accesses the outgoing scope of this Assembly at the point at which it is
87
+ # called. This is useful for grabbing the values_fields at any point in
88
+ # the construction of the Assembly. See Scope for details.
43
89
  def scope
44
90
  @outgoing_scopes[name]
45
91
  end
46
92
 
93
+ # Prints information about the scope of this Assembly at the point at which
94
+ # it is called. This allows you to trace the propagation of field names
95
+ # through your job and is handy for debugging. See Scope for details.
47
96
  def debug_scope
48
97
  puts "Current scope for '#{name}':\n #{scope}\n----------\n"
49
98
  end
50
99
 
51
- def make_pipe(type, parameters)
52
- @tail_pipe = type.new(*parameters)
53
- @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
54
-
55
- tail_pipe
56
- end
57
- private :make_pipe
58
-
59
- def populate_incoming_scopes(assembly_names, group_fields_args = {})
60
- # NOTE: this overrides the existing incoming_scopes, which changes the
61
- # way describe will function on this assembly
62
- pipes, @incoming_scopes, group_fields = [], [], []
63
- assembly_names.each do |assembly_name|
64
- assembly = parent_flow.find_child(assembly_name)
65
- raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
66
-
67
- pipes << assembly.tail_pipe
68
- @incoming_scopes << assembly.scope
69
- group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
70
- end
71
- [pipes, group_fields]
72
- end
73
- private :populate_incoming_scopes
74
-
75
- def apply_aggregations(group, incoming_scopes, &block)
76
- aggregations = Aggregations.new(self, group, incoming_scopes)
77
- aggregations.instance_eval(&block) if block_given?
78
-
79
- # Sorting of any type means that we cannot use the AggregateBy optimization
80
- if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
81
- grouping_fields = group.key_selectors.values.first
82
- group.key_selectors.values.each do |key_fields|
83
- raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
84
- end
85
-
86
- aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
87
- name,
88
- group.previous,
89
- grouping_fields,
90
- aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
91
- ), group.previous, incoming_scopes)
92
-
93
- aggregate_by
94
- else
95
- aggregations.finalize if block_given?
96
- @tail_pipe = aggregations.tail_pipe
97
- @outgoing_scopes[name] = aggregations.scope
98
-
99
- group
100
- end
101
- end
102
- private :apply_aggregations
103
-
100
+ # Prints detail about this Assembly including its name, head pipe, and tail
101
+ # pipe.
104
102
  def to_s
105
103
  "#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}"
106
104
  end
107
105
 
108
- def prepare_join(*args, &block)
109
- options = args.extract_options!
110
-
111
- pipes, _ = populate_incoming_scopes(args)
112
-
113
- group_fields_args = options[:on]
114
- raise 'join requires :on parameter' unless group_fields_args
115
-
116
- if group_fields_args.kind_of?(String)
117
- group_fields_args = [group_fields_args]
118
- end
119
-
120
- group_fields = []
121
- if group_fields_args.kind_of?(Array)
122
- pipes.size.times do
123
- group_fields << fields(group_fields_args)
124
- end
125
- elsif group_fields_args.kind_of?(Hash)
126
- pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
127
- else
128
- raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
129
- end
130
-
131
- raise 'join requires non-empty :on parameter' if group_fields_args.empty?
132
- group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
133
- incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
134
- declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
135
- joiner = options[:joiner]
136
- is_hash_join = options[:hash] || false
137
-
138
- case joiner
139
- when :inner, 'inner', nil
140
- joiner = Java::CascadingPipeJoiner::InnerJoin.new
141
- when :left, 'left'
142
- joiner = Java::CascadingPipeJoiner::LeftJoin.new
143
- when :right, 'right'
144
- joiner = Java::CascadingPipeJoiner::RightJoin.new
145
- when :outer, 'outer'
146
- joiner = Java::CascadingPipeJoiner::OuterJoin.new
147
- when Array
148
- joiner = joiner.map do |t|
149
- case t
150
- when true, 1, :inner then true
151
- when false, 0, :outer then false
152
- else fail "invalid mixed joiner entry: #{t}"
153
- end
154
- end
155
- joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
156
- end
157
-
158
- if is_hash_join
159
- raise ArgumentError, "hash joins don't support aggregations" if block_given?
160
- parameters = [
161
- pipes.to_java(Java::CascadingPipe::Pipe),
162
- group_fields,
163
- declared_fields,
164
- joiner
165
- ]
166
- group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
167
- else
168
- result_group_fields = dedup_fields(*group_fields)
169
- parameters = [
170
- pipes.to_java(Java::CascadingPipe::Pipe),
171
- group_fields,
172
- declared_fields,
173
- result_group_fields,
174
- joiner
175
- ]
176
- group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
177
- end
178
- apply_aggregations(group_assembly, @incoming_scopes, &block)
179
- end
180
- private :prepare_join
181
-
182
106
  # Builds a HashJoin pipe. This should be used carefully, as the right side
183
- # of the join is accumulated entirely in memory. Requires a list of assembly
184
- # names to join and :on to specify the join_fields.
185
- def hash_join(*args, &block)
186
- options = args.extract_options!
107
+ # of the join is accumulated entirely in memory. Requires a list of
108
+ # assembly names to join and :on to specify the join_fields. Note that a
109
+ # hash_join "takes over" the Assembly in which it is built, so it is
110
+ # typically the first statement within the block of the assembly or branch.
111
+ # Additionally, a hash join does not accept a block for aggregations like
112
+ # other joins; this restriction is enforced here, but comes directly from
113
+ # Cascading.
114
+ #
115
+ # The named options are:
116
+ # [on] The keys of the join, an array of strings if they are the same in
117
+ # all inputs, or a hash mapping assembly names to key names if they
118
+ # differ across inputs.
119
+ # [declared_fields] By default, a deduplicated array of incoming field
120
+ # names (see Cascading::dedup_fields). Specifies the
121
+ # names of the fields that will be available to
122
+ # aggregations or post-join if no aggregations are
123
+ # specified.
124
+ # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
125
+ # and 'inner', :right and 'right' are accepted, as well as an
126
+ # array specifying mixed joins. Typically, this is not provided,
127
+ # but one of the higher level join methods on Assembly is used
128
+ # directly (like Assembly#inner_join or Assembly#right_join).
129
+ #
130
+ # Example:
131
+ # assembly 'join_left_right' do
132
+ # hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner
133
+ # end
134
+ def hash_join(*args_with_options)
135
+ raise ArgumentError, "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored" if block_given?
136
+
137
+ options, assembly_names = args_with_options.extract_options!, args_with_options
187
138
  options[:hash] = true
188
- args << options
189
- prepare_join(*args, &block)
139
+ prepare_join(assembly_names, options)
190
140
  end
191
141
 
192
142
  # Builds a join (CoGroup) pipe. Requires a list of assembly names to join
193
- # and :on to specify the group_fields.
194
- def join(*args, &block)
195
- options = args.extract_options!
143
+ # and :on to specify the group_fields. Note that a join "takes over" the
144
+ # Assembly in which it is built, so it is typically the first statement
145
+ # within the block of the assembly or branch. The block passed to this
146
+ # method will be evaluated in the context of Aggregations, not Assembly.
147
+ #
148
+ # The named options are:
149
+ # [on] The keys of the join, an array of strings if they are the same in
150
+ # all inputs, or a hash mapping assembly names to key names if they
151
+ # differ across inputs.
152
+ # [declared_fields] By default, a deduplicated array of incoming field
153
+ # names (see Cascading::dedup_fields). Specifies the
154
+ # names of the fields that will be available to
155
+ # aggregations or post-join if no aggregations are
156
+ # specified.
157
+ # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
158
+ # and 'inner', :right and 'right' are accepted, as well as an
159
+ # array specifying mixed joins. Typically, this is not provided,
160
+ # but one of the higher level join methods on Assembly is used
161
+ # directly (like Assembly#inner_join or Assembly#right_join).
162
+ #
163
+ # Example:
164
+ # assembly 'join_left_right' do
165
+ # join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do
166
+ # sum 'val1', 'val2', :type => :long
167
+ # end
168
+ # end
169
+ def join(*args_with_options, &block)
170
+ options, assembly_names = args_with_options.extract_options!, args_with_options
196
171
  options[:hash] = false
197
- args << options
198
- prepare_join(*args, &block)
172
+ prepare_join(assembly_names, options, &block)
199
173
  end
200
174
  alias co_group join
201
175
 
202
- def inner_join(*args, &block)
203
- options = args.extract_options!
176
+ # Builds an inner join (CoGroup) pipe. Requires a list of assembly names to
177
+ # join and :on to specify the group_fields.
178
+ #
179
+ # The named options are:
180
+ # [on] The keys of the join, an array of strings if they are the same in
181
+ # all inputs, or a hash mapping assembly names to key names if they
182
+ # differ across inputs.
183
+ # [declared_fields] By default, a deduplicated array of incoming field
184
+ # names (see Cascading::dedup_fields). Specifies the
185
+ # names of the fields that will be available to
186
+ # aggregations or post-join if no aggregations are
187
+ # specified.
188
+ #
189
+ # Example:
190
+ # assembly 'join_left_right' do
191
+ # inner_join 'left', 'right', :on => ['key1', 'key2'] do
192
+ # sum 'val1', 'val2', :type => :long
193
+ # end
194
+ # end
195
+ def inner_join(*args_with_options, &block)
196
+ options = args_with_options.extract_options!
204
197
  options[:joiner] = :inner
205
- args << options
206
- join(*args, &block)
198
+ args_with_options << options
199
+ join(*args_with_options, &block)
207
200
  end
208
201
 
209
- def left_join(*args, &block)
210
- options = args.extract_options!
202
+ # Builds a left join (CoGroup) pipe. Requires a list of assembly names to
203
+ # join and :on to specify the group_fields.
204
+ #
205
+ # The named options are:
206
+ # [on] The keys of the join, an array of strings if they are the same in
207
+ # all inputs, or a hash mapping assembly names to key names if they
208
+ # differ across inputs.
209
+ # [declared_fields] By default, a deduplicated array of incoming field
210
+ # names (see Cascading::dedup_fields). Specifies the
211
+ # names of the fields that will be available to
212
+ # aggregations or post-join if no aggregations are
213
+ # specified.
214
+ #
215
+ # Example:
216
+ # assembly 'join_left_right' do
217
+ # left_join 'left', 'right', :on => ['key1', 'key2'] do
218
+ # sum 'val1', 'val2', :type => :long
219
+ # end
220
+ # end
221
+ def left_join(*args_with_options, &block)
222
+ options = args_with_options.extract_options!
211
223
  options[:joiner] = :left
212
- args << options
213
- join(*args, &block)
224
+ args_with_options << options
225
+ join(*args_with_options, &block)
214
226
  end
215
227
 
216
- def right_join(*args, &block)
217
- options = args.extract_options!
228
+ # Builds a right join (CoGroup) pipe. Requires a list of assembly names to
229
+ # join and :on to specify the group_fields.
230
+ #
231
+ # The named options are:
232
+ # [on] The keys of the join, an array of strings if they are the same in
233
+ # all inputs, or a hash mapping assembly names to key names if they
234
+ # differ across inputs.
235
+ # [declared_fields] By default, a deduplicated array of incoming field
236
+ # names (see Cascading::dedup_fields). Specifies the
237
+ # names of the fields that will be available to
238
+ # aggregations or post-join if no aggregations are
239
+ # specified.
240
+ #
241
+ # Example:
242
+ # assembly 'join_left_right' do
243
+ # right_join 'left', 'right', :on => ['key1', 'key2'] do
244
+ # sum 'val1', 'val2', :type => :long
245
+ # end
246
+ # end
247
+ def right_join(*args_with_options, &block)
248
+ options = args_with_options.extract_options!
218
249
  options[:joiner] = :right
219
- args << options
220
- join(*args, &block)
250
+ args_with_options << options
251
+ join(*args_with_options, &block)
221
252
  end
222
253
 
223
- def outer_join(*args, &block)
224
- options = args.extract_options!
254
+ # Builds an outer join (CoGroup) pipe. Requires a list of assembly names to
255
+ # join and :on to specify the group_fields.
256
+ #
257
+ # The named options are:
258
+ # [on] The keys of the join, an array of strings if they are the same in
259
+ # all inputs, or a hash mapping assembly names to key names if they
260
+ # differ across inputs.
261
+ # [declared_fields] By default, a deduplicated array of incoming field
262
+ # names (see Cascading::dedup_fields). Specifies the
263
+ # names of the fields that will be available to
264
+ # aggregations or post-join if no aggregations are
265
+ # specified.
266
+ #
267
+ # Example:
268
+ # assembly 'join_left_right' do
269
+ # outer_join 'left', 'right', :on => ['key1', 'key2'] do
270
+ # sum 'val1', 'val2', :type => :long
271
+ # end
272
+ # end
273
+ def outer_join(*args_with_options, &block)
274
+ options = args_with_options.extract_options!
225
275
  options[:joiner] = :outer
226
- args << options
227
- join(*args, &block)
276
+ args_with_options << options
277
+ join(*args_with_options, &block)
228
278
  end
229
279
 
230
- # Builds a new branch.
280
+ # Builds a child Assembly that branches this Assembly given a name and
281
+ # block.
282
+ #
283
+ # An assembly's name is quite important as it will determine:
284
+ # * The sources from which it will read, if any
285
+ # * The name to be used in joins or unions downstream
286
+ # * The name to be used to sink the output of the assembly downstream
287
+ #
288
+ # Many branches may be built within an assembly. The result of a branch is
289
+ # the same as the Flow#assembly constructor, an Assembly object.
290
+ #
291
+ # Example:
292
+ # assembly 'some_work' do
293
+ # ...
294
+ #
295
+ # branch 'more_work' do
296
+ # ...
297
+ # end
298
+ #
299
+ # branch 'yet_more_work' do
300
+ # ...
301
+ # end
302
+ # end
231
303
  def branch(name, &block)
232
304
  raise "Could not build branch '#{name}'; block required" unless block_given?
233
305
  assembly = Assembly.new(name, self, @outgoing_scopes)
@@ -236,11 +308,27 @@ module Cascading
236
308
  assembly
237
309
  end
238
310
 
239
- # Builds a new GroupBy pipe that groups on the fields given in args.
240
- # Any block passed to this method should contain only Everies.
241
- def group_by(*args, &block)
242
- options = args.extract_options!
243
- group_fields = fields(args)
311
+ # Builds a new GroupBy pipe that groups on the fields given in
312
+ # args_with_options. The block passed to this method will be evaluated in
313
+ # the context of Aggregations, not Assembly.
314
+ #
315
+ # The named options are:
316
+ # [sort_by] Optional keys for within-group sort.
317
+ # [reverse] Boolean that can reverse the order of within-group sorting
318
+ # (only makes sense given :sort_by keys).
319
+ #
320
+ # Example:
321
+ # assembly 'total' do
322
+ # ...
323
+ # insert 'const' => 1
324
+ # group_by 'const' do
325
+ # count
326
+ # sum 'val1', 'val2', :type => :long
327
+ # end
328
+ # discard 'const'
329
+ # end
330
+ def group_by(*args_with_options, &block)
331
+ options, group_fields = args_with_options.extract_options!, fields(args_with_options)
244
332
  sort_fields = fields(options[:sort_by])
245
333
  reverse = options[:reverse]
246
334
 
@@ -251,16 +339,31 @@ module Cascading
251
339
  # Unifies multiple incoming pipes sharing the same field structure using a
252
340
  # GroupBy. Accepts :on like join and :sort_by and :reverse like group_by,
253
341
  # as well as a block which may be used for a sequence of Every
254
- # aggregations.
342
+ # aggregations. The block passed to this method will be evaluated in the
343
+ # context of Aggregations, not Assembly.
255
344
  #
256
345
  # By default, groups only on the first field (see line 189 of GroupBy.java)
257
- def union(*args, &block)
258
- options = args.extract_options!
346
+ #
347
+ # The named options are:
348
+ # [on] The keys of the union, which defaults to the first field in the
349
+ # first input assembly.
350
+ # [sort_by] Optional keys for sorting.
351
+ # [reverse] Boolean that can reverse the order of sorting
352
+ # (only makes sense given :sort_by keys).
353
+ #
354
+ # Example:
355
+ # assembly 'union_left_right' do
356
+ # union 'left', 'right' do
357
+ # sum 'val1', 'val2', :type => :long
358
+ # end
359
+ # end
360
+ def union(*args_with_options, &block)
361
+ options, assembly_names = args_with_options.extract_options!, args_with_options
259
362
  group_fields = fields(options[:on])
260
363
  sort_fields = fields(options[:sort_by])
261
364
  reverse = options[:reverse]
262
365
 
263
- pipes, _ = populate_incoming_scopes(args)
366
+ pipes, _ = populate_incoming_scopes(assembly_names)
264
367
 
265
368
  # Must provide group_fields to ensure field name propagation
266
369
  group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields
@@ -273,10 +376,15 @@ module Cascading
273
376
  end
274
377
  alias :union_pipes :union
275
378
 
276
- # Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly
277
- # under certain assumptions. Note the default is to extend the tail pipe
278
- # of this Assembly using a linear SubAssembly. See SubAssembly class for
279
- # details.
379
+ # Allows you to plugin c.p.SubAssemblies to an Assembly under certain
380
+ # assumptions. Note the default is to extend the tail pipe of this
381
+ # Assembly using a linear SubAssembly. See SubAssembly class for details.
382
+ #
383
+ # Example:
384
+ # assembly 'id_rows' do
385
+ # ...
386
+ # sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id'))
387
+ # end
280
388
  def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
281
389
  sub_assembly = SubAssembly.new(self, sub_assembly)
282
390
  sub_assembly.finalize(pipes, incoming_scopes)
@@ -287,17 +395,24 @@ module Cascading
287
395
  sub_assembly
288
396
  end
289
397
 
290
- # Builds a basic _each_ pipe, and adds it to the current assembly.
291
- # --
398
+ # Builds a basic each pipe and adds it to the current Assembly. Default
399
+ # arguments are all_fields, a default inherited from c.p.Each. Exactly one
400
+ # of :function and :filter must be specified and filters do not support an
401
+ # :output selector.
402
+ #
403
+ # The named options are:
404
+ # [filter] A Cascading Filter, mutually exclusive with :function.
405
+ # [function] A Cascading Function, mutually exclusive with :filter.
406
+ # [output] c.p.Each output selector, only valid with :function.
407
+ #
292
408
  # Example:
293
- # each 'line', :function => regex_splitter(['name', 'val1', 'val2', 'id'], :pattern => /[.,]*\s+/), :output => ['id', 'name', 'val1', 'val2']
294
- def each(*args)
295
- options = args.extract_options!
296
-
297
- in_fields = fields(args)
298
- out_fields = fields(options[:output])
299
-
409
+ # each fields(input_fields), :function => Java::CascadingOperation::Identity.new
410
+ # each 'field1', 'field2', :function => Java::CascadingOperation::Identity.new
411
+ def each(*args_with_options)
412
+ options, in_fields = args_with_options.extract_options!, fields(args_with_options)
413
+ out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each
300
414
  operation = options[:filter] || options[:function]
415
+ raise 'each requires either :filter or :function' unless operation
301
416
  raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output]
302
417
 
303
418
  parameters = [tail_pipe, in_fields, operation, out_fields].compact
@@ -308,468 +423,156 @@ module Cascading
308
423
  each
309
424
  end
310
425
 
311
- # Restricts the current assembly to the specified fields.
312
- # --
313
- # Example:
314
- # project "field1", "field2"
315
- def project(*args)
316
- each fields(args), :function => Java::CascadingOperation::Identity.new
317
- end
318
-
319
- # Removes the specified fields from the current assembly.
320
- # --
321
- # Example:
322
- # discard "field1", "field2"
323
- def discard(*args)
324
- discard_fields = fields(args)
325
- keep_fields = difference_fields(scope.values_fields, discard_fields)
326
- project(*keep_fields.to_a)
327
- end
328
-
329
- # Renames fields according to the mapping provided.
330
- # --
331
- # Example:
332
- # rename "old_name" => "new_name"
333
- def rename(name_map)
334
- old_names = scope.values_fields.to_a
335
- new_names = old_names.map{ |name| name_map[name] || name }
336
- invalid = name_map.keys.sort - old_names
337
- raise "invalid names: #{invalid.inspect}" unless invalid.empty?
338
-
339
- each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
340
- end
341
-
342
- def cast(type_map)
343
- names = type_map.keys.sort
344
- types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names))
345
- fields = fields(names)
346
- types = types.to_java(java.lang.Class)
347
- each fields, :function => Java::CascadingOperation::Identity.new(fields, types)
348
- end
349
-
350
- def copy(*args)
351
- options = args.extract_options!
352
- from = args[0] || all_fields
353
- into = args[1] || options[:into] || all_fields
354
- each fields(from), :function => Java::CascadingOperation::Identity.new(fields(into)), :output => all_fields
355
- end
356
-
357
- # A pipe that does nothing.
358
- def pass(*args)
359
- each all_fields, :function => Java::CascadingOperation::Identity.new
360
- end
426
+ include Operations
427
+ include IdentityOperations
428
+ include FilterOperations
429
+ include RegexOperations
430
+ include TextOperations
361
431
 
362
- def assert(*args)
363
- options = args.extract_options!
364
- assertion = args[0]
432
+ # Builds an each assertion pipe given a c.o.a.Assertion and adds it to the
433
+ # current Assembly.
434
+ #
435
+ # The named options are:
436
+ # [level] The assertion level; defaults to strict.
437
+ def assert(assertion, options = {})
365
438
  assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
366
439
 
367
440
  parameters = [tail_pipe, assertion_level, assertion]
368
441
  make_pipe(Java::CascadingPipe::Each, parameters)
369
442
  end
370
443
 
371
- # Builds a debugging pipe.
372
- #
373
- # Without arguments, it generate a simple debug pipe, that prints all tuple to the standard
374
- # output.
375
- #
376
- # The other named options are:
377
- # * <tt>:print_fields</tt> a boolean. If is set to true, then it prints every 10 tuples.
378
- #
379
- def debug(*args)
380
- options = args.extract_options!
381
- print_fields = options[:print_fields] || true
382
- parameters = [print_fields].compact
383
- debug = Java::CascadingOperation::Debug.new(*parameters)
384
- debug.print_tuple_every = options[:tuple_interval] || 1
385
- debug.print_fields_every = options[:fields_interval] || 10
386
- each(all_fields, :filter => debug)
387
- end
388
-
389
- # Builds a pipe that assert the size of the tuple is the size specified in parameter.
390
- #
391
- # The method accept an unique uname argument : a number indicating the size expected.
392
- def assert_size_equals(*args)
393
- options = args.extract_options!
394
- assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0])
444
+ # Builds a pipe that asserts the size of the tuple is the specified size.
445
+ def assert_size_equals(size, options = {})
446
+ assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size)
395
447
  assert(assertion, options)
396
448
  end
397
449
 
398
- # Builds a pipe that assert the none of the fields in the tuple are null.
399
- def assert_not_null(*args)
400
- options = args.extract_options!
450
+ # Builds a pipe that asserts none of the fields in the tuple are null.
451
+ def assert_not_null(options = {})
401
452
  assertion = Java::CascadingOperationAssertion::AssertNotNull.new
402
453
  assert(assertion, options)
403
454
  end
404
455
 
405
- # Builds a _parse_ pipe. This pipe will parse the fields specified in input (first unamed arguments),
406
- # using a specified regex pattern.
407
- #
408
- # If provided, the unamed arguments must be the fields to be parsed. If not provided, then all incoming
409
- # fields are used.
410
- #
411
- # The named options are:
412
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
413
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
414
- def parse(*args)
415
- options = args.extract_options!
416
- fields = args || all_fields
417
- pattern = options[:pattern]
418
- output = options[:output] || all_fields
419
- each(fields, :function => regex_parser(pattern, options), :output => output)
420
- end
456
+ private
421
457
 
422
- # Builds a pipe that splits a field into other fields, using a specified regular expression.
423
- #
424
- # The first unnamed argument is the field to be split.
425
- # The second unnamed argument is an array of strings indicating the fields receiving the result of the split.
426
- #
427
- # The named options are:
428
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
429
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
430
- def split(*args)
431
- options = args.extract_options!
432
- fields = options[:into] || args[1]
433
- pattern = options[:pattern] || /[.,]*\s+/
434
- output = options[:output] || all_fields
435
- each(args[0], :function => regex_splitter(fields, :pattern => pattern), :output=>output)
436
- end
437
-
438
- # Builds a pipe that splits a field into new rows, using a specified regular expression.
439
- #
440
- # The first unnamed argument is the field to be split.
441
- # The second unnamed argument is the field receiving the result of the split.
442
- #
443
- # The named options are:
444
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
445
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
446
- def split_rows(*args)
447
- options = args.extract_options!
448
- fields = options[:into] || args[1]
449
- pattern = options[:pattern] || /[.,]*\s+/
450
- output = options[:output] || all_fields
451
- each(args[0], :function => regex_split_generator(fields, :pattern => pattern), :output=>output)
452
- end
453
-
454
- # Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
455
- #
456
- # The first unnamed argument is the field to be matched against.
457
- # The second unnamed argument is the field receiving the result of the match.
458
- #
459
- # The named options are:
460
- # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for matching the argument fields.
461
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
462
- def match_rows(*args)
463
- options = args.extract_options!
464
- fields = options[:into] || args[1]
465
- pattern = options[:pattern] || /[\w]+/
466
- output = options[:output] || all_fields
467
- each(args[0], :function => regex_generator(fields, :pattern => pattern), :output=>output)
468
- end
469
-
470
- # Builds a pipe that parses the specified field as a date using the provided format string.
471
- # The unnamed argument specifies the field to parse.
472
- #
473
- # The named options are:
474
- # * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
475
- # the input argument.
476
- # * <tt>:pattern</tt> a string. Specifies the date format.
477
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
478
- def parse_date(*args)
479
- options = args.extract_options!
480
- field = options[:into] || "#{args[0]}_parsed"
481
- output = options[:output] || all_fields
482
- pattern = options[:pattern] || "yyyy/MM/dd"
483
-
484
- each args[0], :function => date_parser(field, pattern), :output => output
485
- end
458
+ def make_pipe(type, parameters)
459
+ @tail_pipe = type.new(*parameters)
460
+ @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
486
461
 
487
- # Builds a pipe that formats a date using a specified format pattern.
488
- #
489
- # The unnamed argument specifies the field to format.
490
- #
491
- # The named options are:
492
- # * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
493
- # the input argument.
494
- # * <tt>:pattern</tt> a string. Specifies the date format.
495
- # * <tt>:timezone</tt> a string. Specifies the timezone (defaults to UTC).
496
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
497
- def format_date(*args)
498
- options = args.extract_options!
499
- field = options[:into] || "#{args[0]}_formatted"
500
- pattern = options[:pattern] || "yyyy/MM/dd"
501
- output = options[:output] || all_fields
502
-
503
- each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output
462
+ tail_pipe
504
463
  end
505
464
 
506
- # Builds a pipe that performs a query/replace based on a regular expression.
507
- #
508
- # The first unnamed argument specifies the input field.
509
- #
510
- # The named options are:
511
- # * <tt>:pattern</tt> a string or regex. Specifies the pattern to look for in the input field. This non-optional argument
512
- # can also be specified as a second _unnamed_ argument.
513
- # * <tt>:replacement</tt> a string. Specifies the replacement.
514
- # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
515
- def replace(*args)
516
- options = args.extract_options!
517
-
518
- pattern = options[:pattern] || args[1]
519
- replacement = options[:replacement] || args[2]
520
- into = options[:into] || "#{args[0]}_replaced"
521
- output = options[:output] || all_fields
522
-
523
- each args[0], :function => regex_replace(into, pattern, replacement), :output => output
524
- end
465
+ def populate_incoming_scopes(assembly_names, group_fields_args = {})
466
+ # NOTE: this overrides the existing incoming_scopes, which changes the
467
+ # way describe will function on this assembly
468
+ pipes, @incoming_scopes, group_fields = [], [], []
469
+ assembly_names.each do |assembly_name|
470
+ assembly = parent_flow.find_child(assembly_name)
471
+ raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
525
472
 
526
- # Builds a pipe that inserts values into the current tuple.
527
- #
528
- # The method takes a hash as parameter. This hash contains as keys the names of the fields to insert
529
- # and as values, the values they must contain. For example:
530
- #
531
- # insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
532
- #
533
- # will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing
534
- # the formatted current date.
535
- # The method outputs all fields.
536
- # The named options are:
537
- def insert(args)
538
- args.keys.sort.each do |field_name|
539
- value = args[field_name]
540
-
541
- if value.kind_of?(ExprStub)
542
- value.validate_scope(scope)
543
- each all_fields, :function => expression_function(field_name, :expression => value.expression, :parameters => value.types), :output => all_fields
544
- else
545
- each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields
546
- end
473
+ pipes << assembly.tail_pipe
474
+ @incoming_scopes << assembly.scope
475
+ group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
547
476
  end
477
+ [pipes, group_fields]
548
478
  end
549
479
 
550
- # Builds a pipe that filters the tuples based on an expression or a pattern (but not both !).
551
- #
552
- # The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
553
- #
554
- # The named options are:
555
- # * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
556
- # option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
557
- # * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
558
- # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
559
- # expression-based. This is incompatible with the _pattern_ option.
560
- # * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
561
- # expression validation. Defaults to true.
562
- # * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
563
- # expression validation. Defaults to {}.
564
- def filter(*args)
565
- options = args.extract_options!
566
- from = options.delete(:from) || all_fields
567
- expression = options.delete(:expression) || args.shift
568
- regex = options.delete(:pattern)
569
- validate = options.has_key?(:validate) ? options.delete(:validate) : true
570
- validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {}
571
-
572
- if expression
573
- stub = expr(expression, { :validate => validate, :validate_with => validate_with })
574
- types, expression = stub.types, stub.expression
575
-
576
- stub.validate_scope(scope)
577
- each from, :filter => expression_filter(
578
- :parameters => types,
579
- :expression => expression
580
- )
581
- elsif regex
582
- each from, :filter => regex_filter(regex, options)
583
- end
584
- end
480
+ def apply_aggregations(group, incoming_scopes, &block)
481
+ aggregations = Aggregations.new(self, group, incoming_scopes)
482
+ aggregations.instance_eval(&block) if block_given?
585
483
 
586
- def filter_null(*args)
587
- options = args.extract_options!
588
- each(args, :filter => Java::CascadingOperationFilter::FilterNull.new)
589
- end
590
- alias reject_null filter_null
484
+ # Sorting of any type means that we cannot use the AggregateBy optimization
485
+ if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
486
+ grouping_fields = group.key_selectors.values.first
487
+ group.key_selectors.values.each do |key_fields|
488
+ raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
489
+ end
591
490
 
592
- def filter_not_null(*args)
593
- options = args.extract_options!
594
- each(args, :filter => Java::CascadingOperationFilter::FilterNotNull.new)
595
- end
596
- alias where_null filter_not_null
491
+ aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
492
+ name,
493
+ group.previous,
494
+ grouping_fields,
495
+ aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
496
+ ), group.previous, incoming_scopes)
597
497
 
598
- # Builds a pipe that rejects the tuples based on an expression.
599
- #
600
- # The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
601
- #
602
- # The named options are:
603
- # * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
604
- # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
605
- # expression-based.
606
- # * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
607
- # expression validation. Defaults to true.
608
- # * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
609
- # expression validation. Defaults to {}.
610
- def reject(*args)
611
- options = args.extract_options
612
- raise "Regex not allowed" if options && options[:pattern]
613
-
614
- filter(*args)
615
- end
498
+ aggregate_by
499
+ else
500
+ aggregations.finalize if block_given?
501
+ @tail_pipe = aggregations.tail_pipe
502
+ @outgoing_scopes[name] = aggregations.scope
616
503
 
617
- # Builds a pipe that includes just the tuples matching an expression.
618
- #
619
- # The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
620
- #
621
- # The named options are:
622
- # * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
623
- # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
624
- # expression-based.
625
- # * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
626
- # expression validation. Defaults to true.
627
- # * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
628
- # expression validation. Defaults to {}.
629
- def where(*args)
630
- options = args.extract_options
631
- raise "Regex not allowed" if options && options[:pattern]
632
-
633
- if options[:expression]
634
- _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
635
- options[:expression] = "#{imports}!(#{expr})"
636
- elsif args[0]
637
- _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
638
- args[0] = "#{imports}!(#{expr})"
504
+ group
639
505
  end
640
-
641
- filter(*args)
642
506
  end
643
507
 
644
- # Builds a pipe that evaluates the specified Janino expression and inserts it in a new field in the tuple.
645
- #
646
- # The named options are:
647
- # * <tt>:from</tt> a string or array of strings. Specifies the input fields.
648
- # * <tt>:expression</tt> a string. The Janino expression.
649
- # * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
650
- # * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
651
- def eval_expression(*args)
652
- options = args.extract_options!
653
-
654
- into = options.delete(:into)
655
- from = options.delete(:from) || all_fields
656
- output = options.delete(:output) || all_fields
657
- options[:expression] ||= args.shift
658
- options[:parameters] ||= args.shift
659
-
660
- each from, :function => expression_function(into, options), :output=>output
661
- end
508
+ def prepare_join(assembly_names, options, &block)
509
+ pipes, _ = populate_incoming_scopes(assembly_names)
662
510
 
663
- # Builds a pipe that returns distinct tuples based on the provided fields.
664
- #
665
- # The method accepts an optional unnamed argument specifying the fields to base the distinct on
666
- # (all fields, by default).
667
- def distinct(*args)
668
- raise "Distinct is badly broken"
669
- fields = args[0] || all_fields
670
- group_by *fields
671
- pass
672
- end
673
-
674
- def join_fields(*args)
675
- options = args.extract_options!
676
- output = options[:output] || all_fields
511
+ group_fields_args = options[:on]
512
+ raise 'join requires :on parameter' unless group_fields_args
677
513
 
678
- each args, :function => field_joiner(options), :output => output
679
- end
514
+ if group_fields_args.kind_of?(String)
515
+ group_fields_args = [group_fields_args]
516
+ end
680
517
 
681
- # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html).
682
- #
683
- # You must provide :key and you must provide only one of :value_selectors
684
- # and :num_values.
685
- #
686
- # The named options are:
687
- # * <tt>:key</tt> required array of field names to replicate on every
688
- # output row in an ungrouped group.
689
- # * <tt>:value_selectors</tt> an array of field names to ungroup. Each
690
- # field will be ungrouped into an output tuple along with the key fields
691
- # in the order provided.
692
- # * <tt>:num_values</tt> an integer specifying the number of fields to
693
- # ungroup into each output tuple (excluding the key fields). All input
694
- # fields will be ungrouped.
695
- # * <tt>:input</tt> an array of field names that specifies the fields to
696
- # input to UnGroup. Defaults to all_fields.
697
- # * <tt>:into</tt> an array of field names. Default set by UnGroup.
698
- # * <tt>:output</tt> an array of field names that specifies the fields to
699
- # produce as output of UnGroup. Defaults to all_fields.
700
- def ungroup(*args)
701
- options = args.extract_options!
702
- input = options[:input] || all_fields
703
- into = fields(options[:into])
704
- output = options[:output] || all_fields
705
- key = fields(options[:key])
706
-
707
- raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
708
- value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
709
- num_values = options[:num_values] if options.has_key?(:num_values)
710
-
711
- parameters = [into, key, value_selectors, num_values].compact
712
- each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
713
- end
518
+ group_fields = []
519
+ if group_fields_args.kind_of?(Array)
520
+ pipes.size.times do
521
+ group_fields << fields(group_fields_args)
522
+ end
523
+ elsif group_fields_args.kind_of?(Hash)
524
+ pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
525
+ else
526
+ raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
527
+ end
714
528
 
715
- # Inserts one of two values into the dataflow based upon the result of the
716
- # supplied filter on the input fields. This is primarily useful for
717
- # creating indicators from filters.
718
- #
719
- # Parameters:
720
- # * <tt>input</tt> name of field to apply the filter.
721
- # * <tt>filter</tt> Cascading Filter to apply.
722
- # * <tt>keep_value</tt> Java value to produce when the filter would keep
723
- # the given input.
724
- # * <tt>remove_value</tt> Java value to produce when the filter would
725
- # remove the given input.
726
- #
727
- # The named options are:
728
- # * <tt>:into</tt> an output field name, defaulting to 'filter_value'.
729
- # * <tt>:output</tt> an array of field names that specifies the fields to
730
- # retain in the output tuple. Defaults to all_fields.
731
- def set_value(input, filter, keep_value, remove_value, params = {})
732
- into = fields(params[:into] || 'filter_value')
733
- output = params[:output] || all_fields
734
- each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output
735
- end
529
+ raise 'join requires non-empty :on parameter' if group_fields_args.empty?
530
+ group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
531
+ incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
532
+ declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
533
+ joiner = options[:joiner]
534
+ is_hash_join = options[:hash] || false
736
535
 
737
- # Efficient way of inserting a null indicator for any field, even one that
738
- # cannot be coerced to a string. This is accomplished using Cascading's
739
- # FilterNull and SetValue operators rather than Janino. 1 is produced if
740
- # the field is null and 0 otherwise.
741
- #
742
- # Parameters:
743
- # * <tt>input</tt> name of field to check for null.
744
- #
745
- # The named options are:
746
- # * <tt>:into</tt> an output field name, defaulting to 'is_null'.
747
- # * <tt>:output</tt> an array of field names that specifies the fields to
748
- # retain in the output tuple. Defaults to all_fields.
749
- def null_indicator(input, params = {})
750
- into = fields(params[:into] || 'is_null')
751
- output = params[:output] || all_fields
752
- set_value input, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, :into => into, :output => output
753
- end
536
+ case joiner
537
+ when :inner, 'inner', nil
538
+ joiner = Java::CascadingPipeJoiner::InnerJoin.new
539
+ when :left, 'left'
540
+ joiner = Java::CascadingPipeJoiner::LeftJoin.new
541
+ when :right, 'right'
542
+ joiner = Java::CascadingPipeJoiner::RightJoin.new
543
+ when :outer, 'outer'
544
+ joiner = Java::CascadingPipeJoiner::OuterJoin.new
545
+ when Array
546
+ joiner = joiner.map do |t|
547
+ case t
548
+ when true, 1, :inner then true
549
+ when false, 0, :outer then false
550
+ else fail "invalid mixed joiner entry: #{t}"
551
+ end
552
+ end
553
+ joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
554
+ end
754
555
 
755
- # Given a field and a regex, returns an indicator that is 1 if the string
756
- # contains at least 1 match and 0 otherwise.
757
- #
758
- # Parameters:
759
- # * <tt>input</tt> field name or names that specifies the fields over which
760
- # to perform the match.
761
- # * <tt>pattern</tt> regex to apply to the input.
762
- #
763
- # The named options are:
764
- # * <tt>:into</tt> an output field name, defaulting to 'regex_contains'.
765
- # * <tt>:output</tt> an array of field names that specifies the fields to
766
- # retain in the output tuple. Defaults to all_fields.
767
- def regex_contains(input, pattern, params = {})
768
- input = fields(input)
769
- pattern = pattern.to_s # Supports JRuby regexes
770
- into = fields(params[:into] || 'regex_contains')
771
- output = params[:output] || all_fields
772
- set_value input, Java::CascadingOperationRegex::RegexFilter.new(pattern), 1.to_java, 0.to_java, :into => into, :output => output
556
+ if is_hash_join
557
+ parameters = [
558
+ pipes.to_java(Java::CascadingPipe::Pipe),
559
+ group_fields,
560
+ declared_fields,
561
+ joiner
562
+ ]
563
+ group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
564
+ else
565
+ result_group_fields = dedup_fields(*group_fields)
566
+ parameters = [
567
+ pipes.to_java(Java::CascadingPipe::Pipe),
568
+ group_fields,
569
+ declared_fields,
570
+ result_group_fields,
571
+ joiner
572
+ ]
573
+ group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
574
+ end
575
+ apply_aggregations(group_assembly, @incoming_scopes, &block)
773
576
  end
774
577
  end
775
578
  end