cascading.jruby 0.0.10 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +13 -160
- data/README.md +35 -0
- data/lib/cascading.rb +8 -41
- data/lib/cascading/aggregations.rb +216 -71
- data/lib/cascading/assembly.rb +409 -606
- data/lib/cascading/base.rb +22 -0
- data/lib/cascading/cascade.rb +55 -18
- data/lib/cascading/cascading.rb +137 -47
- data/lib/cascading/expr_stub.rb +31 -17
- data/lib/cascading/ext/array.rb +17 -0
- data/lib/cascading/filter_operations.rb +101 -0
- data/lib/cascading/flow.rb +87 -23
- data/lib/cascading/identity_operations.rb +82 -0
- data/lib/cascading/mode.rb +14 -10
- data/lib/cascading/operations.rb +109 -174
- data/lib/cascading/regex_operations.rb +133 -0
- data/lib/cascading/scope.rb +32 -9
- data/lib/cascading/sub_assembly.rb +8 -5
- data/lib/cascading/tap.rb +41 -17
- data/lib/cascading/text_operations.rb +67 -0
- data/test/mock_assemblies.rb +55 -0
- data/test/test_assembly.rb +23 -25
- data/test/test_local_execution.rb +7 -7
- data/test/test_operations.rb +0 -10
- metadata +76 -74
- data/History.txt +0 -58
data/lib/cascading/assembly.rb
CHANGED
@@ -1,15 +1,50 @@
|
|
1
1
|
require 'cascading/base'
|
2
2
|
require 'cascading/operations'
|
3
|
+
require 'cascading/identity_operations'
|
4
|
+
require 'cascading/filter_operations'
|
5
|
+
require 'cascading/regex_operations'
|
6
|
+
require 'cascading/text_operations'
|
3
7
|
require 'cascading/aggregations'
|
4
8
|
require 'cascading/sub_assembly'
|
5
9
|
require 'cascading/ext/array'
|
6
10
|
|
7
11
|
module Cascading
|
12
|
+
# An Assembly is a sequence of Cascading pipes (Each, GroupBy, CoGroup,
|
13
|
+
# Every, and SubAssembly). This class will serve as your primary mechanism
|
14
|
+
# for doing work within a flow and contains all the functions and filters you
|
15
|
+
# will apply to a pipe (Eaches), as well as group_by, union, and join. For
|
16
|
+
# aggregators and buffers, please see Aggregations.
|
17
|
+
#
|
18
|
+
# Function and filter DSL rules:
|
19
|
+
# * Use positional arguments for required parameters
|
20
|
+
# * Use options = {} for optional parameters
|
21
|
+
# * Use *args sparingly, specifically when you need to accept a varying length list of fields
|
22
|
+
# * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension
|
23
|
+
# * If you choose to name a required parameter, add it to options = {} and throw an exception if the caller does not provide it
|
24
|
+
# * If you have a required parameter that is provided by one of a set of option names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example)
|
25
|
+
#
|
26
|
+
# Function and filter DSL standard optional parameter names:
|
27
|
+
# [input] c.p.Each argument selector
|
28
|
+
# [into] c.o.Operation field declaration
|
29
|
+
# [output] c.p.Each output selector
|
30
|
+
#
|
31
|
+
# A note on aliases: when a DSL method uniquely wraps a single Cascading
|
32
|
+
# operation, we attempt to provide an alias that matches the Cascading
|
33
|
+
# operation. However, Cascading operations are often nouns rather than verbs,
|
34
|
+
# and the latter are preferable for a dataflow DSL.
|
8
35
|
class Assembly < Cascading::Node
|
9
|
-
include Operations
|
10
|
-
|
11
36
|
attr_reader :head_pipe, :tail_pipe
|
12
37
|
|
38
|
+
# Do not use this constructor directly; instead, use Flow#assembly or
|
39
|
+
# Assembly#branch to build assemblies.
|
40
|
+
#
|
41
|
+
# Builds an Assembly given a name, parent, and optional outgoing_scopes
|
42
|
+
# (necessary only for branching).
|
43
|
+
#
|
44
|
+
# An assembly's name is quite important as it will determine:
|
45
|
+
# * The sources from which it will read, if any
|
46
|
+
# * The name to be used in joins or unions downstream
|
47
|
+
# * The name to be used to sink the output of the assembly downstream
|
13
48
|
def initialize(name, parent, outgoing_scopes = {})
|
14
49
|
super(name, parent)
|
15
50
|
|
@@ -27,6 +62,11 @@ module Cascading
|
|
27
62
|
@incoming_scopes = [scope]
|
28
63
|
end
|
29
64
|
|
65
|
+
# Produces a textual description of this Assembly. The description details
|
66
|
+
# the structure of the Assembly, its input and output fields and any
|
67
|
+
# children (branches). The offset parameter allows for this describe to be
|
68
|
+
# nested within a calling context, which lets us indent the structural
|
69
|
+
# hierarchy of a job.
|
30
70
|
def describe(offset = '')
|
31
71
|
incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
|
32
72
|
incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1
|
@@ -35,199 +75,231 @@ module Cascading
|
|
35
75
|
description
|
36
76
|
end
|
37
77
|
|
78
|
+
# Rather than the immediate parent, this method returns the parent flow of
|
79
|
+
# this Assembly. If this is a branch, we must traverse the parents of
|
80
|
+
# parent assemblies.
|
38
81
|
def parent_flow
|
39
82
|
return parent if parent.kind_of?(Flow)
|
40
83
|
parent.parent_flow
|
41
84
|
end
|
42
85
|
|
86
|
+
# Accesses the outgoing scope of this Assembly at the point at which it is
|
87
|
+
# called. This is useful for grabbing the values_fields at any point in
|
88
|
+
# the construction of the Assembly. See Scope for details.
|
43
89
|
def scope
|
44
90
|
@outgoing_scopes[name]
|
45
91
|
end
|
46
92
|
|
93
|
+
# Prints information about the scope of this Assembly at the point at which
|
94
|
+
# it is called. This allows you to trace the propagation of field names
|
95
|
+
# through your job and is handy for debugging. See Scope for details.
|
47
96
|
def debug_scope
|
48
97
|
puts "Current scope for '#{name}':\n #{scope}\n----------\n"
|
49
98
|
end
|
50
99
|
|
51
|
-
|
52
|
-
|
53
|
-
@outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
|
54
|
-
|
55
|
-
tail_pipe
|
56
|
-
end
|
57
|
-
private :make_pipe
|
58
|
-
|
59
|
-
def populate_incoming_scopes(assembly_names, group_fields_args = {})
|
60
|
-
# NOTE: this overrides the existing incoming_scopes, which changes the
|
61
|
-
# way describe will function on this assembly
|
62
|
-
pipes, @incoming_scopes, group_fields = [], [], []
|
63
|
-
assembly_names.each do |assembly_name|
|
64
|
-
assembly = parent_flow.find_child(assembly_name)
|
65
|
-
raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
|
66
|
-
|
67
|
-
pipes << assembly.tail_pipe
|
68
|
-
@incoming_scopes << assembly.scope
|
69
|
-
group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
|
70
|
-
end
|
71
|
-
[pipes, group_fields]
|
72
|
-
end
|
73
|
-
private :populate_incoming_scopes
|
74
|
-
|
75
|
-
def apply_aggregations(group, incoming_scopes, &block)
|
76
|
-
aggregations = Aggregations.new(self, group, incoming_scopes)
|
77
|
-
aggregations.instance_eval(&block) if block_given?
|
78
|
-
|
79
|
-
# Sorting of any type means that we cannot use the AggregateBy optimization
|
80
|
-
if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
|
81
|
-
grouping_fields = group.key_selectors.values.first
|
82
|
-
group.key_selectors.values.each do |key_fields|
|
83
|
-
raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
|
84
|
-
end
|
85
|
-
|
86
|
-
aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
|
87
|
-
name,
|
88
|
-
group.previous,
|
89
|
-
grouping_fields,
|
90
|
-
aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
|
91
|
-
), group.previous, incoming_scopes)
|
92
|
-
|
93
|
-
aggregate_by
|
94
|
-
else
|
95
|
-
aggregations.finalize if block_given?
|
96
|
-
@tail_pipe = aggregations.tail_pipe
|
97
|
-
@outgoing_scopes[name] = aggregations.scope
|
98
|
-
|
99
|
-
group
|
100
|
-
end
|
101
|
-
end
|
102
|
-
private :apply_aggregations
|
103
|
-
|
100
|
+
# Prints detail about this Assembly including its name, head pipe, and tail
|
101
|
+
# pipe.
|
104
102
|
def to_s
|
105
103
|
"#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}"
|
106
104
|
end
|
107
105
|
|
108
|
-
def prepare_join(*args, &block)
|
109
|
-
options = args.extract_options!
|
110
|
-
|
111
|
-
pipes, _ = populate_incoming_scopes(args)
|
112
|
-
|
113
|
-
group_fields_args = options[:on]
|
114
|
-
raise 'join requires :on parameter' unless group_fields_args
|
115
|
-
|
116
|
-
if group_fields_args.kind_of?(String)
|
117
|
-
group_fields_args = [group_fields_args]
|
118
|
-
end
|
119
|
-
|
120
|
-
group_fields = []
|
121
|
-
if group_fields_args.kind_of?(Array)
|
122
|
-
pipes.size.times do
|
123
|
-
group_fields << fields(group_fields_args)
|
124
|
-
end
|
125
|
-
elsif group_fields_args.kind_of?(Hash)
|
126
|
-
pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
|
127
|
-
else
|
128
|
-
raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
|
129
|
-
end
|
130
|
-
|
131
|
-
raise 'join requires non-empty :on parameter' if group_fields_args.empty?
|
132
|
-
group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
|
133
|
-
incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
|
134
|
-
declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
|
135
|
-
joiner = options[:joiner]
|
136
|
-
is_hash_join = options[:hash] || false
|
137
|
-
|
138
|
-
case joiner
|
139
|
-
when :inner, 'inner', nil
|
140
|
-
joiner = Java::CascadingPipeJoiner::InnerJoin.new
|
141
|
-
when :left, 'left'
|
142
|
-
joiner = Java::CascadingPipeJoiner::LeftJoin.new
|
143
|
-
when :right, 'right'
|
144
|
-
joiner = Java::CascadingPipeJoiner::RightJoin.new
|
145
|
-
when :outer, 'outer'
|
146
|
-
joiner = Java::CascadingPipeJoiner::OuterJoin.new
|
147
|
-
when Array
|
148
|
-
joiner = joiner.map do |t|
|
149
|
-
case t
|
150
|
-
when true, 1, :inner then true
|
151
|
-
when false, 0, :outer then false
|
152
|
-
else fail "invalid mixed joiner entry: #{t}"
|
153
|
-
end
|
154
|
-
end
|
155
|
-
joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
|
156
|
-
end
|
157
|
-
|
158
|
-
if is_hash_join
|
159
|
-
raise ArgumentError, "hash joins don't support aggregations" if block_given?
|
160
|
-
parameters = [
|
161
|
-
pipes.to_java(Java::CascadingPipe::Pipe),
|
162
|
-
group_fields,
|
163
|
-
declared_fields,
|
164
|
-
joiner
|
165
|
-
]
|
166
|
-
group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
|
167
|
-
else
|
168
|
-
result_group_fields = dedup_fields(*group_fields)
|
169
|
-
parameters = [
|
170
|
-
pipes.to_java(Java::CascadingPipe::Pipe),
|
171
|
-
group_fields,
|
172
|
-
declared_fields,
|
173
|
-
result_group_fields,
|
174
|
-
joiner
|
175
|
-
]
|
176
|
-
group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
|
177
|
-
end
|
178
|
-
apply_aggregations(group_assembly, @incoming_scopes, &block)
|
179
|
-
end
|
180
|
-
private :prepare_join
|
181
|
-
|
182
106
|
# Builds a HashJoin pipe. This should be used carefully, as the right side
|
183
|
-
# of the join is accumulated entirely in memory. Requires a list of
|
184
|
-
# names to join and :on to specify the join_fields.
|
185
|
-
|
186
|
-
|
107
|
+
# of the join is accumulated entirely in memory. Requires a list of
|
108
|
+
# assembly names to join and :on to specify the join_fields. Note that a
|
109
|
+
# hash_join "takes over" the Assembly in which it is built, so it is
|
110
|
+
# typically the first statement within the block of the assembly or branch.
|
111
|
+
# Additionally, a hash join does not accept a block for aggregations like
|
112
|
+
# other joins; this restriction is enforced here, but comes directly from
|
113
|
+
# Cascading.
|
114
|
+
#
|
115
|
+
# The named options are:
|
116
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
117
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
118
|
+
# differ across inputs.
|
119
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
120
|
+
# names (see Cascading::dedup_fields). Specifies the
|
121
|
+
# names of the fields that will be available to
|
122
|
+
# aggregations or post-join if no aggregations are
|
123
|
+
# specified.
|
124
|
+
# [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
|
125
|
+
# and 'inner', :right and 'right' are accepted, as well as an
|
126
|
+
# array specifying mixed joins. Typically, this is not provided,
|
127
|
+
# but one of the higher level join methods on Assembly is used
|
128
|
+
# directly (like Assembly#inner_join or Assembly#right_join).
|
129
|
+
#
|
130
|
+
# Example:
|
131
|
+
# assembly 'join_left_right' do
|
132
|
+
# hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner
|
133
|
+
# end
|
134
|
+
def hash_join(*args_with_options)
|
135
|
+
raise ArgumentError, "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored" if block_given?
|
136
|
+
|
137
|
+
options, assembly_names = args_with_options.extract_options!, args_with_options
|
187
138
|
options[:hash] = true
|
188
|
-
|
189
|
-
prepare_join(*args, &block)
|
139
|
+
prepare_join(assembly_names, options)
|
190
140
|
end
|
191
141
|
|
192
142
|
# Builds a join (CoGroup) pipe. Requires a list of assembly names to join
|
193
|
-
# and :on to specify the group_fields.
|
194
|
-
|
195
|
-
|
143
|
+
# and :on to specify the group_fields. Note that a join "takes over" the
|
144
|
+
# Assembly in which it is built, so it is typically the first statement
|
145
|
+
# within the block of the assembly or branch. The block passed to this
|
146
|
+
# method will be evaluated in the context of Aggregations, not Assembly.
|
147
|
+
#
|
148
|
+
# The named options are:
|
149
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
150
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
151
|
+
# differ across inputs.
|
152
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
153
|
+
# names (see Cascading::dedup_fields). Specifies the
|
154
|
+
# names of the fields that will be available to
|
155
|
+
# aggregations or post-join if no aggregations are
|
156
|
+
# specified.
|
157
|
+
# [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
|
158
|
+
# and 'inner', :right and 'right' are accepted, as well as an
|
159
|
+
# array specifying mixed joins. Typically, this is not provided,
|
160
|
+
# but one of the higher level join methods on Assembly is used
|
161
|
+
# directly (like Assembly#inner_join or Assembly#right_join).
|
162
|
+
#
|
163
|
+
# Example:
|
164
|
+
# assembly 'join_left_right' do
|
165
|
+
# join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do
|
166
|
+
# sum 'val1', 'val2', :type => :long
|
167
|
+
# end
|
168
|
+
# end
|
169
|
+
def join(*args_with_options, &block)
|
170
|
+
options, assembly_names = args_with_options.extract_options!, args_with_options
|
196
171
|
options[:hash] = false
|
197
|
-
|
198
|
-
prepare_join(*args, &block)
|
172
|
+
prepare_join(assembly_names, options, &block)
|
199
173
|
end
|
200
174
|
alias co_group join
|
201
175
|
|
202
|
-
|
203
|
-
|
176
|
+
# Builds an inner join (CoGroup) pipe. Requires a list of assembly names to
|
177
|
+
# join and :on to specify the group_fields.
|
178
|
+
#
|
179
|
+
# The named options are:
|
180
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
181
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
182
|
+
# differ across inputs.
|
183
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
184
|
+
# names (see Cascading::dedup_fields). Specifies the
|
185
|
+
# names of the fields that will be available to
|
186
|
+
# aggregations or post-join if no aggregations are
|
187
|
+
# specified.
|
188
|
+
#
|
189
|
+
# Example:
|
190
|
+
# assembly 'join_left_right' do
|
191
|
+
# inner_join 'left', 'right', :on => ['key1', 'key2'] do
|
192
|
+
# sum 'val1', 'val2', :type => :long
|
193
|
+
# end
|
194
|
+
# end
|
195
|
+
def inner_join(*args_with_options, &block)
|
196
|
+
options = args_with_options.extract_options!
|
204
197
|
options[:joiner] = :inner
|
205
|
-
|
206
|
-
join(*
|
198
|
+
args_with_options << options
|
199
|
+
join(*args_with_options, &block)
|
207
200
|
end
|
208
201
|
|
209
|
-
|
210
|
-
|
202
|
+
# Builds a left join (CoGroup) pipe. Requires a list of assembly names to
|
203
|
+
# join and :on to specify the group_fields.
|
204
|
+
#
|
205
|
+
# The named options are:
|
206
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
207
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
208
|
+
# differ across inputs.
|
209
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
210
|
+
# names (see Cascading::dedup_fields). Specifies the
|
211
|
+
# names of the fields that will be available to
|
212
|
+
# aggregations or post-join if no aggregations are
|
213
|
+
# specified.
|
214
|
+
#
|
215
|
+
# Example:
|
216
|
+
# assembly 'join_left_right' do
|
217
|
+
# left_join 'left', 'right', :on => ['key1', 'key2'] do
|
218
|
+
# sum 'val1', 'val2', :type => :long
|
219
|
+
# end
|
220
|
+
# end
|
221
|
+
def left_join(*args_with_options, &block)
|
222
|
+
options = args_with_options.extract_options!
|
211
223
|
options[:joiner] = :left
|
212
|
-
|
213
|
-
join(*
|
224
|
+
args_with_options << options
|
225
|
+
join(*args_with_options, &block)
|
214
226
|
end
|
215
227
|
|
216
|
-
|
217
|
-
|
228
|
+
# Builds a right join (CoGroup) pipe. Requires a list of assembly names to
|
229
|
+
# join and :on to specify the group_fields.
|
230
|
+
#
|
231
|
+
# The named options are:
|
232
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
233
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
234
|
+
# differ across inputs.
|
235
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
236
|
+
# names (see Cascading::dedup_fields). Specifies the
|
237
|
+
# names of the fields that will be available to
|
238
|
+
# aggregations or post-join if no aggregations are
|
239
|
+
# specified.
|
240
|
+
#
|
241
|
+
# Example:
|
242
|
+
# assembly 'join_left_right' do
|
243
|
+
# right_join 'left', 'right', :on => ['key1', 'key2'] do
|
244
|
+
# sum 'val1', 'val2', :type => :long
|
245
|
+
# end
|
246
|
+
# end
|
247
|
+
def right_join(*args_with_options, &block)
|
248
|
+
options = args_with_options.extract_options!
|
218
249
|
options[:joiner] = :right
|
219
|
-
|
220
|
-
join(*
|
250
|
+
args_with_options << options
|
251
|
+
join(*args_with_options, &block)
|
221
252
|
end
|
222
253
|
|
223
|
-
|
224
|
-
|
254
|
+
# Builds an outer join (CoGroup) pipe. Requires a list of assembly names to
|
255
|
+
# join and :on to specify the group_fields.
|
256
|
+
#
|
257
|
+
# The named options are:
|
258
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
259
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
260
|
+
# differ across inputs.
|
261
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
262
|
+
# names (see Cascading::dedup_fields). Specifies the
|
263
|
+
# names of the fields that will be available to
|
264
|
+
# aggregations or post-join if no aggregations are
|
265
|
+
# specified.
|
266
|
+
#
|
267
|
+
# Example:
|
268
|
+
# assembly 'join_left_right' do
|
269
|
+
# outer_join 'left', 'right', :on => ['key1', 'key2'] do
|
270
|
+
# sum 'val1', 'val2', :type => :long
|
271
|
+
# end
|
272
|
+
# end
|
273
|
+
def outer_join(*args_with_options, &block)
|
274
|
+
options = args_with_options.extract_options!
|
225
275
|
options[:joiner] = :outer
|
226
|
-
|
227
|
-
join(*
|
276
|
+
args_with_options << options
|
277
|
+
join(*args_with_options, &block)
|
228
278
|
end
|
229
279
|
|
230
|
-
# Builds a
|
280
|
+
# Builds a child Assembly that branches this Assembly given a name and
|
281
|
+
# block.
|
282
|
+
#
|
283
|
+
# An assembly's name is quite important as it will determine:
|
284
|
+
# * The sources from which it will read, if any
|
285
|
+
# * The name to be used in joins or unions downstream
|
286
|
+
# * The name to be used to sink the output of the assembly downstream
|
287
|
+
#
|
288
|
+
# Many branches may be built within an assembly. The result of a branch is
|
289
|
+
# the same as the Flow#assembly constructor, an Assembly object.
|
290
|
+
#
|
291
|
+
# Example:
|
292
|
+
# assembly 'some_work' do
|
293
|
+
# ...
|
294
|
+
#
|
295
|
+
# branch 'more_work' do
|
296
|
+
# ...
|
297
|
+
# end
|
298
|
+
#
|
299
|
+
# branch 'yet_more_work' do
|
300
|
+
# ...
|
301
|
+
# end
|
302
|
+
# end
|
231
303
|
def branch(name, &block)
|
232
304
|
raise "Could not build branch '#{name}'; block required" unless block_given?
|
233
305
|
assembly = Assembly.new(name, self, @outgoing_scopes)
|
@@ -236,11 +308,27 @@ module Cascading
|
|
236
308
|
assembly
|
237
309
|
end
|
238
310
|
|
239
|
-
# Builds a new GroupBy pipe that groups on the fields given in
|
240
|
-
#
|
241
|
-
|
242
|
-
|
243
|
-
|
311
|
+
# Builds a new GroupBy pipe that groups on the fields given in
|
312
|
+
# args_with_options. The block passed to this method will be evaluated in
|
313
|
+
# the context of Aggregations, not Assembly.
|
314
|
+
#
|
315
|
+
# The named options are:
|
316
|
+
# [sort_by] Optional keys for within-group sort.
|
317
|
+
# [reverse] Boolean that can reverse the order of within-group sorting
|
318
|
+
# (only makes sense given :sort_by keys).
|
319
|
+
#
|
320
|
+
# Example:
|
321
|
+
# assembly 'total' do
|
322
|
+
# ...
|
323
|
+
# insert 'const' => 1
|
324
|
+
# group_by 'const' do
|
325
|
+
# count
|
326
|
+
# sum 'val1', 'val2', :type => :long
|
327
|
+
# end
|
328
|
+
# discard 'const'
|
329
|
+
# end
|
330
|
+
def group_by(*args_with_options, &block)
|
331
|
+
options, group_fields = args_with_options.extract_options!, fields(args_with_options)
|
244
332
|
sort_fields = fields(options[:sort_by])
|
245
333
|
reverse = options[:reverse]
|
246
334
|
|
@@ -251,16 +339,31 @@ module Cascading
|
|
251
339
|
# Unifies multiple incoming pipes sharing the same field structure using a
|
252
340
|
# GroupBy. Accepts :on like join and :sort_by and :reverse like group_by,
|
253
341
|
# as well as a block which may be used for a sequence of Every
|
254
|
-
# aggregations.
|
342
|
+
# aggregations. The block passed to this method will be evaluated in the
|
343
|
+
# context of Aggregations, not Assembly.
|
255
344
|
#
|
256
345
|
# By default, groups only on the first field (see line 189 of GroupBy.java)
|
257
|
-
|
258
|
-
|
346
|
+
#
|
347
|
+
# The named options are:
|
348
|
+
# [on] The keys of the union, which defaults to the first field in the
|
349
|
+
# first input assembly.
|
350
|
+
# [sort_by] Optional keys for sorting.
|
351
|
+
# [reverse] Boolean that can reverse the order of sorting
|
352
|
+
# (only makes sense given :sort_by keys).
|
353
|
+
#
|
354
|
+
# Example:
|
355
|
+
# assembly 'union_left_right' do
|
356
|
+
# union 'left', 'right' do
|
357
|
+
# sum 'val1', 'val2', :type => :long
|
358
|
+
# end
|
359
|
+
# end
|
360
|
+
def union(*args_with_options, &block)
|
361
|
+
options, assembly_names = args_with_options.extract_options!, args_with_options
|
259
362
|
group_fields = fields(options[:on])
|
260
363
|
sort_fields = fields(options[:sort_by])
|
261
364
|
reverse = options[:reverse]
|
262
365
|
|
263
|
-
pipes, _ = populate_incoming_scopes(
|
366
|
+
pipes, _ = populate_incoming_scopes(assembly_names)
|
264
367
|
|
265
368
|
# Must provide group_fields to ensure field name propagation
|
266
369
|
group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields
|
@@ -273,10 +376,15 @@ module Cascading
|
|
273
376
|
end
|
274
377
|
alias :union_pipes :union
|
275
378
|
|
276
|
-
# Allows you to plugin c.p.SubAssemblies to
|
277
|
-
#
|
278
|
-
#
|
279
|
-
#
|
379
|
+
# Allows you to plug in c.p.SubAssemblies to an Assembly under certain
|
380
|
+
# assumptions. Note the default is to extend the tail pipe of this
|
381
|
+
# Assembly using a linear SubAssembly. See SubAssembly class for details.
|
382
|
+
#
|
383
|
+
# Example:
|
384
|
+
# assembly 'id_rows' do
|
385
|
+
# ...
|
386
|
+
# sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id'))
|
387
|
+
# end
|
280
388
|
def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
|
281
389
|
sub_assembly = SubAssembly.new(self, sub_assembly)
|
282
390
|
sub_assembly.finalize(pipes, incoming_scopes)
|
@@ -287,17 +395,24 @@ module Cascading
|
|
287
395
|
sub_assembly
|
288
396
|
end
|
289
397
|
|
290
|
-
# Builds a basic
|
291
|
-
#
|
398
|
+
# Builds a basic each pipe and adds it to the current Assembly. Default
|
399
|
+
# arguments are all_fields, a default inherited from c.p.Each. Exactly one
|
400
|
+
# of :function and :filter must be specified and filters do not support an
|
401
|
+
# :output selector.
|
402
|
+
#
|
403
|
+
# The named options are:
|
404
|
+
# [filter] A Cascading Filter, mutually exclusive with :function.
|
405
|
+
# [function] A Cascading Function, mutually exclusive with :filter.
|
406
|
+
# [output] c.p.Each output selector, only valid with :function.
|
407
|
+
#
|
292
408
|
# Example:
|
293
|
-
#
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
out_fields = fields(options[:output])
|
299
|
-
|
409
|
+
# each fields(input_fields), :function => Java::CascadingOperation::Identity.new
|
410
|
+
# each 'field1', 'field2', :function => Java::CascadingOperation::Identity.new
|
411
|
+
def each(*args_with_options)
|
412
|
+
options, in_fields = args_with_options.extract_options!, fields(args_with_options)
|
413
|
+
out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each
|
300
414
|
operation = options[:filter] || options[:function]
|
415
|
+
raise 'each requires either :filter or :function' unless operation
|
301
416
|
raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output]
|
302
417
|
|
303
418
|
parameters = [tail_pipe, in_fields, operation, out_fields].compact
|
@@ -308,468 +423,156 @@ module Cascading
|
|
308
423
|
each
|
309
424
|
end
|
310
425
|
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
each fields(args), :function => Java::CascadingOperation::Identity.new
|
317
|
-
end
|
318
|
-
|
319
|
-
# Removes the specified fields from the current assembly.
|
320
|
-
# --
|
321
|
-
# Example:
|
322
|
-
# discard "field1", "field2"
|
323
|
-
def discard(*args)
|
324
|
-
discard_fields = fields(args)
|
325
|
-
keep_fields = difference_fields(scope.values_fields, discard_fields)
|
326
|
-
project(*keep_fields.to_a)
|
327
|
-
end
|
328
|
-
|
329
|
-
# Renames fields according to the mapping provided.
|
330
|
-
# --
|
331
|
-
# Example:
|
332
|
-
# rename "old_name" => "new_name"
|
333
|
-
def rename(name_map)
|
334
|
-
old_names = scope.values_fields.to_a
|
335
|
-
new_names = old_names.map{ |name| name_map[name] || name }
|
336
|
-
invalid = name_map.keys.sort - old_names
|
337
|
-
raise "invalid names: #{invalid.inspect}" unless invalid.empty?
|
338
|
-
|
339
|
-
each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
|
340
|
-
end
|
341
|
-
|
342
|
-
def cast(type_map)
|
343
|
-
names = type_map.keys.sort
|
344
|
-
types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names))
|
345
|
-
fields = fields(names)
|
346
|
-
types = types.to_java(java.lang.Class)
|
347
|
-
each fields, :function => Java::CascadingOperation::Identity.new(fields, types)
|
348
|
-
end
|
349
|
-
|
350
|
-
def copy(*args)
|
351
|
-
options = args.extract_options!
|
352
|
-
from = args[0] || all_fields
|
353
|
-
into = args[1] || options[:into] || all_fields
|
354
|
-
each fields(from), :function => Java::CascadingOperation::Identity.new(fields(into)), :output => all_fields
|
355
|
-
end
|
356
|
-
|
357
|
-
# A pipe that does nothing.
|
358
|
-
def pass(*args)
|
359
|
-
each all_fields, :function => Java::CascadingOperation::Identity.new
|
360
|
-
end
|
426
|
+
include Operations
|
427
|
+
include IdentityOperations
|
428
|
+
include FilterOperations
|
429
|
+
include RegexOperations
|
430
|
+
include TextOperations
|
361
431
|
|
362
|
-
|
363
|
-
|
364
|
-
|
432
|
+
# Builds an each assertion pipe given a c.o.a.Assertion and adds it to the
|
433
|
+
# current Assembly.
|
434
|
+
#
|
435
|
+
# The named options are:
|
436
|
+
# [level] The assertion level; defaults to strict.
|
437
|
+
def assert(assertion, options = {})
|
365
438
|
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
|
366
439
|
|
367
440
|
parameters = [tail_pipe, assertion_level, assertion]
|
368
441
|
make_pipe(Java::CascadingPipe::Each, parameters)
|
369
442
|
end
|
370
443
|
|
371
|
-
# Builds a
|
372
|
-
|
373
|
-
|
374
|
-
# output.
|
375
|
-
#
|
376
|
-
# The other named options are:
|
377
|
-
# * <tt>:print_fields</tt> a boolean. If is set to true, then it prints every 10 tuples.
|
378
|
-
#
|
379
|
-
def debug(*args)
|
380
|
-
options = args.extract_options!
|
381
|
-
print_fields = options[:print_fields] || true
|
382
|
-
parameters = [print_fields].compact
|
383
|
-
debug = Java::CascadingOperation::Debug.new(*parameters)
|
384
|
-
debug.print_tuple_every = options[:tuple_interval] || 1
|
385
|
-
debug.print_fields_every = options[:fields_interval] || 10
|
386
|
-
each(all_fields, :filter => debug)
|
387
|
-
end
|
388
|
-
|
389
|
-
# Builds a pipe that assert the size of the tuple is the size specified in parameter.
|
390
|
-
#
|
391
|
-
# The method accept an unique uname argument : a number indicating the size expected.
|
392
|
-
def assert_size_equals(*args)
|
393
|
-
options = args.extract_options!
|
394
|
-
assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0])
|
444
|
+
# Builds a pipe that asserts the size of the tuple is the specified size.
|
445
|
+
def assert_size_equals(size, options = {})
|
446
|
+
assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size)
|
395
447
|
assert(assertion, options)
|
396
448
|
end
|
397
449
|
|
398
|
-
#
|
399
|
-
def assert_not_null(
|
400
|
-
options = args.extract_options!
|
450
|
+
# Builds a pipe that asserts none of the fields in the tuple are null.
|
451
|
+
def assert_not_null(options = {})
|
401
452
|
assertion = Java::CascadingOperationAssertion::AssertNotNull.new
|
402
453
|
assert(assertion, options)
|
403
454
|
end
|
404
455
|
|
405
|
-
|
406
|
-
# using a specified regex pattern.
|
407
|
-
#
|
408
|
-
# If provided, the unamed arguments must be the fields to be parsed. If not provided, then all incoming
|
409
|
-
# fields are used.
|
410
|
-
#
|
411
|
-
# The named options are:
|
412
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
|
413
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
414
|
-
def parse(*args)
|
415
|
-
options = args.extract_options!
|
416
|
-
fields = args || all_fields
|
417
|
-
pattern = options[:pattern]
|
418
|
-
output = options[:output] || all_fields
|
419
|
-
each(fields, :function => regex_parser(pattern, options), :output => output)
|
420
|
-
end
|
456
|
+
private
|
421
457
|
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
# The second unnamed argument is an array of strings indicating the fields receiving the result of the split.
|
426
|
-
#
|
427
|
-
# The named options are:
|
428
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
|
429
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
430
|
-
def split(*args)
|
431
|
-
options = args.extract_options!
|
432
|
-
fields = options[:into] || args[1]
|
433
|
-
pattern = options[:pattern] || /[.,]*\s+/
|
434
|
-
output = options[:output] || all_fields
|
435
|
-
each(args[0], :function => regex_splitter(fields, :pattern => pattern), :output=>output)
|
436
|
-
end
|
437
|
-
|
438
|
-
# Builds a pipe that splits a field into new rows, using a specified regular expression.
|
439
|
-
#
|
440
|
-
# The first unnamed argument is the field to be split.
|
441
|
-
# The second unnamed argument is the field receiving the result of the split.
|
442
|
-
#
|
443
|
-
# The named options are:
|
444
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
|
445
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
446
|
-
def split_rows(*args)
|
447
|
-
options = args.extract_options!
|
448
|
-
fields = options[:into] || args[1]
|
449
|
-
pattern = options[:pattern] || /[.,]*\s+/
|
450
|
-
output = options[:output] || all_fields
|
451
|
-
each(args[0], :function => regex_split_generator(fields, :pattern => pattern), :output=>output)
|
452
|
-
end
|
453
|
-
|
454
|
-
# Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
|
455
|
-
#
|
456
|
-
# The first unnamed argument is the field to be matched against.
|
457
|
-
# The second unnamed argument is the field receiving the result of the match.
|
458
|
-
#
|
459
|
-
# The named options are:
|
460
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for matching the argument fields.
|
461
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
462
|
-
def match_rows(*args)
|
463
|
-
options = args.extract_options!
|
464
|
-
fields = options[:into] || args[1]
|
465
|
-
pattern = options[:pattern] || /[\w]+/
|
466
|
-
output = options[:output] || all_fields
|
467
|
-
each(args[0], :function => regex_generator(fields, :pattern => pattern), :output=>output)
|
468
|
-
end
|
469
|
-
|
470
|
-
# Builds a pipe that parses the specified field as a date using the provided format string.
|
471
|
-
# The unnamed argument specifies the field to parse.
|
472
|
-
#
|
473
|
-
# The named options are:
|
474
|
-
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
|
475
|
-
# the input argument.
|
476
|
-
# * <tt>:pattern</tt> a string. Specifies the date format.
|
477
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
478
|
-
def parse_date(*args)
|
479
|
-
options = args.extract_options!
|
480
|
-
field = options[:into] || "#{args[0]}_parsed"
|
481
|
-
output = options[:output] || all_fields
|
482
|
-
pattern = options[:pattern] || "yyyy/MM/dd"
|
483
|
-
|
484
|
-
each args[0], :function => date_parser(field, pattern), :output => output
|
485
|
-
end
|
458
|
+
def make_pipe(type, parameters)
|
459
|
+
@tail_pipe = type.new(*parameters)
|
460
|
+
@outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
|
486
461
|
|
487
|
-
|
488
|
-
#
|
489
|
-
# The unnamed argument specifies the field to format.
|
490
|
-
#
|
491
|
-
# The named options are:
|
492
|
-
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
|
493
|
-
# the input argument.
|
494
|
-
# * <tt>:pattern</tt> a string. Specifies the date format.
|
495
|
-
# * <tt>:timezone</tt> a string. Specifies the timezone (defaults to UTC).
|
496
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
497
|
-
def format_date(*args)
|
498
|
-
options = args.extract_options!
|
499
|
-
field = options[:into] || "#{args[0]}_formatted"
|
500
|
-
pattern = options[:pattern] || "yyyy/MM/dd"
|
501
|
-
output = options[:output] || all_fields
|
502
|
-
|
503
|
-
each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output
|
462
|
+
tail_pipe
|
504
463
|
end
|
505
464
|
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
# * <tt>:replacement</tt> a string. Specifies the replacement.
|
514
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
515
|
-
def replace(*args)
|
516
|
-
options = args.extract_options!
|
517
|
-
|
518
|
-
pattern = options[:pattern] || args[1]
|
519
|
-
replacement = options[:replacement] || args[2]
|
520
|
-
into = options[:into] || "#{args[0]}_replaced"
|
521
|
-
output = options[:output] || all_fields
|
522
|
-
|
523
|
-
each args[0], :function => regex_replace(into, pattern, replacement), :output => output
|
524
|
-
end
|
465
|
+
def populate_incoming_scopes(assembly_names, group_fields_args = {})
|
466
|
+
# NOTE: this overrides the existing incoming_scopes, which changes the
|
467
|
+
# way describe will function on this assembly
|
468
|
+
pipes, @incoming_scopes, group_fields = [], [], []
|
469
|
+
assembly_names.each do |assembly_name|
|
470
|
+
assembly = parent_flow.find_child(assembly_name)
|
471
|
+
raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
|
525
472
|
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
# and as values, the values they must contain. For example:
|
530
|
-
#
|
531
|
-
# insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
|
532
|
-
#
|
533
|
-
# will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing
|
534
|
-
# the formatted current date.
|
535
|
-
# The method outputs all fields.
|
536
|
-
# The named options are:
|
537
|
-
def insert(args)
|
538
|
-
args.keys.sort.each do |field_name|
|
539
|
-
value = args[field_name]
|
540
|
-
|
541
|
-
if value.kind_of?(ExprStub)
|
542
|
-
value.validate_scope(scope)
|
543
|
-
each all_fields, :function => expression_function(field_name, :expression => value.expression, :parameters => value.types), :output => all_fields
|
544
|
-
else
|
545
|
-
each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields
|
546
|
-
end
|
473
|
+
pipes << assembly.tail_pipe
|
474
|
+
@incoming_scopes << assembly.scope
|
475
|
+
group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
|
547
476
|
end
|
477
|
+
[pipes, group_fields]
|
548
478
|
end
|
549
479
|
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
#
|
554
|
-
# The named options are:
|
555
|
-
# * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
|
556
|
-
# option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
|
557
|
-
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
|
558
|
-
# same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
|
559
|
-
# expression-based. This is incompatible with the _pattern_ option.
|
560
|
-
# * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
|
561
|
-
# expression validation. Defaults to true.
|
562
|
-
# * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
|
563
|
-
# expression validation. Defaults to {}.
|
564
|
-
def filter(*args)
|
565
|
-
options = args.extract_options!
|
566
|
-
from = options.delete(:from) || all_fields
|
567
|
-
expression = options.delete(:expression) || args.shift
|
568
|
-
regex = options.delete(:pattern)
|
569
|
-
validate = options.has_key?(:validate) ? options.delete(:validate) : true
|
570
|
-
validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {}
|
571
|
-
|
572
|
-
if expression
|
573
|
-
stub = expr(expression, { :validate => validate, :validate_with => validate_with })
|
574
|
-
types, expression = stub.types, stub.expression
|
575
|
-
|
576
|
-
stub.validate_scope(scope)
|
577
|
-
each from, :filter => expression_filter(
|
578
|
-
:parameters => types,
|
579
|
-
:expression => expression
|
580
|
-
)
|
581
|
-
elsif regex
|
582
|
-
each from, :filter => regex_filter(regex, options)
|
583
|
-
end
|
584
|
-
end
|
480
|
+
def apply_aggregations(group, incoming_scopes, &block)
|
481
|
+
aggregations = Aggregations.new(self, group, incoming_scopes)
|
482
|
+
aggregations.instance_eval(&block) if block_given?
|
585
483
|
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
484
|
+
# Sorting of any type means that we cannot use the AggregateBy optimization
|
485
|
+
if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
|
486
|
+
grouping_fields = group.key_selectors.values.first
|
487
|
+
group.key_selectors.values.each do |key_fields|
|
488
|
+
raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
|
489
|
+
end
|
591
490
|
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
491
|
+
aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
|
492
|
+
name,
|
493
|
+
group.previous,
|
494
|
+
grouping_fields,
|
495
|
+
aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
|
496
|
+
), group.previous, incoming_scopes)
|
597
497
|
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
|
604
|
-
# same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
|
605
|
-
# expression-based.
|
606
|
-
# * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
|
607
|
-
# expression validation. Defaults to true.
|
608
|
-
# * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
|
609
|
-
# expression validation. Defaults to {}.
|
610
|
-
def reject(*args)
|
611
|
-
options = args.extract_options
|
612
|
-
raise "Regex not allowed" if options && options[:pattern]
|
613
|
-
|
614
|
-
filter(*args)
|
615
|
-
end
|
498
|
+
aggregate_by
|
499
|
+
else
|
500
|
+
aggregations.finalize if block_given?
|
501
|
+
@tail_pipe = aggregations.tail_pipe
|
502
|
+
@outgoing_scopes[name] = aggregations.scope
|
616
503
|
|
617
|
-
|
618
|
-
#
|
619
|
-
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
|
620
|
-
#
|
621
|
-
# The named options are:
|
622
|
-
# * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
|
623
|
-
# same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
|
624
|
-
# expression-based.
|
625
|
-
# * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
|
626
|
-
# expression validation. Defaults to true.
|
627
|
-
# * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
|
628
|
-
# expression validation. Defaults to {}.
|
629
|
-
def where(*args)
|
630
|
-
options = args.extract_options
|
631
|
-
raise "Regex not allowed" if options && options[:pattern]
|
632
|
-
|
633
|
-
if options[:expression]
|
634
|
-
_, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
635
|
-
options[:expression] = "#{imports}!(#{expr})"
|
636
|
-
elsif args[0]
|
637
|
-
_, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
638
|
-
args[0] = "#{imports}!(#{expr})"
|
504
|
+
group
|
639
505
|
end
|
640
|
-
|
641
|
-
filter(*args)
|
642
506
|
end
|
643
507
|
|
644
|
-
|
645
|
-
|
646
|
-
# The named options are:
|
647
|
-
# * <tt>:from</tt> a string or array of strings. Specifies the input fields.
|
648
|
-
# * <tt>:expression</tt> a string. The Janino expression.
|
649
|
-
# * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
|
650
|
-
# * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
|
651
|
-
def eval_expression(*args)
|
652
|
-
options = args.extract_options!
|
653
|
-
|
654
|
-
into = options.delete(:into)
|
655
|
-
from = options.delete(:from) || all_fields
|
656
|
-
output = options.delete(:output) || all_fields
|
657
|
-
options[:expression] ||= args.shift
|
658
|
-
options[:parameters] ||= args.shift
|
659
|
-
|
660
|
-
each from, :function => expression_function(into, options), :output=>output
|
661
|
-
end
|
508
|
+
def prepare_join(assembly_names, options, &block)
|
509
|
+
pipes, _ = populate_incoming_scopes(assembly_names)
|
662
510
|
|
663
|
-
|
664
|
-
|
665
|
-
# The method accepts an optional unnamed argument specifying the fields to base the distinct on
|
666
|
-
# (all fields, by default).
|
667
|
-
def distinct(*args)
|
668
|
-
raise "Distinct is badly broken"
|
669
|
-
fields = args[0] || all_fields
|
670
|
-
group_by *fields
|
671
|
-
pass
|
672
|
-
end
|
673
|
-
|
674
|
-
def join_fields(*args)
|
675
|
-
options = args.extract_options!
|
676
|
-
output = options[:output] || all_fields
|
511
|
+
group_fields_args = options[:on]
|
512
|
+
raise 'join requires :on parameter' unless group_fields_args
|
677
513
|
|
678
|
-
|
679
|
-
|
514
|
+
if group_fields_args.kind_of?(String)
|
515
|
+
group_fields_args = [group_fields_args]
|
516
|
+
end
|
680
517
|
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
# in the order provided.
|
692
|
-
# * <tt>:num_values</tt> an integer specifying the number of fields to
|
693
|
-
# ungroup into each output tuple (excluding the key fields). All input
|
694
|
-
# fields will be ungrouped.
|
695
|
-
# * <tt>:input</tt> an array of field names that specifies the fields to
|
696
|
-
# input to UnGroup. Defaults to all_fields.
|
697
|
-
# * <tt>:into</tt> an array of field names. Default set by UnGroup.
|
698
|
-
# * <tt>:output</tt> an array of field names that specifies the fields to
|
699
|
-
# produce as output of UnGroup. Defaults to all_fields.
|
700
|
-
def ungroup(*args)
|
701
|
-
options = args.extract_options!
|
702
|
-
input = options[:input] || all_fields
|
703
|
-
into = fields(options[:into])
|
704
|
-
output = options[:output] || all_fields
|
705
|
-
key = fields(options[:key])
|
706
|
-
|
707
|
-
raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
|
708
|
-
value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
|
709
|
-
num_values = options[:num_values] if options.has_key?(:num_values)
|
710
|
-
|
711
|
-
parameters = [into, key, value_selectors, num_values].compact
|
712
|
-
each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
|
713
|
-
end
|
518
|
+
group_fields = []
|
519
|
+
if group_fields_args.kind_of?(Array)
|
520
|
+
pipes.size.times do
|
521
|
+
group_fields << fields(group_fields_args)
|
522
|
+
end
|
523
|
+
elsif group_fields_args.kind_of?(Hash)
|
524
|
+
pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
|
525
|
+
else
|
526
|
+
raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
|
527
|
+
end
|
714
528
|
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
# * <tt>filter</tt> Cascading Filter to apply.
|
722
|
-
# * <tt>keep_value</tt> Java value to produce when the filter would keep
|
723
|
-
# the given input.
|
724
|
-
# * <tt>remove_value</tt> Java value to produce when the filter would
|
725
|
-
# remove the given input.
|
726
|
-
#
|
727
|
-
# The named options are:
|
728
|
-
# * <tt>:into</tt> an output field name, defaulting to 'filter_value'.
|
729
|
-
# * <tt>:output</tt> an array of field names that specifies the fields to
|
730
|
-
# retain in the output tuple. Defaults to all_fields.
|
731
|
-
def set_value(input, filter, keep_value, remove_value, params = {})
|
732
|
-
into = fields(params[:into] || 'filter_value')
|
733
|
-
output = params[:output] || all_fields
|
734
|
-
each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output
|
735
|
-
end
|
529
|
+
raise 'join requires non-empty :on parameter' if group_fields_args.empty?
|
530
|
+
group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
|
531
|
+
incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
|
532
|
+
declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
|
533
|
+
joiner = options[:joiner]
|
534
|
+
is_hash_join = options[:hash] || false
|
736
535
|
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
536
|
+
case joiner
|
537
|
+
when :inner, 'inner', nil
|
538
|
+
joiner = Java::CascadingPipeJoiner::InnerJoin.new
|
539
|
+
when :left, 'left'
|
540
|
+
joiner = Java::CascadingPipeJoiner::LeftJoin.new
|
541
|
+
when :right, 'right'
|
542
|
+
joiner = Java::CascadingPipeJoiner::RightJoin.new
|
543
|
+
when :outer, 'outer'
|
544
|
+
joiner = Java::CascadingPipeJoiner::OuterJoin.new
|
545
|
+
when Array
|
546
|
+
joiner = joiner.map do |t|
|
547
|
+
case t
|
548
|
+
when true, 1, :inner then true
|
549
|
+
when false, 0, :outer then false
|
550
|
+
else fail "invalid mixed joiner entry: #{t}"
|
551
|
+
end
|
552
|
+
end
|
553
|
+
joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
|
554
|
+
end
|
754
555
|
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
556
|
+
if is_hash_join
|
557
|
+
parameters = [
|
558
|
+
pipes.to_java(Java::CascadingPipe::Pipe),
|
559
|
+
group_fields,
|
560
|
+
declared_fields,
|
561
|
+
joiner
|
562
|
+
]
|
563
|
+
group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
|
564
|
+
else
|
565
|
+
result_group_fields = dedup_fields(*group_fields)
|
566
|
+
parameters = [
|
567
|
+
pipes.to_java(Java::CascadingPipe::Pipe),
|
568
|
+
group_fields,
|
569
|
+
declared_fields,
|
570
|
+
result_group_fields,
|
571
|
+
joiner
|
572
|
+
]
|
573
|
+
group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
|
574
|
+
end
|
575
|
+
apply_aggregations(group_assembly, @incoming_scopes, &block)
|
773
576
|
end
|
774
577
|
end
|
775
578
|
end
|