cascading.jruby 0.0.10 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +13 -160
- data/README.md +35 -0
- data/lib/cascading.rb +8 -41
- data/lib/cascading/aggregations.rb +216 -71
- data/lib/cascading/assembly.rb +409 -606
- data/lib/cascading/base.rb +22 -0
- data/lib/cascading/cascade.rb +55 -18
- data/lib/cascading/cascading.rb +137 -47
- data/lib/cascading/expr_stub.rb +31 -17
- data/lib/cascading/ext/array.rb +17 -0
- data/lib/cascading/filter_operations.rb +101 -0
- data/lib/cascading/flow.rb +87 -23
- data/lib/cascading/identity_operations.rb +82 -0
- data/lib/cascading/mode.rb +14 -10
- data/lib/cascading/operations.rb +109 -174
- data/lib/cascading/regex_operations.rb +133 -0
- data/lib/cascading/scope.rb +32 -9
- data/lib/cascading/sub_assembly.rb +8 -5
- data/lib/cascading/tap.rb +41 -17
- data/lib/cascading/text_operations.rb +67 -0
- data/test/mock_assemblies.rb +55 -0
- data/test/test_assembly.rb +23 -25
- data/test/test_local_execution.rb +7 -7
- data/test/test_operations.rb +0 -10
- metadata +76 -74
- data/History.txt +0 -58
data/lib/cascading/assembly.rb
CHANGED
@@ -1,15 +1,50 @@
|
|
1
1
|
require 'cascading/base'
|
2
2
|
require 'cascading/operations'
|
3
|
+
require 'cascading/identity_operations'
|
4
|
+
require 'cascading/filter_operations'
|
5
|
+
require 'cascading/regex_operations'
|
6
|
+
require 'cascading/text_operations'
|
3
7
|
require 'cascading/aggregations'
|
4
8
|
require 'cascading/sub_assembly'
|
5
9
|
require 'cascading/ext/array'
|
6
10
|
|
7
11
|
module Cascading
|
12
|
+
# An Assembly is a sequence of Cascading pipes (Each, GroupBy, CoGroup,
|
13
|
+
# Every, and SubAssembly). This class will serve as your primary mechanism
|
14
|
+
# for doing work within a flow and contains all the functions and filters you
|
15
|
+
# will apply to a pipe (Eaches), as well as group_by, union, and join. For
|
16
|
+
# aggregators and buffers, please see Aggregations.
|
17
|
+
#
|
18
|
+
# Function and filter DSL rules:
|
19
|
+
# * Use positional arguments for required parameters
|
20
|
+
# * Use options = {} for optional parameters
|
21
|
+
# * Use *args sparingly, specifically when you need to accept a varying length list of fields
|
22
|
+
# * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension
|
23
|
+
# * If you choose to name a required parameter, add it to options = {} and throw an exception if the caller does not provide it
|
24
|
+
# * If you have a required parameter that is provided by one of a set of option names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example)
|
25
|
+
#
|
26
|
+
# Function and filter DSL standard optional parameter names:
|
27
|
+
# [input] c.p.Each argument selector
|
28
|
+
# [into] c.o.Operation field declaration
|
29
|
+
# [output] c.p.Each output selector
|
30
|
+
#
|
31
|
+
# A note on aliases: when a DSL method uniquely wraps a single Cascading
|
32
|
+
# operation, we attempt to provide an alias that matches the Cascading
|
33
|
+
# operation. However, Cascading operations are often nouns rather than verbs,
|
34
|
+
# and the latter are preferable for a dataflow DSL.
|
8
35
|
class Assembly < Cascading::Node
|
9
|
-
include Operations
|
10
|
-
|
11
36
|
attr_reader :head_pipe, :tail_pipe
|
12
37
|
|
38
|
+
# Do not use this constructor directly; instead, use Flow#assembly or
|
39
|
+
# Assembly#branch to build assemblies.
|
40
|
+
#
|
41
|
+
# Builds an Assembly given a name, parent, and optional outgoing_scopes
|
42
|
+
# (necessary only for branching).
|
43
|
+
#
|
44
|
+
# An assembly's name is quite important as it will determine:
|
45
|
+
# * The sources from which it will read, if any
|
46
|
+
# * The name to be used in joins or unions downstream
|
47
|
+
# * The name to be used to sink the output of the assembly downstream
|
13
48
|
def initialize(name, parent, outgoing_scopes = {})
|
14
49
|
super(name, parent)
|
15
50
|
|
@@ -27,6 +62,11 @@ module Cascading
|
|
27
62
|
@incoming_scopes = [scope]
|
28
63
|
end
|
29
64
|
|
65
|
+
# Produces a textual description of this Assembly. The description details
|
66
|
+
# the structure of the Assembly, its input and output fields and any
|
67
|
+
# children (branches). The offset parameter allows for this describe to be
|
68
|
+
# nested within a calling context, which lets us indent the structural
|
69
|
+
# hierarchy of a job.
|
30
70
|
def describe(offset = '')
|
31
71
|
incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
|
32
72
|
incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1
|
@@ -35,199 +75,231 @@ module Cascading
|
|
35
75
|
description
|
36
76
|
end
|
37
77
|
|
78
|
+
# Rather than the immediate parent, this method returns the parent flow of
|
79
|
+
# this Assembly. If this is a branch, we must traverse the parents of
|
80
|
+
# parent assemblies.
|
38
81
|
def parent_flow
|
39
82
|
return parent if parent.kind_of?(Flow)
|
40
83
|
parent.parent_flow
|
41
84
|
end
|
42
85
|
|
86
|
+
# Accesses the outgoing scope of this Assembly at the point at which it is
|
87
|
+
# called. This is useful for grabbing the values_fields at any point in
|
88
|
+
# the construction of the Assembly. See Scope for details.
|
43
89
|
def scope
|
44
90
|
@outgoing_scopes[name]
|
45
91
|
end
|
46
92
|
|
93
|
+
# Prints information about the scope of this Assembly at the point at which
|
94
|
+
# it is called. This allows you to trace the propagation of field names
|
95
|
+
# through your job and is handy for debugging. See Scope for details.
|
47
96
|
def debug_scope
|
48
97
|
puts "Current scope for '#{name}':\n #{scope}\n----------\n"
|
49
98
|
end
|
50
99
|
|
51
|
-
|
52
|
-
|
53
|
-
@outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
|
54
|
-
|
55
|
-
tail_pipe
|
56
|
-
end
|
57
|
-
private :make_pipe
|
58
|
-
|
59
|
-
def populate_incoming_scopes(assembly_names, group_fields_args = {})
|
60
|
-
# NOTE: this overrides the existing incoming_scopes, which changes the
|
61
|
-
# way describe will function on this assembly
|
62
|
-
pipes, @incoming_scopes, group_fields = [], [], []
|
63
|
-
assembly_names.each do |assembly_name|
|
64
|
-
assembly = parent_flow.find_child(assembly_name)
|
65
|
-
raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
|
66
|
-
|
67
|
-
pipes << assembly.tail_pipe
|
68
|
-
@incoming_scopes << assembly.scope
|
69
|
-
group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
|
70
|
-
end
|
71
|
-
[pipes, group_fields]
|
72
|
-
end
|
73
|
-
private :populate_incoming_scopes
|
74
|
-
|
75
|
-
def apply_aggregations(group, incoming_scopes, &block)
|
76
|
-
aggregations = Aggregations.new(self, group, incoming_scopes)
|
77
|
-
aggregations.instance_eval(&block) if block_given?
|
78
|
-
|
79
|
-
# Sorting of any type means that we cannot use the AggregateBy optimization
|
80
|
-
if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
|
81
|
-
grouping_fields = group.key_selectors.values.first
|
82
|
-
group.key_selectors.values.each do |key_fields|
|
83
|
-
raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
|
84
|
-
end
|
85
|
-
|
86
|
-
aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
|
87
|
-
name,
|
88
|
-
group.previous,
|
89
|
-
grouping_fields,
|
90
|
-
aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
|
91
|
-
), group.previous, incoming_scopes)
|
92
|
-
|
93
|
-
aggregate_by
|
94
|
-
else
|
95
|
-
aggregations.finalize if block_given?
|
96
|
-
@tail_pipe = aggregations.tail_pipe
|
97
|
-
@outgoing_scopes[name] = aggregations.scope
|
98
|
-
|
99
|
-
group
|
100
|
-
end
|
101
|
-
end
|
102
|
-
private :apply_aggregations
|
103
|
-
|
100
|
+
# Prints detail about this Assembly including its name, head pipe, and tail
|
101
|
+
# pipe.
|
104
102
|
def to_s
|
105
103
|
"#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}"
|
106
104
|
end
|
107
105
|
|
108
|
-
def prepare_join(*args, &block)
|
109
|
-
options = args.extract_options!
|
110
|
-
|
111
|
-
pipes, _ = populate_incoming_scopes(args)
|
112
|
-
|
113
|
-
group_fields_args = options[:on]
|
114
|
-
raise 'join requires :on parameter' unless group_fields_args
|
115
|
-
|
116
|
-
if group_fields_args.kind_of?(String)
|
117
|
-
group_fields_args = [group_fields_args]
|
118
|
-
end
|
119
|
-
|
120
|
-
group_fields = []
|
121
|
-
if group_fields_args.kind_of?(Array)
|
122
|
-
pipes.size.times do
|
123
|
-
group_fields << fields(group_fields_args)
|
124
|
-
end
|
125
|
-
elsif group_fields_args.kind_of?(Hash)
|
126
|
-
pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
|
127
|
-
else
|
128
|
-
raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
|
129
|
-
end
|
130
|
-
|
131
|
-
raise 'join requires non-empty :on parameter' if group_fields_args.empty?
|
132
|
-
group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
|
133
|
-
incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
|
134
|
-
declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
|
135
|
-
joiner = options[:joiner]
|
136
|
-
is_hash_join = options[:hash] || false
|
137
|
-
|
138
|
-
case joiner
|
139
|
-
when :inner, 'inner', nil
|
140
|
-
joiner = Java::CascadingPipeJoiner::InnerJoin.new
|
141
|
-
when :left, 'left'
|
142
|
-
joiner = Java::CascadingPipeJoiner::LeftJoin.new
|
143
|
-
when :right, 'right'
|
144
|
-
joiner = Java::CascadingPipeJoiner::RightJoin.new
|
145
|
-
when :outer, 'outer'
|
146
|
-
joiner = Java::CascadingPipeJoiner::OuterJoin.new
|
147
|
-
when Array
|
148
|
-
joiner = joiner.map do |t|
|
149
|
-
case t
|
150
|
-
when true, 1, :inner then true
|
151
|
-
when false, 0, :outer then false
|
152
|
-
else fail "invalid mixed joiner entry: #{t}"
|
153
|
-
end
|
154
|
-
end
|
155
|
-
joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
|
156
|
-
end
|
157
|
-
|
158
|
-
if is_hash_join
|
159
|
-
raise ArgumentError, "hash joins don't support aggregations" if block_given?
|
160
|
-
parameters = [
|
161
|
-
pipes.to_java(Java::CascadingPipe::Pipe),
|
162
|
-
group_fields,
|
163
|
-
declared_fields,
|
164
|
-
joiner
|
165
|
-
]
|
166
|
-
group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
|
167
|
-
else
|
168
|
-
result_group_fields = dedup_fields(*group_fields)
|
169
|
-
parameters = [
|
170
|
-
pipes.to_java(Java::CascadingPipe::Pipe),
|
171
|
-
group_fields,
|
172
|
-
declared_fields,
|
173
|
-
result_group_fields,
|
174
|
-
joiner
|
175
|
-
]
|
176
|
-
group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
|
177
|
-
end
|
178
|
-
apply_aggregations(group_assembly, @incoming_scopes, &block)
|
179
|
-
end
|
180
|
-
private :prepare_join
|
181
|
-
|
182
106
|
# Builds a HashJoin pipe. This should be used carefully, as the right side
|
183
|
-
# of the join is accumulated entirely in memory. Requires a list of
|
184
|
-
# names to join and :on to specify the join_fields.
|
185
|
-
|
186
|
-
|
107
|
+
# of the join is accumulated entirely in memory. Requires a list of
|
108
|
+
# assembly names to join and :on to specify the join_fields. Note that a
|
109
|
+
# hash_join "takes over" the Assembly in which it is built, so it is
|
110
|
+
# typically the first statement within the block of the assembly or branch.
|
111
|
+
# Additionally, a hash join does not accept a block for aggregations like
|
112
|
+
# other joins; this restriction is enforced here, but comes directly from
|
113
|
+
# Cascading.
|
114
|
+
#
|
115
|
+
# The named options are:
|
116
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
117
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
118
|
+
# differ across inputs.
|
119
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
120
|
+
# names (see Cascading::dedup_fields). Specifies the
|
121
|
+
# names of the fields that will be available to
|
122
|
+
# aggregations or post-join if no aggregations are
|
123
|
+
# specified.
|
124
|
+
# [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
|
125
|
+
# and 'inner', :right and 'right' are accepted, as well as an
|
126
|
+
# array specifying mixed joins. Typically, this is not provided,
|
127
|
+
# but one of the higher level join methods on Assembly is used
|
128
|
+
# directly (like Assembly#inner_join or Assembly#right_join).
|
129
|
+
#
|
130
|
+
# Example:
|
131
|
+
# assembly 'join_left_right' do
|
132
|
+
# hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner
|
133
|
+
# end
|
134
|
+
def hash_join(*args_with_options)
|
135
|
+
raise ArgumentError, "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored" if block_given?
|
136
|
+
|
137
|
+
options, assembly_names = args_with_options.extract_options!, args_with_options
|
187
138
|
options[:hash] = true
|
188
|
-
|
189
|
-
prepare_join(*args, &block)
|
139
|
+
prepare_join(assembly_names, options)
|
190
140
|
end
|
191
141
|
|
192
142
|
# Builds a join (CoGroup) pipe. Requires a list of assembly names to join
|
193
|
-
# and :on to specify the group_fields.
|
194
|
-
|
195
|
-
|
143
|
+
# and :on to specify the group_fields. Note that a join "takes over" the
|
144
|
+
# Assembly in which it is built, so it is typically the first statement
|
145
|
+
# within the block of the assembly or branch. The block passed to this
|
146
|
+
# method will be evaluated in the context of Aggregations, not Assembly.
|
147
|
+
#
|
148
|
+
# The named options are:
|
149
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
150
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
151
|
+
# differ across inputs.
|
152
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
153
|
+
# names (see Cascading::dedup_fields). Specifies the
|
154
|
+
# names of the fields that will be available to
|
155
|
+
# aggregations or post-join if no aggregations are
|
156
|
+
# specified.
|
157
|
+
# [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
|
158
|
+
# and 'inner', :right and 'right' are accepted, as well as an
|
159
|
+
# array specifying mixed joins. Typically, this is not provided,
|
160
|
+
# but one of the higher level join methods on Assembly is used
|
161
|
+
# directly (like Assembly#inner_join or Assembly#right_join).
|
162
|
+
#
|
163
|
+
# Example:
|
164
|
+
# assembly 'join_left_right' do
|
165
|
+
# join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do
|
166
|
+
# sum 'val1', 'val2', :type => :long
|
167
|
+
# end
|
168
|
+
# end
|
169
|
+
def join(*args_with_options, &block)
|
170
|
+
options, assembly_names = args_with_options.extract_options!, args_with_options
|
196
171
|
options[:hash] = false
|
197
|
-
|
198
|
-
prepare_join(*args, &block)
|
172
|
+
prepare_join(assembly_names, options, &block)
|
199
173
|
end
|
200
174
|
alias co_group join
|
201
175
|
|
202
|
-
|
203
|
-
|
176
|
+
# Builds an inner join (CoGroup) pipe. Requires a list of assembly names to
|
177
|
+
# join and :on to specify the group_fields.
|
178
|
+
#
|
179
|
+
# The named options are:
|
180
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
181
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
182
|
+
# differ across inputs.
|
183
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
184
|
+
# names (see Cascading::dedup_fields). Specifies the
|
185
|
+
# names of the fields that will be available to
|
186
|
+
# aggregations or post-join if no aggregations are
|
187
|
+
# specified.
|
188
|
+
#
|
189
|
+
# Example:
|
190
|
+
# assembly 'join_left_right' do
|
191
|
+
# inner_join 'left', 'right', :on => ['key1', 'key2'] do
|
192
|
+
# sum 'val1', 'val2', :type => :long
|
193
|
+
# end
|
194
|
+
# end
|
195
|
+
def inner_join(*args_with_options, &block)
|
196
|
+
options = args_with_options.extract_options!
|
204
197
|
options[:joiner] = :inner
|
205
|
-
|
206
|
-
join(*
|
198
|
+
args_with_options << options
|
199
|
+
join(*args_with_options, &block)
|
207
200
|
end
|
208
201
|
|
209
|
-
|
210
|
-
|
202
|
+
# Builds a left join (CoGroup) pipe. Requires a list of assembly names to
|
203
|
+
# join and :on to specify the group_fields.
|
204
|
+
#
|
205
|
+
# The named options are:
|
206
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
207
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
208
|
+
# differ across inputs.
|
209
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
210
|
+
# names (see Cascading::dedup_fields). Specifies the
|
211
|
+
# names of the fields that will be available to
|
212
|
+
# aggregations or post-join if no aggregations are
|
213
|
+
# specified.
|
214
|
+
#
|
215
|
+
# Example:
|
216
|
+
# assembly 'join_left_right' do
|
217
|
+
# left_join 'left', 'right', :on => ['key1', 'key2'] do
|
218
|
+
# sum 'val1', 'val2', :type => :long
|
219
|
+
# end
|
220
|
+
# end
|
221
|
+
def left_join(*args_with_options, &block)
|
222
|
+
options = args_with_options.extract_options!
|
211
223
|
options[:joiner] = :left
|
212
|
-
|
213
|
-
join(*
|
224
|
+
args_with_options << options
|
225
|
+
join(*args_with_options, &block)
|
214
226
|
end
|
215
227
|
|
216
|
-
|
217
|
-
|
228
|
+
# Builds a right join (CoGroup) pipe. Requires a list of assembly names to
|
229
|
+
# join and :on to specify the group_fields.
|
230
|
+
#
|
231
|
+
# The named options are:
|
232
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
233
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
234
|
+
# differ across inputs.
|
235
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
236
|
+
# names (see Cascading::dedup_fields). Specifies the
|
237
|
+
# names of the fields that will be available to
|
238
|
+
# aggregations or post-join if no aggregations are
|
239
|
+
# specified.
|
240
|
+
#
|
241
|
+
# Example:
|
242
|
+
# assembly 'join_left_right' do
|
243
|
+
# right_join 'left', 'right', :on => ['key1', 'key2'] do
|
244
|
+
# sum 'val1', 'val2', :type => :long
|
245
|
+
# end
|
246
|
+
# end
|
247
|
+
def right_join(*args_with_options, &block)
|
248
|
+
options = args_with_options.extract_options!
|
218
249
|
options[:joiner] = :right
|
219
|
-
|
220
|
-
join(*
|
250
|
+
args_with_options << options
|
251
|
+
join(*args_with_options, &block)
|
221
252
|
end
|
222
253
|
|
223
|
-
|
224
|
-
|
254
|
+
# Builds an outer join (CoGroup) pipe. Requires a list of assembly names to
|
255
|
+
# join and :on to specify the group_fields.
|
256
|
+
#
|
257
|
+
# The named options are:
|
258
|
+
# [on] The keys of the join, an array of strings if they are the same in
|
259
|
+
# all inputs, or a hash mapping assembly names to key names if they
|
260
|
+
# differ across inputs.
|
261
|
+
# [declared_fields] By default, a deduplicated array of incoming field
|
262
|
+
# names (see Cascading::dedup_fields). Specifies the
|
263
|
+
# names of the fields that will be available to
|
264
|
+
# aggregations or post-join if no aggregations are
|
265
|
+
# specified.
|
266
|
+
#
|
267
|
+
# Example:
|
268
|
+
# assembly 'join_left_right' do
|
269
|
+
# outer_join 'left', 'right', :on => ['key1', 'key2'] do
|
270
|
+
# sum 'val1', 'val2', :type => :long
|
271
|
+
# end
|
272
|
+
# end
|
273
|
+
def outer_join(*args_with_options, &block)
|
274
|
+
options = args_with_options.extract_options!
|
225
275
|
options[:joiner] = :outer
|
226
|
-
|
227
|
-
join(*
|
276
|
+
args_with_options << options
|
277
|
+
join(*args_with_options, &block)
|
228
278
|
end
|
229
279
|
|
230
|
-
# Builds a
|
280
|
+
# Builds a child Assembly that branches this Assembly given a name and
|
281
|
+
# block.
|
282
|
+
#
|
283
|
+
# An assembly's name is quite important as it will determine:
|
284
|
+
# * The sources from which it will read, if any
|
285
|
+
# * The name to be used in joins or unions downstream
|
286
|
+
# * The name to be used to sink the output of the assembly downstream
|
287
|
+
#
|
288
|
+
# Many branches may be built within an assembly. The result of a branch is
|
289
|
+
# the same as the Flow#assembly constructor, an Assembly object.
|
290
|
+
#
|
291
|
+
# Example:
|
292
|
+
# assembly 'some_work' do
|
293
|
+
# ...
|
294
|
+
#
|
295
|
+
# branch 'more_work' do
|
296
|
+
# ...
|
297
|
+
# end
|
298
|
+
#
|
299
|
+
# branch 'yet_more_work' do
|
300
|
+
# ...
|
301
|
+
# end
|
302
|
+
# end
|
231
303
|
def branch(name, &block)
|
232
304
|
raise "Could not build branch '#{name}'; block required" unless block_given?
|
233
305
|
assembly = Assembly.new(name, self, @outgoing_scopes)
|
@@ -236,11 +308,27 @@ module Cascading
|
|
236
308
|
assembly
|
237
309
|
end
|
238
310
|
|
239
|
-
# Builds a new GroupBy pipe that groups on the fields given in
|
240
|
-
#
|
241
|
-
|
242
|
-
|
243
|
-
|
311
|
+
# Builds a new GroupBy pipe that groups on the fields given in
|
312
|
+
# args_with_options. The block passed to this method will be evaluated in
|
313
|
+
# the context of Aggregations, not Assembly.
|
314
|
+
#
|
315
|
+
# The named options are:
|
316
|
+
# [sort_by] Optional keys for within-group sort.
|
317
|
+
# [reverse] Boolean that can reverse the order of within-group sorting
|
318
|
+
# (only makes sense given :sort_by keys).
|
319
|
+
#
|
320
|
+
# Example:
|
321
|
+
# assembly 'total' do
|
322
|
+
# ...
|
323
|
+
# insert 'const' => 1
|
324
|
+
# group_by 'const' do
|
325
|
+
# count
|
326
|
+
# sum 'val1', 'val2', :type => :long
|
327
|
+
# end
|
328
|
+
# discard 'const'
|
329
|
+
# end
|
330
|
+
def group_by(*args_with_options, &block)
|
331
|
+
options, group_fields = args_with_options.extract_options!, fields(args_with_options)
|
244
332
|
sort_fields = fields(options[:sort_by])
|
245
333
|
reverse = options[:reverse]
|
246
334
|
|
@@ -251,16 +339,31 @@ module Cascading
|
|
251
339
|
# Unifies multiple incoming pipes sharing the same field structure using a
|
252
340
|
# GroupBy. Accepts :on like join and :sort_by and :reverse like group_by,
|
253
341
|
# as well as a block which may be used for a sequence of Every
|
254
|
-
# aggregations.
|
342
|
+
# aggregations. The block passed to this method will be evaluated in the
|
343
|
+
# context of Aggregations, not Assembly.
|
255
344
|
#
|
256
345
|
# By default, groups only on the first field (see line 189 of GroupBy.java)
|
257
|
-
|
258
|
-
|
346
|
+
#
|
347
|
+
# The named options are:
|
348
|
+
# [on] The keys of the union, which defaults to the first field in the
|
349
|
+
# first input assembly.
|
350
|
+
# [sort_by] Optional keys for sorting.
|
351
|
+
# [reverse] Boolean that can reverse the order of sorting
|
352
|
+
# (only makes sense given :sort_by keys).
|
353
|
+
#
|
354
|
+
# Example:
|
355
|
+
# assembly 'union_left_right' do
|
356
|
+
# union 'left', 'right' do
|
357
|
+
# sum 'val1', 'val2', :type => :long
|
358
|
+
# end
|
359
|
+
# end
|
360
|
+
def union(*args_with_options, &block)
|
361
|
+
options, assembly_names = args_with_options.extract_options!, args_with_options
|
259
362
|
group_fields = fields(options[:on])
|
260
363
|
sort_fields = fields(options[:sort_by])
|
261
364
|
reverse = options[:reverse]
|
262
365
|
|
263
|
-
pipes, _ = populate_incoming_scopes(
|
366
|
+
pipes, _ = populate_incoming_scopes(assembly_names)
|
264
367
|
|
265
368
|
# Must provide group_fields to ensure field name propagation
|
266
369
|
group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields
|
@@ -273,10 +376,15 @@ module Cascading
|
|
273
376
|
end
|
274
377
|
alias :union_pipes :union
|
275
378
|
|
276
|
-
# Allows you to plugin c.p.SubAssemblies to
|
277
|
-
#
|
278
|
-
#
|
279
|
-
#
|
379
|
+
# Allows you to plug c.p.SubAssemblies into an Assembly under certain
|
380
|
+
# assumptions. Note the default is to extend the tail pipe of this
|
381
|
+
# Assembly using a linear SubAssembly. See SubAssembly class for details.
|
382
|
+
#
|
383
|
+
# Example:
|
384
|
+
# assembly 'id_rows' do
|
385
|
+
# ...
|
386
|
+
# sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id'))
|
387
|
+
# end
|
280
388
|
def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
|
281
389
|
sub_assembly = SubAssembly.new(self, sub_assembly)
|
282
390
|
sub_assembly.finalize(pipes, incoming_scopes)
|
@@ -287,17 +395,24 @@ module Cascading
|
|
287
395
|
sub_assembly
|
288
396
|
end
|
289
397
|
|
290
|
-
# Builds a basic
|
291
|
-
#
|
398
|
+
# Builds a basic each pipe and adds it to the current Assembly. Default
|
399
|
+
# arguments are all_fields, a default inherited from c.p.Each. Exactly one
|
400
|
+
# of :function and :filter must be specified and filters do not support an
|
401
|
+
# :output selector.
|
402
|
+
#
|
403
|
+
# The named options are:
|
404
|
+
# [filter] A Cascading Filter, mutually exclusive with :function.
|
405
|
+
# [function] A Cascading Function, mutually exclusive with :filter.
|
406
|
+
# [output] c.p.Each output selector, only valid with :function.
|
407
|
+
#
|
292
408
|
# Example:
|
293
|
-
#
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
out_fields = fields(options[:output])
|
299
|
-
|
409
|
+
# each fields(input_fields), :function => Java::CascadingOperation::Identity.new
|
410
|
+
# each 'field1', 'field2', :function => Java::CascadingOperation::Identity.new
|
411
|
+
def each(*args_with_options)
|
412
|
+
options, in_fields = args_with_options.extract_options!, fields(args_with_options)
|
413
|
+
out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each
|
300
414
|
operation = options[:filter] || options[:function]
|
415
|
+
raise 'each requires either :filter or :function' unless operation
|
301
416
|
raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output]
|
302
417
|
|
303
418
|
parameters = [tail_pipe, in_fields, operation, out_fields].compact
|
@@ -308,468 +423,156 @@ module Cascading
|
|
308
423
|
each
|
309
424
|
end
|
310
425
|
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
each fields(args), :function => Java::CascadingOperation::Identity.new
|
317
|
-
end
|
318
|
-
|
319
|
-
# Removes the specified fields from the current assembly.
|
320
|
-
# --
|
321
|
-
# Example:
|
322
|
-
# discard "field1", "field2"
|
323
|
-
def discard(*args)
|
324
|
-
discard_fields = fields(args)
|
325
|
-
keep_fields = difference_fields(scope.values_fields, discard_fields)
|
326
|
-
project(*keep_fields.to_a)
|
327
|
-
end
|
328
|
-
|
329
|
-
# Renames fields according to the mapping provided.
|
330
|
-
# --
|
331
|
-
# Example:
|
332
|
-
# rename "old_name" => "new_name"
|
333
|
-
def rename(name_map)
|
334
|
-
old_names = scope.values_fields.to_a
|
335
|
-
new_names = old_names.map{ |name| name_map[name] || name }
|
336
|
-
invalid = name_map.keys.sort - old_names
|
337
|
-
raise "invalid names: #{invalid.inspect}" unless invalid.empty?
|
338
|
-
|
339
|
-
each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
|
340
|
-
end
|
341
|
-
|
342
|
-
def cast(type_map)
|
343
|
-
names = type_map.keys.sort
|
344
|
-
types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names))
|
345
|
-
fields = fields(names)
|
346
|
-
types = types.to_java(java.lang.Class)
|
347
|
-
each fields, :function => Java::CascadingOperation::Identity.new(fields, types)
|
348
|
-
end
|
349
|
-
|
350
|
-
def copy(*args)
|
351
|
-
options = args.extract_options!
|
352
|
-
from = args[0] || all_fields
|
353
|
-
into = args[1] || options[:into] || all_fields
|
354
|
-
each fields(from), :function => Java::CascadingOperation::Identity.new(fields(into)), :output => all_fields
|
355
|
-
end
|
356
|
-
|
357
|
-
# A pipe that does nothing.
|
358
|
-
def pass(*args)
|
359
|
-
each all_fields, :function => Java::CascadingOperation::Identity.new
|
360
|
-
end
|
426
|
+
include Operations
|
427
|
+
include IdentityOperations
|
428
|
+
include FilterOperations
|
429
|
+
include RegexOperations
|
430
|
+
include TextOperations
|
361
431
|
|
362
|
-
|
363
|
-
|
364
|
-
|
432
|
+
# Builds an each assertion pipe given a c.o.a.Assertion and adds it to the
|
433
|
+
# current Assembly.
|
434
|
+
#
|
435
|
+
# The named options are:
|
436
|
+
# [level] The assertion level; defaults to strict.
|
437
|
+
def assert(assertion, options = {})
|
365
438
|
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
|
366
439
|
|
367
440
|
parameters = [tail_pipe, assertion_level, assertion]
|
368
441
|
make_pipe(Java::CascadingPipe::Each, parameters)
|
369
442
|
end
|
370
443
|
|
371
|
-
# Builds a
|
372
|
-
|
373
|
-
|
374
|
-
# output.
|
375
|
-
#
|
376
|
-
# The other named options are:
|
377
|
-
# * <tt>:print_fields</tt> a boolean. If is set to true, then it prints every 10 tuples.
|
378
|
-
#
|
379
|
-
def debug(*args)
|
380
|
-
options = args.extract_options!
|
381
|
-
print_fields = options[:print_fields] || true
|
382
|
-
parameters = [print_fields].compact
|
383
|
-
debug = Java::CascadingOperation::Debug.new(*parameters)
|
384
|
-
debug.print_tuple_every = options[:tuple_interval] || 1
|
385
|
-
debug.print_fields_every = options[:fields_interval] || 10
|
386
|
-
each(all_fields, :filter => debug)
|
387
|
-
end
|
388
|
-
|
389
|
-
# Builds a pipe that assert the size of the tuple is the size specified in parameter.
|
390
|
-
#
|
391
|
-
# The method accept an unique uname argument : a number indicating the size expected.
|
392
|
-
def assert_size_equals(*args)
|
393
|
-
options = args.extract_options!
|
394
|
-
assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0])
|
444
|
+
# Builds a pipe that asserts the size of the tuple is the specified size.
|
445
|
+
def assert_size_equals(size, options = {})
|
446
|
+
assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size)
|
395
447
|
assert(assertion, options)
|
396
448
|
end
|
397
449
|
|
398
|
-
#
|
399
|
-
def assert_not_null(
|
400
|
-
options = args.extract_options!
|
450
|
+
# Builds a pipe that asserts none of the fields in the tuple are null.
|
451
|
+
def assert_not_null(options = {})
|
401
452
|
assertion = Java::CascadingOperationAssertion::AssertNotNull.new
|
402
453
|
assert(assertion, options)
|
403
454
|
end
|
404
455
|
|
405
|
-
|
406
|
-
# using a specified regex pattern.
|
407
|
-
#
|
408
|
-
# If provided, the unamed arguments must be the fields to be parsed. If not provided, then all incoming
|
409
|
-
# fields are used.
|
410
|
-
#
|
411
|
-
# The named options are:
|
412
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
|
413
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
414
|
-
def parse(*args)
|
415
|
-
options = args.extract_options!
|
416
|
-
fields = args || all_fields
|
417
|
-
pattern = options[:pattern]
|
418
|
-
output = options[:output] || all_fields
|
419
|
-
each(fields, :function => regex_parser(pattern, options), :output => output)
|
420
|
-
end
|
456
|
+
private
|
421
457
|
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
# The second unnamed argument is an array of strings indicating the fields receiving the result of the split.
|
426
|
-
#
|
427
|
-
# The named options are:
|
428
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
|
429
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
430
|
-
def split(*args)
|
431
|
-
options = args.extract_options!
|
432
|
-
fields = options[:into] || args[1]
|
433
|
-
pattern = options[:pattern] || /[.,]*\s+/
|
434
|
-
output = options[:output] || all_fields
|
435
|
-
each(args[0], :function => regex_splitter(fields, :pattern => pattern), :output=>output)
|
436
|
-
end
|
437
|
-
|
438
|
-
# Builds a pipe that splits a field into new rows, using a specified regular expression.
|
439
|
-
#
|
440
|
-
# The first unnamed argument is the field to be split.
|
441
|
-
# The second unnamed argument is the field receiving the result of the split.
|
442
|
-
#
|
443
|
-
# The named options are:
|
444
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
|
445
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
446
|
-
def split_rows(*args)
|
447
|
-
options = args.extract_options!
|
448
|
-
fields = options[:into] || args[1]
|
449
|
-
pattern = options[:pattern] || /[.,]*\s+/
|
450
|
-
output = options[:output] || all_fields
|
451
|
-
each(args[0], :function => regex_split_generator(fields, :pattern => pattern), :output=>output)
|
452
|
-
end
|
453
|
-
|
454
|
-
# Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
|
455
|
-
#
|
456
|
-
# The first unnamed argument is the field to be matched against.
|
457
|
-
# The second unnamed argument is the field receiving the result of the match.
|
458
|
-
#
|
459
|
-
# The named options are:
|
460
|
-
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for matching the argument fields.
|
461
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
462
|
-
def match_rows(*args)
|
463
|
-
options = args.extract_options!
|
464
|
-
fields = options[:into] || args[1]
|
465
|
-
pattern = options[:pattern] || /[\w]+/
|
466
|
-
output = options[:output] || all_fields
|
467
|
-
each(args[0], :function => regex_generator(fields, :pattern => pattern), :output=>output)
|
468
|
-
end
|
469
|
-
|
470
|
-
# Builds a pipe that parses the specified field as a date using the provided format string.
|
471
|
-
# The unamed argument specifies the field to format.
|
472
|
-
#
|
473
|
-
# The named options are:
|
474
|
-
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
|
475
|
-
# the input argument.
|
476
|
-
# * <tt>:pattern</tt> a string. Specifies the date format.
|
477
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
478
|
-
def parse_date(*args)
|
479
|
-
options = args.extract_options!
|
480
|
-
field = options[:into] || "#{args[0]}_parsed"
|
481
|
-
output = options[:output] || all_fields
|
482
|
-
pattern = options[:pattern] || "yyyy/MM/dd"
|
483
|
-
|
484
|
-
each args[0], :function => date_parser(field, pattern), :output => output
|
485
|
-
end
|
458
|
+
def make_pipe(type, parameters)
|
459
|
+
@tail_pipe = type.new(*parameters)
|
460
|
+
@outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
|
486
461
|
|
487
|
-
|
488
|
-
#
|
489
|
-
# The unamed argument specifies the field to format.
|
490
|
-
#
|
491
|
-
# The named options are:
|
492
|
-
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
|
493
|
-
# the input argument.
|
494
|
-
# * <tt>:pattern</tt> a string. Specifies the date format.
|
495
|
-
# * <tt>:timezone</tt> a string. Specifies the timezone (defaults to UTC).
|
496
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
497
|
-
def format_date(*args)
|
498
|
-
options = args.extract_options!
|
499
|
-
field = options[:into] || "#{args[0]}_formatted"
|
500
|
-
pattern = options[:pattern] || "yyyy/MM/dd"
|
501
|
-
output = options[:output] || all_fields
|
502
|
-
|
503
|
-
each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output
|
462
|
+
tail_pipe
|
504
463
|
end
|
505
464
|
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
# * <tt>:replacement</tt> a string. Specifies the replacement.
|
514
|
-
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
|
515
|
-
def replace(*args)
|
516
|
-
options = args.extract_options!
|
517
|
-
|
518
|
-
pattern = options[:pattern] || args[1]
|
519
|
-
replacement = options[:replacement] || args[2]
|
520
|
-
into = options[:into] || "#{args[0]}_replaced"
|
521
|
-
output = options[:output] || all_fields
|
522
|
-
|
523
|
-
each args[0], :function => regex_replace(into, pattern, replacement), :output => output
|
524
|
-
end
|
465
|
+
def populate_incoming_scopes(assembly_names, group_fields_args = {})
|
466
|
+
# NOTE: this overrides the existing incoming_scopes, which changes the
|
467
|
+
# way describe will function on this assembly
|
468
|
+
pipes, @incoming_scopes, group_fields = [], [], []
|
469
|
+
assembly_names.each do |assembly_name|
|
470
|
+
assembly = parent_flow.find_child(assembly_name)
|
471
|
+
raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
|
525
472
|
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
# and as values, the values they must contain. For example:
|
530
|
-
#
|
531
|
-
# insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
|
532
|
-
#
|
533
|
-
# will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing
|
534
|
-
# the formatted current date.
|
535
|
-
# The method outputs all fields.
|
536
|
-
# The named options are:
|
537
|
-
def insert(args)
|
538
|
-
args.keys.sort.each do |field_name|
|
539
|
-
value = args[field_name]
|
540
|
-
|
541
|
-
if value.kind_of?(ExprStub)
|
542
|
-
value.validate_scope(scope)
|
543
|
-
each all_fields, :function => expression_function(field_name, :expression => value.expression, :parameters => value.types), :output => all_fields
|
544
|
-
else
|
545
|
-
each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields
|
546
|
-
end
|
473
|
+
pipes << assembly.tail_pipe
|
474
|
+
@incoming_scopes << assembly.scope
|
475
|
+
group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
|
547
476
|
end
|
477
|
+
[pipes, group_fields]
|
548
478
|
end
|
549
479
|
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
#
|
554
|
-
# The named options are:
|
555
|
-
# * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
|
556
|
-
# option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
|
557
|
-
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
|
558
|
-
# same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino
|
559
|
-
# expression-based. This is incompatible with the _pattern_ option.
|
560
|
-
# * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
|
561
|
-
# expression validation. Defaults to true.
|
562
|
-
# * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
|
563
|
-
# expression validation. Defaults to {}.
|
564
|
-
def filter(*args)
|
565
|
-
options = args.extract_options!
|
566
|
-
from = options.delete(:from) || all_fields
|
567
|
-
expression = options.delete(:expression) || args.shift
|
568
|
-
regex = options.delete(:pattern)
|
569
|
-
validate = options.has_key?(:validate) ? options.delete(:validate) : true
|
570
|
-
validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {}
|
571
|
-
|
572
|
-
if expression
|
573
|
-
stub = expr(expression, { :validate => validate, :validate_with => validate_with })
|
574
|
-
types, expression = stub.types, stub.expression
|
575
|
-
|
576
|
-
stub.validate_scope(scope)
|
577
|
-
each from, :filter => expression_filter(
|
578
|
-
:parameters => types,
|
579
|
-
:expression => expression
|
580
|
-
)
|
581
|
-
elsif regex
|
582
|
-
each from, :filter => regex_filter(regex, options)
|
583
|
-
end
|
584
|
-
end
|
480
|
+
def apply_aggregations(group, incoming_scopes, &block)
|
481
|
+
aggregations = Aggregations.new(self, group, incoming_scopes)
|
482
|
+
aggregations.instance_eval(&block) if block_given?
|
585
483
|
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
484
|
+
# Sorting of any type means that we cannot use the AggregateBy optimization
|
485
|
+
if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
|
486
|
+
grouping_fields = group.key_selectors.values.first
|
487
|
+
group.key_selectors.values.each do |key_fields|
|
488
|
+
raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
|
489
|
+
end
|
591
490
|
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
491
|
+
aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
|
492
|
+
name,
|
493
|
+
group.previous,
|
494
|
+
grouping_fields,
|
495
|
+
aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
|
496
|
+
), group.previous, incoming_scopes)
|
597
497
|
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
|
604
|
-
# same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino
|
605
|
-
# expression-based.
|
606
|
-
# * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
|
607
|
-
# expression validation. Defaults to true.
|
608
|
-
# * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
|
609
|
-
# expression validation. Defaults to {}.
|
610
|
-
def reject(*args)
|
611
|
-
options = args.extract_options
|
612
|
-
raise "Regex not allowed" if options && options[:pattern]
|
613
|
-
|
614
|
-
filter(*args)
|
615
|
-
end
|
498
|
+
aggregate_by
|
499
|
+
else
|
500
|
+
aggregations.finalize if block_given?
|
501
|
+
@tail_pipe = aggregations.tail_pipe
|
502
|
+
@outgoing_scopes[name] = aggregations.scope
|
616
503
|
|
617
|
-
|
618
|
-
#
|
619
|
-
# The first unamed argument, if provided, is a filtering expression (using the Janino syntax).
|
620
|
-
#
|
621
|
-
# The named options are:
|
622
|
-
# * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
|
623
|
-
# same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino
|
624
|
-
# expression-based.
|
625
|
-
# * <tt>:validate</tt> a boolean. Passed into Cascading#expr to enable or disable
|
626
|
-
# expression validation. Defaults to true.
|
627
|
-
# * <tt>:validate_with</tt> a hash. Actual arguments used by Cascading#expr for
|
628
|
-
# expression validation. Defaults to {}.
|
629
|
-
def where(*args)
|
630
|
-
options = args.extract_options
|
631
|
-
raise "Regex not allowed" if options && options[:pattern]
|
632
|
-
|
633
|
-
if options[:expression]
|
634
|
-
_, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
635
|
-
options[:expression] = "#{imports}!(#{expr})"
|
636
|
-
elsif args[0]
|
637
|
-
_, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
638
|
-
args[0] = "#{imports}!(#{expr})"
|
504
|
+
group
|
639
505
|
end
|
640
|
-
|
641
|
-
filter(*args)
|
642
506
|
end
|
643
507
|
|
644
|
-
|
645
|
-
|
646
|
-
# The named options are:
|
647
|
-
# * <tt>:from</tt> a string or array of strings. Specifies the input fields.
|
648
|
-
# * <tt>:express</tt> a string. The janino expression.
|
649
|
-
# * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
|
650
|
-
# * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
|
651
|
-
def eval_expression(*args)
|
652
|
-
options = args.extract_options!
|
653
|
-
|
654
|
-
into = options.delete(:into)
|
655
|
-
from = options.delete(:from) || all_fields
|
656
|
-
output = options.delete(:output) || all_fields
|
657
|
-
options[:expression] ||= args.shift
|
658
|
-
options[:parameters] ||= args.shift
|
659
|
-
|
660
|
-
each from, :function => expression_function(into, options), :output=>output
|
661
|
-
end
|
508
|
+
def prepare_join(assembly_names, options, &block)
|
509
|
+
pipes, _ = populate_incoming_scopes(assembly_names)
|
662
510
|
|
663
|
-
|
664
|
-
|
665
|
-
# The method accepts optional unamed argument specifying the fields to base the distinct on
|
666
|
-
# (all fields, by default).
|
667
|
-
def distinct(*args)
|
668
|
-
raise "Distinct is badly broken"
|
669
|
-
fields = args[0] || all_fields
|
670
|
-
group_by *fields
|
671
|
-
pass
|
672
|
-
end
|
673
|
-
|
674
|
-
def join_fields(*args)
|
675
|
-
options = args.extract_options!
|
676
|
-
output = options[:output] || all_fields
|
511
|
+
group_fields_args = options[:on]
|
512
|
+
raise 'join requires :on parameter' unless group_fields_args
|
677
513
|
|
678
|
-
|
679
|
-
|
514
|
+
if group_fields_args.kind_of?(String)
|
515
|
+
group_fields_args = [group_fields_args]
|
516
|
+
end
|
680
517
|
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
# in the order provided.
|
692
|
-
# * <tt>:num_values</tt> an integer specifying the number of fields to
|
693
|
-
# ungroup into each output tuple (excluding the key fields). All input
|
694
|
-
# fields will be ungrouped.
|
695
|
-
# * <tt>:input</tt> an array of field names that specifies the fields to
|
696
|
-
# input to UnGroup. Defaults to all_fields.
|
697
|
-
# * <tt>:into</tt> an array of field names. Default set by UnGroup.
|
698
|
-
# * <tt>:output</tt> an array of field names that specifies the fields to
|
699
|
-
# produce as output of UnGroup. Defaults to all_fields.
|
700
|
-
def ungroup(*args)
|
701
|
-
options = args.extract_options!
|
702
|
-
input = options[:input] || all_fields
|
703
|
-
into = fields(options[:into])
|
704
|
-
output = options[:output] || all_fields
|
705
|
-
key = fields(options[:key])
|
706
|
-
|
707
|
-
raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
|
708
|
-
value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
|
709
|
-
num_values = options[:num_values] if options.has_key?(:num_values)
|
710
|
-
|
711
|
-
parameters = [into, key, value_selectors, num_values].compact
|
712
|
-
each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
|
713
|
-
end
|
518
|
+
group_fields = []
|
519
|
+
if group_fields_args.kind_of?(Array)
|
520
|
+
pipes.size.times do
|
521
|
+
group_fields << fields(group_fields_args)
|
522
|
+
end
|
523
|
+
elsif group_fields_args.kind_of?(Hash)
|
524
|
+
pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
|
525
|
+
else
|
526
|
+
raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
|
527
|
+
end
|
714
528
|
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
# * <tt>filter</tt> Cascading Filter to apply.
|
722
|
-
# * <tt>keep_value</tt> Java value to produce when the filter would keep
|
723
|
-
# the given input.
|
724
|
-
# * <tt>remove_value</tt> Java value to produce when the filter would
|
725
|
-
# remove the given input.
|
726
|
-
#
|
727
|
-
# The named options are:
|
728
|
-
# * <tt>:into</tt> an output field name, defaulting to 'filter_value'.
|
729
|
-
# * <tt>:output</tt> an array of field names that specifies the fields to
|
730
|
-
# retain in the output tuple. Defaults to all_fields.
|
731
|
-
def set_value(input, filter, keep_value, remove_value, params = {})
|
732
|
-
into = fields(params[:into] || 'filter_value')
|
733
|
-
output = params[:output] || all_fields
|
734
|
-
each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output
|
735
|
-
end
|
529
|
+
raise 'join requires non-empty :on parameter' if group_fields_args.empty?
|
530
|
+
group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
|
531
|
+
incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
|
532
|
+
declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
|
533
|
+
joiner = options[:joiner]
|
534
|
+
is_hash_join = options[:hash] || false
|
736
535
|
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
536
|
+
case joiner
|
537
|
+
when :inner, 'inner', nil
|
538
|
+
joiner = Java::CascadingPipeJoiner::InnerJoin.new
|
539
|
+
when :left, 'left'
|
540
|
+
joiner = Java::CascadingPipeJoiner::LeftJoin.new
|
541
|
+
when :right, 'right'
|
542
|
+
joiner = Java::CascadingPipeJoiner::RightJoin.new
|
543
|
+
when :outer, 'outer'
|
544
|
+
joiner = Java::CascadingPipeJoiner::OuterJoin.new
|
545
|
+
when Array
|
546
|
+
joiner = joiner.map do |t|
|
547
|
+
case t
|
548
|
+
when true, 1, :inner then true
|
549
|
+
when false, 0, :outer then false
|
550
|
+
else fail "invalid mixed joiner entry: #{t}"
|
551
|
+
end
|
552
|
+
end
|
553
|
+
joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
|
554
|
+
end
|
754
555
|
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
556
|
+
if is_hash_join
|
557
|
+
parameters = [
|
558
|
+
pipes.to_java(Java::CascadingPipe::Pipe),
|
559
|
+
group_fields,
|
560
|
+
declared_fields,
|
561
|
+
joiner
|
562
|
+
]
|
563
|
+
group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
|
564
|
+
else
|
565
|
+
result_group_fields = dedup_fields(*group_fields)
|
566
|
+
parameters = [
|
567
|
+
pipes.to_java(Java::CascadingPipe::Pipe),
|
568
|
+
group_fields,
|
569
|
+
declared_fields,
|
570
|
+
result_group_fields,
|
571
|
+
joiner
|
572
|
+
]
|
573
|
+
group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
|
574
|
+
end
|
575
|
+
apply_aggregations(group_assembly, @incoming_scopes, &block)
|
773
576
|
end
|
774
577
|
end
|
775
578
|
end
|