cascading.jruby 0.0.10 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +13 -160
- data/README.md +35 -0
- data/lib/cascading.rb +8 -41
- data/lib/cascading/aggregations.rb +216 -71
- data/lib/cascading/assembly.rb +409 -606
- data/lib/cascading/base.rb +22 -0
- data/lib/cascading/cascade.rb +55 -18
- data/lib/cascading/cascading.rb +137 -47
- data/lib/cascading/expr_stub.rb +31 -17
- data/lib/cascading/ext/array.rb +17 -0
- data/lib/cascading/filter_operations.rb +101 -0
- data/lib/cascading/flow.rb +87 -23
- data/lib/cascading/identity_operations.rb +82 -0
- data/lib/cascading/mode.rb +14 -10
- data/lib/cascading/operations.rb +109 -174
- data/lib/cascading/regex_operations.rb +133 -0
- data/lib/cascading/scope.rb +32 -9
- data/lib/cascading/sub_assembly.rb +8 -5
- data/lib/cascading/tap.rb +41 -17
- data/lib/cascading/text_operations.rb +67 -0
- data/test/mock_assemblies.rb +55 -0
- data/test/test_assembly.rb +23 -25
- data/test/test_local_execution.rb +7 -7
- data/test/test_operations.rb +0 -10
- metadata +76 -74
- data/History.txt +0 -58
data/LICENSE.txt
CHANGED
@@ -1,165 +1,18 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
License:
|
2
|
+
Project and contact information: http://github.com/mrwalker/cascading.jruby
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
7
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
12
15
|
|
13
|
-
|
16
|
+
Third-party Licenses:
|
14
17
|
|
15
|
-
|
16
|
-
General Public License, and the "GNU GPL" refers to version 3 of the GNU
|
17
|
-
General Public License.
|
18
|
-
|
19
|
-
"The Library" refers to a covered work governed by this License,
|
20
|
-
other than an Application or a Combined Work as defined below.
|
21
|
-
|
22
|
-
An "Application" is any work that makes use of an interface provided
|
23
|
-
by the Library, but which is not otherwise based on the Library.
|
24
|
-
Defining a subclass of a class defined by the Library is deemed a mode
|
25
|
-
of using an interface provided by the Library.
|
26
|
-
|
27
|
-
A "Combined Work" is a work produced by combining or linking an
|
28
|
-
Application with the Library. The particular version of the Library
|
29
|
-
with which the Combined Work was made is also called the "Linked
|
30
|
-
Version".
|
31
|
-
|
32
|
-
The "Minimal Corresponding Source" for a Combined Work means the
|
33
|
-
Corresponding Source for the Combined Work, excluding any source code
|
34
|
-
for portions of the Combined Work that, considered in isolation, are
|
35
|
-
based on the Application, and not on the Linked Version.
|
36
|
-
|
37
|
-
The "Corresponding Application Code" for a Combined Work means the
|
38
|
-
object code and/or source code for the Application, including any data
|
39
|
-
and utility programs needed for reproducing the Combined Work from the
|
40
|
-
Application, but excluding the System Libraries of the Combined Work.
|
41
|
-
|
42
|
-
1. Exception to Section 3 of the GNU GPL.
|
43
|
-
|
44
|
-
You may convey a covered work under sections 3 and 4 of this License
|
45
|
-
without being bound by section 3 of the GNU GPL.
|
46
|
-
|
47
|
-
2. Conveying Modified Versions.
|
48
|
-
|
49
|
-
If you modify a copy of the Library, and, in your modifications, a
|
50
|
-
facility refers to a function or data to be supplied by an Application
|
51
|
-
that uses the facility (other than as an argument passed when the
|
52
|
-
facility is invoked), then you may convey a copy of the modified
|
53
|
-
version:
|
54
|
-
|
55
|
-
a) under this License, provided that you make a good faith effort to
|
56
|
-
ensure that, in the event an Application does not supply the
|
57
|
-
function or data, the facility still operates, and performs
|
58
|
-
whatever part of its purpose remains meaningful, or
|
59
|
-
|
60
|
-
b) under the GNU GPL, with none of the additional permissions of
|
61
|
-
this License applicable to that copy.
|
62
|
-
|
63
|
-
3. Object Code Incorporating Material from Library Header Files.
|
64
|
-
|
65
|
-
The object code form of an Application may incorporate material from
|
66
|
-
a header file that is part of the Library. You may convey such object
|
67
|
-
code under terms of your choice, provided that, if the incorporated
|
68
|
-
material is not limited to numerical parameters, data structure
|
69
|
-
layouts and accessors, or small macros, inline functions and templates
|
70
|
-
(ten or fewer lines in length), you do both of the following:
|
71
|
-
|
72
|
-
a) Give prominent notice with each copy of the object code that the
|
73
|
-
Library is used in it and that the Library and its use are
|
74
|
-
covered by this License.
|
75
|
-
|
76
|
-
b) Accompany the object code with a copy of the GNU GPL and this license
|
77
|
-
document.
|
78
|
-
|
79
|
-
4. Combined Works.
|
80
|
-
|
81
|
-
You may convey a Combined Work under terms of your choice that,
|
82
|
-
taken together, effectively do not restrict modification of the
|
83
|
-
portions of the Library contained in the Combined Work and reverse
|
84
|
-
engineering for debugging such modifications, if you also do each of
|
85
|
-
the following:
|
86
|
-
|
87
|
-
a) Give prominent notice with each copy of the Combined Work that
|
88
|
-
the Library is used in it and that the Library and its use are
|
89
|
-
covered by this License.
|
90
|
-
|
91
|
-
b) Accompany the Combined Work with a copy of the GNU GPL and this license
|
92
|
-
document.
|
93
|
-
|
94
|
-
c) For a Combined Work that displays copyright notices during
|
95
|
-
execution, include the copyright notice for the Library among
|
96
|
-
these notices, as well as a reference directing the user to the
|
97
|
-
copies of the GNU GPL and this license document.
|
98
|
-
|
99
|
-
d) Do one of the following:
|
100
|
-
|
101
|
-
0) Convey the Minimal Corresponding Source under the terms of this
|
102
|
-
License, and the Corresponding Application Code in a form
|
103
|
-
suitable for, and under terms that permit, the user to
|
104
|
-
recombine or relink the Application with a modified version of
|
105
|
-
the Linked Version to produce a modified Combined Work, in the
|
106
|
-
manner specified by section 6 of the GNU GPL for conveying
|
107
|
-
Corresponding Source.
|
108
|
-
|
109
|
-
1) Use a suitable shared library mechanism for linking with the
|
110
|
-
Library. A suitable mechanism is one that (a) uses at run time
|
111
|
-
a copy of the Library already present on the user's computer
|
112
|
-
system, and (b) will operate properly with a modified version
|
113
|
-
of the Library that is interface-compatible with the Linked
|
114
|
-
Version.
|
115
|
-
|
116
|
-
e) Provide Installation Information, but only if you would otherwise
|
117
|
-
be required to provide such information under section 6 of the
|
118
|
-
GNU GPL, and only to the extent that such information is
|
119
|
-
necessary to install and execute a modified version of the
|
120
|
-
Combined Work produced by recombining or relinking the
|
121
|
-
Application with a modified version of the Linked Version. (If
|
122
|
-
you use option 4d0, the Installation Information must accompany
|
123
|
-
the Minimal Corresponding Source and Corresponding Application
|
124
|
-
Code. If you use option 4d1, you must provide the Installation
|
125
|
-
Information in the manner specified by section 6 of the GNU GPL
|
126
|
-
for conveying Corresponding Source.)
|
127
|
-
|
128
|
-
5. Combined Libraries.
|
129
|
-
|
130
|
-
You may place library facilities that are a work based on the
|
131
|
-
Library side by side in a single library together with other library
|
132
|
-
facilities that are not Applications and are not covered by this
|
133
|
-
License, and convey such a combined library under terms of your
|
134
|
-
choice, if you do both of the following:
|
135
|
-
|
136
|
-
a) Accompany the combined library with a copy of the same work based
|
137
|
-
on the Library, uncombined with any other library facilities,
|
138
|
-
conveyed under the terms of this License.
|
139
|
-
|
140
|
-
b) Give prominent notice with the combined library that part of it
|
141
|
-
is a work based on the Library, and explaining where to find the
|
142
|
-
accompanying uncombined form of the same work.
|
143
|
-
|
144
|
-
6. Revised Versions of the GNU Lesser General Public License.
|
145
|
-
|
146
|
-
The Free Software Foundation may publish revised and/or new versions
|
147
|
-
of the GNU Lesser General Public License from time to time. Such new
|
148
|
-
versions will be similar in spirit to the present version, but may
|
149
|
-
differ in detail to address new problems or concerns.
|
150
|
-
|
151
|
-
Each version is given a distinguishing version number. If the
|
152
|
-
Library as you received it specifies that a certain numbered version
|
153
|
-
of the GNU Lesser General Public License "or any later version"
|
154
|
-
applies to it, you have the option of following the terms and
|
155
|
-
conditions either of that published version or of any later version
|
156
|
-
published by the Free Software Foundation. If the Library as you
|
157
|
-
received it does not specify a version number of the GNU Lesser
|
158
|
-
General Public License, you may choose any version of the GNU Lesser
|
159
|
-
General Public License ever published by the Free Software Foundation.
|
160
|
-
|
161
|
-
If the Library as you received it specifies that a proxy can decide
|
162
|
-
whether future versions of the GNU Lesser General Public License shall
|
163
|
-
apply, that proxy's public statement of acceptance of any version is
|
164
|
-
permanent authorization for you to choose that version for the
|
165
|
-
Library.
|
18
|
+
All third-party dependencies are listed in ivy.xml.
|
data/README.md
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Cascading.JRuby [![Build Status](https://secure.travis-ci.org/mrwalker/cascading.jruby.png)](http://travis-ci.org/mrwalker/cascading.jruby)
|
2
|
+
|
3
|
+
cascading.jruby is a DSL for [Cascading](http://www.cascading.org/), which is a dataflow API written in Java. With cascading.jruby, Ruby programmers can rapidly script efficient MapReduce jobs for Hadoop.
|
4
|
+
|
5
|
+
To give you a quick idea of what a cascading.jruby job looks like, here's word count:
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
require 'rubygems'
|
9
|
+
require 'cascading'
|
10
|
+
|
11
|
+
input_path = ARGV.shift || (raise 'input_path required')
|
12
|
+
|
13
|
+
cascade 'wordcount', :mode => :local do
|
14
|
+
flow 'wordcount' do
|
15
|
+
source 'input', tap(input_path)
|
16
|
+
|
17
|
+
assembly 'input' do
|
18
|
+
split_rows 'line', /[.,]*\s+/, 'word', :output => 'word'
|
19
|
+
group_by 'word' do
|
20
|
+
count
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
sink 'input', tap('output/wordcount', :sink_mode => :replace)
|
25
|
+
end
|
26
|
+
end.complete
|
27
|
+
```
|
28
|
+
|
29
|
+
cascading.jruby provides a clean Ruby interface to Cascading, but doesn't attempt to add abstractions on top of it. Therefore, you should be acquainted with the [Cascading](http://docs.cascading.org/cascading/2.0/userguide/html/) [API](http://docs.cascading.org/cascading/2.0/javadoc/) before you begin.
|
30
|
+
|
31
|
+
For operations you can apply to your dataflow within a pipe assembly, see the [Assembly](http://rubydoc.info/gems/cascading.jruby/1.0.0/Cascading/Assembly) class. For operations available within a block passed to a group_by, union, or join, see the [Aggregations](http://rubydoc.info/gems/cascading.jruby/1.0.0/Cascading/Aggregations) class.
|
32
|
+
|
33
|
+
Note that the Ruby code you write merely constructs a Cascading job, so no JRuby runtime is required on your cluster. This stands in contrast with writing [Hadoop streaming jobs in Ruby](http://www.quora.com/How-do-the-different-options-for-Ruby-on-Hadoop-compare). To run cascading.jruby applications on a Hadoop cluster, you must use [Jading](https://github.com/mrwalker/jading) to package them into a job jar.
|
34
|
+
|
35
|
+
cascading.jruby has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, 1.6.5, 1.6.7.2, 1.7.0, and 1.7.3.
|
data/lib/cascading.rb
CHANGED
@@ -2,59 +2,26 @@ require 'java'
|
|
2
2
|
|
3
3
|
module Cascading
|
4
4
|
# :stopdoc:
|
5
|
-
VERSION = '0.0.10'
|
6
|
-
LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
|
7
|
-
PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
|
8
|
-
CASCADING_HOME = ENV['CASCADING_HOME']
|
9
|
-
HADOOP_HOME = ENV['HADOOP_HOME']
|
10
|
-
|
11
|
-
# :startdoc:
|
12
|
-
|
13
|
-
# Returns the version string for the library.
|
14
|
-
#
|
15
|
-
def self.version
|
16
|
-
VERSION
|
17
|
-
end
|
18
|
-
|
19
|
-
# Returns the library path for the module. If any arguments are given,
|
20
|
-
# they will be joined to the end of the libray path using
|
21
|
-
# <tt>File.join</tt>.
|
22
|
-
#
|
23
|
-
def self.libpath( *args )
|
24
|
-
args.empty? ? LIBPATH : ::File.join(LIBPATH, args.flatten)
|
25
|
-
end
|
26
|
-
|
27
|
-
# Returns the lpath for the module. If any arguments are given,
|
28
|
-
# they will be joined to the end of the path using
|
29
|
-
# <tt>File.join</tt>.
|
30
|
-
#
|
31
|
-
def self.path( *args )
|
32
|
-
args.empty? ? PATH : ::File.join(PATH, args.flatten)
|
33
|
-
end
|
34
|
-
|
35
|
-
def self.require_all_jars(from = ::File.join(::File.dirname(__FILE__), "..", "jars"))
|
36
|
-
search_me = ::File.expand_path(
|
37
|
-
::File.join(from, '**', '*.jar'))
|
38
|
-
Dir.glob(search_me).sort.each do |jar|
|
39
|
-
require jar
|
40
|
-
end
|
41
|
-
end
|
5
|
+
VERSION = '1.0.0'
|
42
6
|
end
|
43
7
|
|
44
|
-
|
45
|
-
Cascading.require_all_jars(Cascading::CASCADING_HOME) if Cascading::CASCADING_HOME
|
46
|
-
|
8
|
+
require 'cascading/aggregations'
|
47
9
|
require 'cascading/assembly'
|
48
10
|
require 'cascading/base'
|
49
11
|
require 'cascading/cascade'
|
50
12
|
require 'cascading/cascading'
|
51
13
|
require 'cascading/cascading_exception'
|
52
14
|
require 'cascading/expr_stub'
|
15
|
+
require 'cascading/filter_operations'
|
53
16
|
require 'cascading/flow'
|
17
|
+
require 'cascading/identity_operations'
|
54
18
|
require 'cascading/mode'
|
55
19
|
require 'cascading/operations'
|
20
|
+
require 'cascading/regex_operations'
|
56
21
|
require 'cascading/scope'
|
22
|
+
require 'cascading/sub_assembly'
|
57
23
|
require 'cascading/tap'
|
24
|
+
require 'cascading/text_operations'
|
58
25
|
|
59
|
-
# include module to make
|
26
|
+
# include module to make it available at top level
|
60
27
|
include Cascading
|
@@ -1,28 +1,39 @@
|
|
1
|
-
require 'cascading/operations'
|
2
1
|
require 'cascading/scope'
|
3
2
|
require 'cascading/ext/array'
|
4
3
|
|
5
4
|
module Cascading
|
5
|
+
# Aggregations is the context available to you within the block of a group_by,
|
6
|
+
# union, or join that allows you to apply Every pipes to the result of those
|
7
|
+
# operations. You may apply aggregators and buffers within this context
|
8
|
+
# subject to several rules laid out by Cascading.
|
9
|
+
#
|
6
10
|
# Rules enforced by Aggregations:
|
7
11
|
# * Contains either 1 Buffer or >= 1 Aggregator (explicitly checked)
|
8
|
-
# * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not
|
9
|
-
# exist on Aggregations)
|
12
|
+
# * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not exist on Aggregations)
|
10
13
|
# * No Eaches (Aggregations#each does not exist)
|
11
14
|
# * Aggregations may not branch (Aggregations#branch does not exist)
|
12
15
|
#
|
13
16
|
# Externally enforced rules:
|
14
17
|
# * May be empty (in which case, Aggregations is not instantiated)
|
15
|
-
# * Must follow a GroupBy or CoGroup (not a
|
18
|
+
# * Must follow a GroupBy or CoGroup (not a HashJoin or Merge)
|
16
19
|
#
|
17
20
|
# Optimizations:
|
18
|
-
# * If the leading Group is a GroupBy and all subsequent Everies are
|
19
|
-
#
|
20
|
-
#
|
21
|
+
# * If the leading Group is a GroupBy and all subsequent Everies are Aggregators that have a corresponding AggregateBy, Aggregations can replace the GroupBy/Aggregator pipe with a single composite AggregateBy
|
22
|
+
#
|
23
|
+
# Aggregator and buffer DSL standard optional parameter names:
|
24
|
+
# [input] c.p.Every argument selector
|
25
|
+
# [into] c.o.Operation field declaration
|
26
|
+
# [output] c.p.Every output selector
|
21
27
|
class Aggregations
|
22
|
-
include Operations
|
23
|
-
|
24
28
|
attr_reader :assembly, :tail_pipe, :scope, :aggregate_bys
|
25
29
|
|
30
|
+
# Do not use this constructor directly; instead, pass a block containing
|
31
|
+
# the desired aggregations to a group_by, union, or join and it will be
|
32
|
+
# instantiated for you.
|
33
|
+
#
|
34
|
+
# Builds the context in which a sequence of Every aggregations may be
|
35
|
+
# evaluated in the given assembly appended to the given group pipe and with
|
36
|
+
# the given incoming_scopes.
|
26
37
|
def initialize(assembly, group, incoming_scopes)
|
27
38
|
@assembly = assembly
|
28
39
|
@tail_pipe = group
|
@@ -32,23 +43,14 @@ module Cascading
|
|
32
43
|
@aggregate_bys = tail_pipe.is_group_by ? [] : nil
|
33
44
|
end
|
34
45
|
|
46
|
+
# Prints information about the scope of these Aggregations at the point at
|
47
|
+
# which it is called. This allows you to trace the propagation of field
|
48
|
+
# names through your job and is handy for debugging. See Scope for
|
49
|
+
# details.
|
35
50
|
def debug_scope
|
36
51
|
puts "Current scope of aggregations for '#{assembly.name}':\n #{scope}\n----------\n"
|
37
52
|
end
|
38
53
|
|
39
|
-
def make_pipe(type, parameters)
|
40
|
-
pipe = type.new(*parameters)
|
41
|
-
|
42
|
-
# Enforce 1 Buffer or >= 1 Aggregator rule
|
43
|
-
if tail_pipe.kind_of?(Java::CascadingPipe::Every)
|
44
|
-
raise 'Buffer must be sole aggregation' if tail_pipe.buffer? || (tail_pipe.aggregator? && pipe.buffer?)
|
45
|
-
end
|
46
|
-
|
47
|
-
@tail_pipe = pipe
|
48
|
-
@scope = Scope.outgoing_scope(tail_pipe, [scope])
|
49
|
-
end
|
50
|
-
private :make_pipe
|
51
|
-
|
52
54
|
# We can replace these aggregations with the corresponding composite
|
53
55
|
# AggregateBy if the leading Group was a GroupBy and all subsequent
|
54
56
|
# Aggregators had a corresponding AggregateBy (which we've encoded in the
|
@@ -69,13 +71,27 @@ module Cascading
|
|
69
71
|
|
70
72
|
# Builds an every pipe and adds it to the current list of aggregations.
|
71
73
|
# Note that this list may be either exactly 1 Buffer or any number of
|
72
|
-
# Aggregators.
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
74
|
+
# Aggregators. Exactly one of :aggregator or :buffer must be specified and
|
75
|
+
# :aggregator may be accompanied by a corresponding :aggregate_by.
|
76
|
+
#
|
77
|
+
# The named options are:
|
78
|
+
# [aggregator] A Cascading Aggregator, mutually exclusive with :buffer.
|
79
|
+
# [aggregate_by] A Cascading AggregateBy that corresponds to the given
|
80
|
+
# :aggregator. Only makes sense with the :aggregator option
|
81
|
+
# and does not exist for all Aggregators. Providing nothing
|
82
|
+
# or nil will cause all Aggregations to operate as normal,
|
83
|
+
# without being compiled into a composite AggregateBy.
|
84
|
+
# [buffer] A Cascading Buffer, mutually exclusive with :aggregator.
|
85
|
+
# [output] c.p.Every output selector.
|
86
|
+
#
|
87
|
+
# Example:
|
88
|
+
# every 'field1', 'field2', :aggregator => sum_aggregator, :aggregate_by => sum_by, :output => all_fields
|
89
|
+
# every fields(input_fields), :buffer => Java::SomePackage::SomeBuffer.new, :output => all_fields
|
90
|
+
def every(*args_with_options)
|
91
|
+
options, in_fields = args_with_options.extract_options!, fields(args_with_options)
|
77
92
|
out_fields = fields(options[:output])
|
78
93
|
operation = options[:aggregator] || options[:buffer]
|
94
|
+
raise 'every requires either :aggregator or :buffer' unless operation
|
79
95
|
|
80
96
|
if options[:aggregate_by] && aggregate_bys
|
81
97
|
aggregate_bys << options[:aggregate_by]
|
@@ -84,71 +100,152 @@ module Cascading
|
|
84
100
|
end
|
85
101
|
|
86
102
|
parameters = [tail_pipe, in_fields, operation, out_fields].compact
|
87
|
-
make_pipe(Java::CascadingPipe::Every, parameters)
|
88
|
-
|
103
|
+
every = make_pipe(Java::CascadingPipe::Every, parameters)
|
104
|
+
raise ':aggregator specified but c.o.Buffer provided' if options[:aggregator] && every.is_buffer
|
105
|
+
raise ':buffer specified but c.o.Aggregator provided' if options[:buffer] && every.is_aggregator
|
89
106
|
|
90
|
-
|
91
|
-
|
107
|
+
every
|
108
|
+
end
|
92
109
|
|
93
|
-
|
110
|
+
# Builds an every assertion pipe given a c.o.a.Assertion and adds it to the
|
111
|
+
# current list of aggregations. Note this breaks a chain of AggregateBys.
|
112
|
+
#
|
113
|
+
# The named options are:
|
114
|
+
# [level] The assertion level; defaults to strict.
|
115
|
+
def assert_group(assertion, options = {})
|
94
116
|
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
|
95
117
|
|
96
118
|
parameters = [tail_pipe, assertion_level, assertion]
|
97
119
|
make_pipe(Java::CascadingPipe::Every, parameters)
|
98
120
|
end
|
99
121
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(
|
122
|
+
# Builds a pipe that asserts the size of the current group is the specified
|
123
|
+
# size for all groups.
|
124
|
+
def assert_group_size_equals(size, options = {})
|
125
|
+
assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(size)
|
104
126
|
assert_group(assertion, options)
|
105
127
|
end
|
106
128
|
|
107
|
-
#
|
129
|
+
# Computes the minima of the specified fields within each group. Fields
|
130
|
+
# may be a list or a map for renaming. Note that fields are sorted by
|
131
|
+
# input name when a map is provided.
|
108
132
|
#
|
109
|
-
#
|
110
|
-
#
|
111
|
-
# insert) and an options hash.
|
133
|
+
# The named options are:
|
134
|
+
# [ignore] Java Array of Objects of values to be ignored.
|
112
135
|
#
|
113
|
-
#
|
114
|
-
#
|
115
|
-
#
|
116
|
-
#
|
117
|
-
#
|
118
|
-
|
119
|
-
|
136
|
+
# Examples:
|
137
|
+
# assembly 'aggregate' do
|
138
|
+
# ...
|
139
|
+
# insert 'const' => 1
|
140
|
+
# group_by 'const' do
|
141
|
+
# min 'field1', 'field2'
|
142
|
+
# min 'field3' => 'fieldA', 'field4' => 'fieldB'
|
143
|
+
# end
|
144
|
+
# discard 'const'
|
145
|
+
# end
|
146
|
+
def min(*args_with_options)
|
147
|
+
composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Min)
|
148
|
+
end
|
120
149
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
150
|
+
# Computes the maxima of the specified fields within each group. Fields
|
151
|
+
# may be a list or a map for renaming. Note that fields are sorted by
|
152
|
+
# input name when a map is provided.
|
153
|
+
#
|
154
|
+
# The named options are:
|
155
|
+
# [ignore] Java Array of Objects of values to be ignored.
|
156
|
+
#
|
157
|
+
# Examples:
|
158
|
+
# assembly 'aggregate' do
|
159
|
+
# ...
|
160
|
+
# insert 'const' => 1
|
161
|
+
# group_by 'const' do
|
162
|
+
# max 'field1', 'field2'
|
163
|
+
# max 'field3' => 'fieldA', 'field4' => 'fieldB'
|
164
|
+
# end
|
165
|
+
# discard 'const'
|
166
|
+
# end
|
167
|
+
def max(*args_with_options)
|
168
|
+
composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Max)
|
169
|
+
end
|
170
|
+
|
171
|
+
# Returns the first value within each group for the specified fields.
|
172
|
+
# Fields may be a list or a map for renaming. Note that fields are sorted
|
173
|
+
# by input name when a map is provided.
|
174
|
+
#
|
175
|
+
# The named options are:
|
176
|
+
# [ignore] Java Array of Tuples which should be ignored
|
177
|
+
#
|
178
|
+
# Examples:
|
179
|
+
# assembly 'aggregate' do
|
180
|
+
# ...
|
181
|
+
# group_by 'key1', 'key2' do
|
182
|
+
# first 'field1', 'field2'
|
183
|
+
# first 'field3' => 'fieldA', 'field4' => 'fieldB'
|
184
|
+
# end
|
185
|
+
# end
|
186
|
+
def first(*args_with_options)
|
187
|
+
composite_aggregator(args_with_options, Java::CascadingOperationAggregator::First)
|
126
188
|
end
|
127
189
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
190
|
+
# Returns the last value within each group for the specified fields.
|
191
|
+
# Fields may be a list or a map for renaming. Note that fields are sorted
|
192
|
+
# by input name when a map is provided.
|
193
|
+
#
|
194
|
+
# The named options are:
|
195
|
+
# [ignore] Java Array of Tuples which should be ignored
|
196
|
+
#
|
197
|
+
# Examples:
|
198
|
+
# assembly 'aggregate' do
|
199
|
+
# ...
|
200
|
+
# group_by 'key1', 'key2' do
|
201
|
+
# last 'field1', 'field2'
|
202
|
+
# last 'field3' => 'fieldA', 'field4' => 'fieldB'
|
203
|
+
# end
|
204
|
+
# end
|
205
|
+
def last(*args_with_options)
|
206
|
+
composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Last)
|
207
|
+
end
|
132
208
|
|
133
|
-
# Counts elements of
|
134
|
-
# output count field
|
209
|
+
# Counts elements of each group. May optionally specify the name of the
|
210
|
+
# output count field, which defaults to 'count'.
|
211
|
+
#
|
212
|
+
# Examples:
|
213
|
+
# assembly 'aggregate' do
|
214
|
+
# ...
|
215
|
+
# group_by 'key1', 'key2' do
|
216
|
+
# count
|
217
|
+
# count 'key1_key2_count'
|
218
|
+
# end
|
219
|
+
# end
|
135
220
|
def count(name = 'count')
|
136
221
|
count_aggregator = Java::CascadingOperationAggregator::Count.new(fields(name))
|
137
222
|
count_by = Java::CascadingPipeAssembly::CountBy.new(fields(name))
|
138
223
|
every(last_grouping_fields, :aggregator => count_aggregator, :output => all_fields, :aggregate_by => count_by)
|
139
224
|
end
|
140
225
|
|
141
|
-
# Sums
|
142
|
-
#
|
143
|
-
#
|
144
|
-
#
|
145
|
-
#
|
146
|
-
# of
|
147
|
-
|
148
|
-
|
226
|
+
# Sums the specified fields within each group. Fields may be a list or
|
227
|
+
# provided through the :mapping option for renaming. Note that fields are
|
228
|
+
# sorted by name when a map is provided.
|
229
|
+
#
|
230
|
+
# The named options are:
|
231
|
+
# [mapping] Map of input to output field names if renaming is desired.
|
232
|
+
# Results in output fields sorted by input field.
|
233
|
+
# [type] Controls the type of the output, specified using values from the
|
234
|
+
# Cascading::JAVA_TYPE_MAP as in Janino expressions (:double, :long, etc.)
|
235
|
+
#
|
236
|
+
# Examples:
|
237
|
+
# assembly 'aggregate' do
|
238
|
+
# ...
|
239
|
+
# group_by 'key1', 'key2' do
|
240
|
+
# sum 'field1', 'field2', :type => :long
|
241
|
+
# sum :mapping => { 'field3' => 'fieldA', 'field4' => 'fieldB' }, :type => :double
|
242
|
+
# end
|
243
|
+
# end
|
244
|
+
def sum(*args_with_options)
|
245
|
+
options, in_fields = args_with_options.extract_options!, args_with_options
|
149
246
|
type = JAVA_TYPE_MAP[options[:type]]
|
150
247
|
|
151
|
-
mapping = options[:mapping] ? options[:mapping].sort :
|
248
|
+
mapping = options[:mapping] ? options[:mapping].sort : in_fields.zip(in_fields)
|
152
249
|
mapping.each do |in_field, out_field|
|
153
250
|
sum_aggregator = Java::CascadingOperationAggregator::Sum.new(*[fields(out_field), type].compact)
|
154
251
|
# NOTE: SumBy requires a type in wip-286, unlike Sum (see Sum.java line 42 for default)
|
@@ -158,10 +255,22 @@ module Cascading
|
|
158
255
|
raise "sum invoked on 0 fields (note :mapping must be provided to explicitly rename fields)" if mapping.empty?
|
159
256
|
end
|
160
257
|
|
161
|
-
# Averages
|
162
|
-
# that
|
163
|
-
|
164
|
-
|
258
|
+
# Averages the specified fields within each group. Fields may be a list or
|
259
|
+
# a map for renaming. Note that fields are sorted by input name when a map
|
260
|
+
# is provided.
|
261
|
+
#
|
262
|
+
# Examples:
|
263
|
+
# assembly 'aggregate' do
|
264
|
+
# ...
|
265
|
+
# insert 'const' => 1
|
266
|
+
# group_by 'const' do
|
267
|
+
# average 'field1', 'field2'
|
268
|
+
# average 'field3' => 'fieldA', 'field4' => 'fieldB'
|
269
|
+
# end
|
270
|
+
# discard 'const'
|
271
|
+
# end
|
272
|
+
def average(*fields_or_field_map)
|
273
|
+
field_map, _ = extract_field_map(fields_or_field_map)
|
165
274
|
|
166
275
|
field_map.each do |in_field, out_field|
|
167
276
|
average_aggregator = Java::CascadingOperationAggregator::Average.new(fields(out_field))
|
@@ -173,6 +282,42 @@ module Cascading
|
|
173
282
|
|
174
283
|
private
|
175
284
|
|
285
|
+
def make_pipe(type, parameters)
|
286
|
+
pipe = type.new(*parameters)
|
287
|
+
|
288
|
+
# Enforce 1 Buffer or >= 1 Aggregator rule
|
289
|
+
if tail_pipe.kind_of?(Java::CascadingPipe::Every)
|
290
|
+
raise 'Buffer must be sole aggregation' if tail_pipe.buffer? || (tail_pipe.aggregator? && pipe.buffer?)
|
291
|
+
end
|
292
|
+
|
293
|
+
@tail_pipe = pipe
|
294
|
+
@scope = Scope.outgoing_scope(tail_pipe, [scope])
|
295
|
+
|
296
|
+
tail_pipe
|
297
|
+
end
|
298
|
+
|
299
|
+
# Builds a series of every pipes for aggregation.
|
300
|
+
#
|
301
|
+
# Args can either be a list of fields to aggregate and an options hash or
|
302
|
+
# a hash that maps input field name to output field name (similar to
|
303
|
+
# insert) and an options hash.
|
304
|
+
#
|
305
|
+
# The named options are:
|
306
|
+
# [ignore] Java Array of Objects (for min and max) or Tuples (for first and
|
307
|
+
# last) of values for the aggregator to ignore.
|
308
|
+
def composite_aggregator(args, aggregator)
|
309
|
+
field_map, options = extract_field_map(args)
|
310
|
+
|
311
|
+
field_map.each do |in_field, out_field|
|
312
|
+
every(
|
313
|
+
in_field,
|
314
|
+
:aggregator => aggregator.new(*[fields(out_field), options[:ignore]].compact),
|
315
|
+
:output => all_fields
|
316
|
+
)
|
317
|
+
end
|
318
|
+
raise "Composite aggregator '#{aggregator}' invoked on 0 fields" if field_map.empty?
|
319
|
+
end
|
320
|
+
|
176
321
|
# Extracts a field mapping, input field => output field, by accepting a
|
177
322
|
# hash in the first argument. If no hash is provided, then maps arguments
|
178
323
|
# onto themselves which names outputs the same as inputs. Additionally
|