cascading.jruby 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +12 -0
- data/HACKING.md +12 -4
- data/History.txt +8 -0
- data/README.md +2 -2
- data/Rakefile +3 -2
- data/TODO +2 -2
- data/ivy.xml +25 -0
- data/ivysettings.xml +7 -0
- data/lib/cascading.rb +1 -1
- data/lib/cascading/aggregations.rb +190 -0
- data/lib/cascading/assembly.rb +138 -194
- data/lib/cascading/cascading.rb +8 -3
- data/lib/cascading/cascading_exception.rb +12 -10
- data/lib/cascading/flow.rb +3 -2
- data/lib/cascading/operations.rb +1 -23
- data/lib/cascading/scope.rb +27 -24
- data/lib/cascading/sub_assembly.rb +93 -0
- data/samples/copy.rb +3 -1
- data/samples/data/data_group_by.txt +7 -0
- data/samples/data/genealogy/names/dist.all.last +88799 -0
- data/samples/data/gutenberg/the_outline_of_science_vol_1 +12761 -0
- data/samples/group_by.rb +61 -0
- data/samples/logwordcount.rb +3 -1
- data/samples/scorenames.rb +2 -1
- data/samples/sub_assembly.rb +30 -0
- data/samples/union.rb +3 -1
- data/spec/scope_spec.rb +47 -66
- data/spec/spec_util.rb +4 -4
- data/tags +69 -44
- data/tasks/ant.rake +9 -5
- data/tasks/samples.rake +6 -0
- data/tasks/test.rake +1 -1
- data/test/mock_assemblies.rb +55 -0
- data/test/test_aggregations.rb +443 -0
- data/test/test_assembly.rb +437 -196
- data/test/test_exceptions.rb +3 -3
- data/test/test_local_execution.rb +168 -0
- data/test/test_operations.rb +0 -7
- metadata +23 -2
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/HACKING.md
CHANGED
@@ -2,13 +2,21 @@
|
|
2
2
|
|
3
3
|
Some hacking info on `cascading.jruby`:
|
4
4
|
|
5
|
-
|
5
|
+
For local development, install with (requires [bundler](http://gembundler.com/)):
|
6
6
|
|
7
|
-
|
7
|
+
bundle install
|
8
8
|
|
9
|
-
|
9
|
+
To run the tests (will download Cascading and Hadoop jars):
|
10
10
|
|
11
|
-
jruby -S
|
11
|
+
jruby -S bundle exec rake
|
12
|
+
|
13
|
+
To create the gem:
|
14
|
+
|
15
|
+
jruby -S bundle exec rake gem
|
16
|
+
|
17
|
+
To install it locally:
|
18
|
+
|
19
|
+
jruby -S gem install pkg/cascading.jruby-xxx.gem
|
12
20
|
|
13
21
|
The `Cascading::Operations` module is mixed into the `Cascading::Assembly` class to provide some shortcuts for common operations.
|
14
22
|
|
data/History.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
0.0.8 - AggregateBy and upgrade to Cascading 2.0.0 wip-286
|
2
|
+
|
3
|
+
This release upgrades to Cascading 2.0.0 wip-286, but again does not implement
|
4
|
+
local mode. It does, however, replace group_by/union which contain only count,
|
5
|
+
sum, and average with the corresponding composite AggregateBy. Various
|
6
|
+
refactorings were required to support this, including the addition of limited
|
7
|
+
support for SubAssemblies.
|
8
|
+
|
1
9
|
0.0.7 - Upgrade to Cascading 2.0.0 wip-255
|
2
10
|
|
3
11
|
This release upgrades to Cascading 2.0.0 wip-255, but implements neither local
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
# Cascading.JRuby
|
1
|
+
# Cascading.JRuby [![Build Status](https://secure.travis-ci.org/mrwalker/cascading.jruby.png)](http://travis-ci.org/mrwalker/cascading.jruby)
|
2
2
|
|
3
3
|
`cascading.jruby` is a small DSL above [Cascading](http://www.cascading.org/).
|
4
4
|
|
5
|
-
It requires Hadoop (>= 0.20.2) and Cascading (
|
5
|
+
It requires Hadoop (>= 0.20.2) and [Cascading 2.0.0-wip-286](http://files.concurrentinc.com/cascading/2.0/cascading-2.0.0-wip-286-hadoop-0.20.2%2B.tgz), whose locations must be set via the environment variables `HADOOP_HOME` and `CASCADING_HOME`.
|
6
6
|
|
7
7
|
It has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, and 1.6.5.
|
8
8
|
|
data/Rakefile
CHANGED
@@ -18,8 +18,9 @@ end
|
|
18
18
|
ensure_in_path 'lib'
|
19
19
|
|
20
20
|
require 'cascading'
|
21
|
+
require 'bundler/setup'
|
21
22
|
|
22
|
-
task :default => 'test
|
23
|
+
task :default => 'test'
|
23
24
|
|
24
25
|
task :run do
|
25
26
|
# ensure_in_path "samples"
|
@@ -28,7 +29,7 @@ task :run do
|
|
28
29
|
end
|
29
30
|
|
30
31
|
desc 'Remove gem and Java build files'
|
31
|
-
task :clean => ['ant:clean', 'gem:clean'] do
|
32
|
+
task :clean => ['ant:clean', 'gem:clean', 'samples:clean'] do
|
32
33
|
puts 'Build files removed'
|
33
34
|
end
|
34
35
|
|
data/TODO
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
Documentation
|
2
2
|
|
3
|
-
|
4
|
-
Look into totally eliminating registries
|
3
|
+
Import local_job/Job from Etsy, which can eliminate registries
|
5
4
|
|
6
5
|
Bug fixes on github
|
7
6
|
Enforce more runtime rules at composition time
|
8
7
|
Standardize helper contracts
|
9
8
|
Possibly combine unit tests...into unit tests because RSpec sucks and swallows stack traces
|
10
9
|
|
10
|
+
(Jading)
|
11
11
|
Split out runner
|
12
12
|
Make runner implement Tool
|
13
13
|
Create build tool for job jar
|
data/ivy.xml
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
2
|
+
<ivy-module version="2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
3
|
+
xsi:noNamespaceSchemaLocation="http://ant.apache.org/ivy/schemas/ivy.xsd">
|
4
|
+
<info organisation="com.etsy" module="cascading.jruby" status="integration" />
|
5
|
+
|
6
|
+
<configurations>
|
7
|
+
<conf name="default" visibility="public" description="runtime dependencies and master artifact can be used with this conf" extends="runtime,master" />
|
8
|
+
<conf name="master" visibility="public" description="contains only the artifact published by this module itself, with no transitive dependencies" />
|
9
|
+
<conf name="compile" visibility="public" description="this is the default scope, used if none is specified. Compile dependencies are available in all classpaths." />
|
10
|
+
<conf name="provided" visibility="public" description="this is much like compile, but indicates you expect the JDK or a container to provide it. It is only available on the compilation classpath, and is not transitive." />
|
11
|
+
<conf name="runtime" visibility="public" description="this scope indicates that the dependency is not required for compilation, but is for execution. It is in the runtime and test classpaths, but not the compile classpath." extends="compile" />
|
12
|
+
<conf name="test" visibility="private" description="this scope indicates that the dependency is not required for normal use of the application, and is only available for the test compilation and execution phases." extends="runtime" />
|
13
|
+
<conf name="system" visibility="public" description="this scope is similar to provided except that you have to provide the JAR which contains it explicitly. The artifact is always available and is not looked up in a repository." />
|
14
|
+
<conf name="sources" visibility="public" description="this configuration contains the source artifact of this module, if any." />
|
15
|
+
<conf name="javadoc" visibility="public" description="this configuration contains the javadoc artifact of this module, if any." />
|
16
|
+
<conf name="optional" visibility="public" description="contains all optional dependencies" />
|
17
|
+
</configurations>
|
18
|
+
|
19
|
+
<dependencies>
|
20
|
+
<dependency org="cascading" name="cascading-core" rev="2.0.0-wip-286" conf="default" />
|
21
|
+
<dependency org="cascading" name="cascading-local" rev="2.0.0-wip-286" conf="default" />
|
22
|
+
<dependency org="cascading" name="cascading-hadoop" rev="2.0.0-wip-286" conf="default" />
|
23
|
+
<dependency org="org.jruby" name="jruby" rev="1.6.5" conf="default" />
|
24
|
+
</dependencies>
|
25
|
+
</ivy-module>
|
data/ivysettings.xml
ADDED
data/lib/cascading.rb
CHANGED
data/lib/cascading/aggregations.rb
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
require 'cascading/operations'
|
2
|
+
require 'cascading/scope'
|
3
|
+
require 'cascading/ext/array'
|
4
|
+
|
5
|
+
module Cascading
|
6
|
+
# Rules enforced by Aggregations:
|
7
|
+
# Contains either 1 Buffer or >= 1 Aggregator (explicitly checked)
|
8
|
+
# No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not exist on Aggregations)
|
9
|
+
# No Eaches (Aggregations#each does not exist)
|
10
|
+
# Aggregations may not branch (Aggregations#branch does not exist)
|
11
|
+
#
|
12
|
+
# Externally enforced rules:
|
13
|
+
# May be empty (in which case, Aggregations is not instantiated)
|
14
|
+
# Must follow a GroupBy or CoGroup (not a Join or Merge)
|
15
|
+
#
|
16
|
+
# Optimizations:
|
17
|
+
# If the leading Group is a GroupBy and all subsequent Everies are
|
18
|
+
# Aggregators that have a corresponding AggregateBy, Aggregations can
|
19
|
+
# replace the GroupBy/Aggregator pipe with a single composite AggregateBy.
|
20
|
+
class Aggregations
|
21
|
+
include Operations
|
22
|
+
|
23
|
+
attr_reader :assembly, :tail_pipe, :scope, :aggregate_bys
|
24
|
+
|
25
|
+
def initialize(assembly, group, incoming_scopes)
|
26
|
+
@assembly = assembly
|
27
|
+
@tail_pipe = group
|
28
|
+
@scope = Scope.outgoing_scope(tail_pipe, incoming_scopes)
|
29
|
+
|
30
|
+
# AggregateBy optimization only applies to GroupBy
|
31
|
+
@aggregate_bys = tail_pipe.is_group_by ? [] : nil
|
32
|
+
end
|
33
|
+
|
34
|
+
def debug_scope
|
35
|
+
puts "Current scope of aggregations for '#{assembly.name}':\n #{scope}\n----------\n"
|
36
|
+
end
|
37
|
+
|
38
|
+
def make_pipe(type, parameters)
|
39
|
+
pipe = type.new(*parameters)
|
40
|
+
|
41
|
+
# Enforce 1 Buffer or >= 1 Aggregator rule
|
42
|
+
if tail_pipe.kind_of?(Java::CascadingPipe::Every)
|
43
|
+
raise 'Buffer must be sole aggregation' if tail_pipe.buffer? || (tail_pipe.aggregator? && pipe.buffer?)
|
44
|
+
end
|
45
|
+
|
46
|
+
@tail_pipe = pipe
|
47
|
+
@scope = Scope.outgoing_scope(tail_pipe, [scope])
|
48
|
+
end
|
49
|
+
private :make_pipe
|
50
|
+
|
51
|
+
# We can replace these aggregations with the corresponding composite
|
52
|
+
# AggregateBy if the leading Group was a GroupBy and all subsequent
|
53
|
+
# Aggregators had a corresponding AggregateBy (which we've encoded in the
|
54
|
+
# list of aggregate_bys being a non-empty array).
|
55
|
+
def can_aggregate_by?
|
56
|
+
!aggregate_bys.nil? && !aggregate_bys.empty?
|
57
|
+
end
|
58
|
+
|
59
|
+
# "Fix" out values fields after a sequence of Everies. This is a field
|
60
|
+
# name metadata fix which is why the Identity is not planned into the
|
61
|
+
# resulting Cascading pipe. Without it, all values fields would propagate
|
62
|
+
# through non-empty aggregations, which doesn't match Cascading's planner's
|
63
|
+
# behavior.
|
64
|
+
def finalize
|
65
|
+
discard_each = Java::CascadingPipe::Each.new(tail_pipe, all_fields, Java::CascadingOperation::Identity.new)
|
66
|
+
@scope = Scope.outgoing_scope(discard_each, [scope])
|
67
|
+
end
|
68
|
+
|
69
|
+
# Builds an every pipe and adds it to the current list of aggregations.
|
70
|
+
# Note that this list may be either exactly 1 Buffer or any number of
|
71
|
+
# Aggregators.
|
72
|
+
def every(*args)
|
73
|
+
options = args.extract_options!
|
74
|
+
|
75
|
+
in_fields = fields(args)
|
76
|
+
out_fields = fields(options[:output])
|
77
|
+
operation = options[:aggregator] || options[:buffer]
|
78
|
+
|
79
|
+
if options[:aggregate_by] && aggregate_bys
|
80
|
+
aggregate_bys << options[:aggregate_by]
|
81
|
+
else
|
82
|
+
@aggregate_bys = nil
|
83
|
+
end
|
84
|
+
|
85
|
+
parameters = [tail_pipe, in_fields, operation, out_fields].compact
|
86
|
+
make_pipe(Java::CascadingPipe::Every, parameters)
|
87
|
+
end
|
88
|
+
|
89
|
+
def assert_group(*args)
|
90
|
+
options = args.extract_options!
|
91
|
+
|
92
|
+
assertion = args[0]
|
93
|
+
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
|
94
|
+
|
95
|
+
parameters = [tail_pipe, assertion_level, assertion]
|
96
|
+
make_pipe(Java::CascadingPipe::Every, parameters)
|
97
|
+
end
|
98
|
+
|
99
|
+
def assert_group_size_equals(*args)
|
100
|
+
options = args.extract_options!
|
101
|
+
|
102
|
+
assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(args[0])
|
103
|
+
assert_group(assertion, options)
|
104
|
+
end
|
105
|
+
|
106
|
+
# Builds a series of every pipes for aggregation.
|
107
|
+
#
|
108
|
+
# Args can either be a list of fields to aggregate and an options hash or
|
109
|
+
# a hash that maps input field name to output field name (similar to
|
110
|
+
# insert) and an options hash.
|
111
|
+
#
|
112
|
+
# Options include:
|
113
|
+
# * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
|
114
|
+
# (for first and last) of values for the aggregator to ignore
|
115
|
+
#
|
116
|
+
# <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
|
117
|
+
def composite_aggregator(args, function)
|
118
|
+
field_map, options = extract_field_map(args)
|
119
|
+
|
120
|
+
field_map.each do |in_field, out_field|
|
121
|
+
agg = self.send(function, out_field, options)
|
122
|
+
every(in_field, :aggregator => agg, :output => all_fields)
|
123
|
+
end
|
124
|
+
raise "Composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields" if field_map.empty?
|
125
|
+
end
|
126
|
+
|
127
|
+
def min(*args); composite_aggregator(args, :min_function); end
|
128
|
+
def max(*args); composite_aggregator(args, :max_function); end
|
129
|
+
def first(*args); composite_aggregator(args, :first_function); end
|
130
|
+
def last(*args); composite_aggregator(args, :last_function); end
|
131
|
+
|
132
|
+
# Counts elements of a group. May optionally specify the name of the
|
133
|
+
# output count field (defaults to 'count').
|
134
|
+
def count(name = 'count')
|
135
|
+
count_aggregator = Java::CascadingOperationAggregator::Count.new(fields(name))
|
136
|
+
count_by = Java::CascadingPipeAssembly::CountBy.new(fields(name))
|
137
|
+
every(last_grouping_fields, :aggregator => count_aggregator, :output => all_fields, :aggregate_by => count_by)
|
138
|
+
end
|
139
|
+
|
140
|
+
# Sums one or more fields. Fields to be summed may either be provided as
|
141
|
+
# the arguments to sum (in which case they will be aggregated into a field
|
142
|
+
# of the same name in the given order), or via a hash using the :mapping
|
143
|
+
# parameter (in which case they will be aggregated from the field named by
|
144
|
+
# the key into the field named by the value after being sorted). The type
|
145
|
+
# of the output sum may be controlled with the :type parameter.
|
146
|
+
def sum(*args)
|
147
|
+
options = args.extract_options!
|
148
|
+
type = JAVA_TYPE_MAP[options[:type]]
|
149
|
+
|
150
|
+
mapping = options[:mapping] ? options[:mapping].sort : args.zip(args)
|
151
|
+
mapping.each do |in_field, out_field|
|
152
|
+
sum_aggregator = Java::CascadingOperationAggregator::Sum.new(*[fields(out_field), type].compact)
|
153
|
+
# NOTE: SumBy requires a type in wip-286, unlike Sum (see Sum.java line 42 for default)
|
154
|
+
sum_by = Java::CascadingPipeAssembly::SumBy.new(fields(in_field), fields(out_field), type || Java::double.java_class)
|
155
|
+
every(in_field, :aggregator => sum_aggregator, :output => all_fields, :aggregate_by => sum_by)
|
156
|
+
end
|
157
|
+
raise "sum invoked on 0 fields (note :mapping must be provided to explicitly rename fields)" if mapping.empty?
|
158
|
+
end
|
159
|
+
|
160
|
+
# Averages one or more fields. The contract of average is identical to
|
161
|
+
# that of other composite aggregators, but it accepts no options.
|
162
|
+
def average(*args)
|
163
|
+
field_map, _ = extract_field_map(args)
|
164
|
+
|
165
|
+
field_map.each do |in_field, out_field|
|
166
|
+
average_aggregator = Java::CascadingOperationAggregator::Average.new(fields(out_field))
|
167
|
+
average_by = Java::CascadingPipeAssembly::AverageBy.new(fields(in_field), fields(out_field))
|
168
|
+
every(in_field, :aggregator => average_aggregator, :output => all_fields, :aggregate_by => average_by)
|
169
|
+
end
|
170
|
+
raise "average invoked on 0 fields" if field_map.empty?
|
171
|
+
end
|
172
|
+
|
173
|
+
private
|
174
|
+
|
175
|
+
# Extracts a field mapping, input field => output field, by accepting a
|
176
|
+
# hash in the first argument. If no hash is provided, then maps arguments
|
177
|
+
# onto themselves which names outputs the same as inputs. Additionally
|
178
|
+
# extracts options from args.
|
179
|
+
def extract_field_map(args)
|
180
|
+
if !args.empty? && args.first.kind_of?(Hash)
|
181
|
+
field_map = args.shift.sort
|
182
|
+
options = args.extract_options!
|
183
|
+
else
|
184
|
+
options = args.extract_options!
|
185
|
+
field_map = args.zip(args)
|
186
|
+
end
|
187
|
+
[field_map, options]
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
data/lib/cascading/assembly.rb
CHANGED
@@ -1,21 +1,23 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
# Copyright 2009, Grégoire Marabout. All Rights Reserved.
|
2
3
|
#
|
3
4
|
# This is free software. Please see the LICENSE and COPYING files for details.
|
4
5
|
|
5
6
|
require 'cascading/base'
|
6
7
|
require 'cascading/operations'
|
8
|
+
require 'cascading/aggregations'
|
9
|
+
require 'cascading/sub_assembly'
|
7
10
|
require 'cascading/ext/array'
|
8
11
|
|
9
12
|
module Cascading
|
10
13
|
class Assembly < Cascading::Node
|
11
14
|
include Operations
|
12
15
|
|
13
|
-
|
16
|
+
attr_reader :head_pipe, :tail_pipe
|
14
17
|
|
15
18
|
def initialize(name, parent, outgoing_scopes = {})
|
16
19
|
super(name, parent)
|
17
20
|
|
18
|
-
@every_applied = false
|
19
21
|
@outgoing_scopes = outgoing_scopes
|
20
22
|
if parent.kind_of?(Assembly)
|
21
23
|
@head_pipe = Java::CascadingPipe::Pipe.new(name, parent.tail_pipe)
|
@@ -26,13 +28,13 @@ module Cascading
|
|
26
28
|
@head_pipe = Java::CascadingPipe::Pipe.new(name)
|
27
29
|
@outgoing_scopes[name] ||= Scope.empty_scope(name)
|
28
30
|
end
|
29
|
-
@tail_pipe =
|
31
|
+
@tail_pipe = head_pipe
|
30
32
|
@incoming_scopes = [scope]
|
31
33
|
end
|
32
34
|
|
33
35
|
def describe(offset = '')
|
34
|
-
incoming_scopes_desc = "#{incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
|
35
|
-
incoming_scopes_desc = "(#{incoming_scopes_desc})" unless incoming_scopes.size == 1
|
36
|
+
incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
|
37
|
+
incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1
|
36
38
|
description = "#{offset}#{name}:assembly :: #{incoming_scopes_desc} -> #{scope.values_fields.to_a.inspect}"
|
37
39
|
description += "\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}" unless children.empty?
|
38
40
|
description
|
@@ -51,108 +53,122 @@ module Cascading
|
|
51
53
|
puts "Current scope for '#{name}':\n #{scope}\n----------\n"
|
52
54
|
end
|
53
55
|
|
54
|
-
def
|
55
|
-
|
56
|
-
@
|
57
|
-
end
|
56
|
+
def make_pipe(type, parameters)
|
57
|
+
@tail_pipe = type.new(*parameters)
|
58
|
+
@outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
|
58
59
|
|
59
|
-
|
60
|
-
make_pipe(type, parameters, scope.grouping_key_fields)
|
61
|
-
@every_applied = true
|
60
|
+
tail_pipe
|
62
61
|
end
|
62
|
+
private :make_pipe
|
63
63
|
|
64
|
-
def
|
65
|
-
|
64
|
+
def populate_incoming_scopes(assembly_names, group_fields_args = {})
|
65
|
+
# NOTE: this overrides the existing incoming_scopes, which changes the
|
66
|
+
# way describe will function on this assembly
|
67
|
+
pipes, @incoming_scopes, group_fields = [], [], []
|
68
|
+
assembly_names.each do |assembly_name|
|
69
|
+
assembly = parent_flow.find_child(assembly_name)
|
70
|
+
raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
|
71
|
+
|
72
|
+
pipes << assembly.tail_pipe
|
73
|
+
@incoming_scopes << assembly.scope
|
74
|
+
group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
|
75
|
+
end
|
76
|
+
[pipes, group_fields]
|
66
77
|
end
|
78
|
+
private :populate_incoming_scopes
|
67
79
|
|
68
|
-
def
|
69
|
-
|
80
|
+
def apply_aggregations(group, incoming_scopes, &block)
|
81
|
+
aggregations = Aggregations.new(self, group, incoming_scopes)
|
82
|
+
aggregations.instance_eval(&block) if block_given?
|
70
83
|
|
71
|
-
#
|
72
|
-
|
73
|
-
|
84
|
+
# Sorting of any type means that we cannot use the AggregateBy optimization
|
85
|
+
if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
|
86
|
+
grouping_fields = group.key_selectors.values.first
|
87
|
+
group.key_selectors.values.each do |key_fields|
|
88
|
+
raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
|
89
|
+
end
|
74
90
|
|
75
|
-
|
76
|
-
|
91
|
+
aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
|
92
|
+
name,
|
93
|
+
group.previous,
|
94
|
+
grouping_fields,
|
95
|
+
aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
|
96
|
+
), group.previous, incoming_scopes)
|
77
97
|
|
78
|
-
|
79
|
-
|
80
|
-
|
98
|
+
aggregate_by
|
99
|
+
else
|
100
|
+
aggregations.finalize if block_given?
|
101
|
+
@tail_pipe = aggregations.tail_pipe
|
102
|
+
@outgoing_scopes[name] = aggregations.scope
|
103
|
+
|
104
|
+
group
|
105
|
+
end
|
81
106
|
end
|
107
|
+
private :apply_aggregations
|
82
108
|
|
83
109
|
def to_s
|
84
|
-
"#{name} : head pipe : #{
|
110
|
+
"#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}"
|
85
111
|
end
|
86
112
|
|
87
|
-
# Builds a join (CoGroup) pipe. Requires a list of assembly names to join
|
113
|
+
# Builds a join (CoGroup) pipe. Requires a list of assembly names to join
|
114
|
+
# and :on to specify the group_fields.
|
88
115
|
def join(*args, &block)
|
89
116
|
options = args.extract_options!
|
90
117
|
|
91
|
-
pipes,
|
92
|
-
args.each do |assembly_name|
|
93
|
-
assembly = parent_flow.find_child(assembly_name)
|
94
|
-
raise "Could not find assembly '#{assembly_name}' in join" unless assembly
|
118
|
+
pipes, _ = populate_incoming_scopes(args)
|
95
119
|
|
96
|
-
|
97
|
-
|
98
|
-
end
|
120
|
+
group_fields_args = options[:on]
|
121
|
+
raise 'join requires :on parameter' unless group_fields_args
|
99
122
|
|
100
|
-
group_fields_args = options.delete(:on)
|
101
123
|
if group_fields_args.kind_of?(String)
|
102
124
|
group_fields_args = [group_fields_args]
|
103
125
|
end
|
104
|
-
|
126
|
+
|
105
127
|
group_fields = []
|
106
128
|
if group_fields_args.kind_of?(Array)
|
107
129
|
pipes.size.times do
|
108
130
|
group_fields << fields(group_fields_args)
|
109
131
|
end
|
110
132
|
elsif group_fields_args.kind_of?(Hash)
|
111
|
-
pipes,
|
112
|
-
|
113
|
-
|
114
|
-
v = group_fields_args[assembly_name]
|
115
|
-
assembly = parent_flow.find_child(assembly_name)
|
116
|
-
raise "Could not find assembly '#{assembly_name}' in join" unless assembly
|
117
|
-
|
118
|
-
pipes << assembly.tail_pipe
|
119
|
-
incoming_scopes << @outgoing_scopes[assembly.name]
|
120
|
-
group_fields << fields(v)
|
121
|
-
group_fields_names = group_fields_args[keys.first].to_a
|
122
|
-
end
|
133
|
+
pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
|
134
|
+
else
|
135
|
+
raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
|
123
136
|
end
|
124
137
|
|
138
|
+
raise 'join requires non-empty :on parameter' if group_fields_args.empty?
|
125
139
|
group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
|
126
|
-
incoming_fields = incoming_scopes.map{ |s| s.values_fields }
|
140
|
+
incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
|
127
141
|
declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
|
128
|
-
joiner = options
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
else fail "invalid mixed joiner entry: #{t}"
|
146
|
-
end
|
142
|
+
joiner = options[:joiner]
|
143
|
+
|
144
|
+
case joiner
|
145
|
+
when :inner, 'inner', nil
|
146
|
+
joiner = Java::CascadingPipeJoiner::InnerJoin.new
|
147
|
+
when :left, 'left'
|
148
|
+
joiner = Java::CascadingPipeJoiner::LeftJoin.new
|
149
|
+
when :right, 'right'
|
150
|
+
joiner = Java::CascadingPipeJoiner::RightJoin.new
|
151
|
+
when :outer, 'outer'
|
152
|
+
joiner = Java::CascadingPipeJoiner::OuterJoin.new
|
153
|
+
when Array
|
154
|
+
joiner = joiner.map do |t|
|
155
|
+
case t
|
156
|
+
when true, 1, :inner then true
|
157
|
+
when false, 0, :outer then false
|
158
|
+
else fail "invalid mixed joiner entry: #{t}"
|
147
159
|
end
|
148
|
-
joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
|
149
160
|
end
|
161
|
+
joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
|
150
162
|
end
|
151
|
-
|
152
|
-
parameters = [
|
153
|
-
|
154
|
-
|
155
|
-
|
163
|
+
result_group_fields = dedup_fields(*group_fields)
|
164
|
+
parameters = [
|
165
|
+
pipes.to_java(Java::CascadingPipe::Pipe),
|
166
|
+
group_fields,
|
167
|
+
declared_fields,
|
168
|
+
result_group_fields,
|
169
|
+
joiner
|
170
|
+
]
|
171
|
+
apply_aggregations(Java::CascadingPipe::CoGroup.new(*parameters), @incoming_scopes, &block)
|
156
172
|
end
|
157
173
|
alias co_group join
|
158
174
|
|
@@ -193,66 +209,76 @@ module Cascading
|
|
193
209
|
assembly
|
194
210
|
end
|
195
211
|
|
196
|
-
# Builds a new
|
197
|
-
#
|
212
|
+
# Builds a new GroupBy pipe that groups on the fields given in args.
|
213
|
+
# Any block passed to this method should contain only Everies.
|
198
214
|
def group_by(*args, &block)
|
199
215
|
options = args.extract_options!
|
200
|
-
|
201
216
|
group_fields = fields(args)
|
202
|
-
|
203
|
-
sort_fields = fields(options[:sort_by] || args)
|
217
|
+
sort_fields = fields(options[:sort_by])
|
204
218
|
reverse = options[:reverse]
|
205
219
|
|
206
|
-
parameters = [
|
207
|
-
|
208
|
-
do_every_block_and_rename_fields(args, [scope], &block)
|
220
|
+
parameters = [tail_pipe, group_fields, sort_fields, reverse].compact
|
221
|
+
apply_aggregations(Java::CascadingPipe::GroupBy.new(*parameters), [scope], &block)
|
209
222
|
end
|
210
223
|
|
211
|
-
# Unifies
|
212
|
-
#
|
213
|
-
#
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
224
|
+
# Unifies multiple incoming pipes sharing the same field structure using a
|
225
|
+
# GroupBy. Accepts :on like join and :sort_by and :reverse like group_by,
|
226
|
+
# as well as a block which may be used for a sequence of Every
|
227
|
+
# aggregations.
|
228
|
+
#
|
229
|
+
# By default, groups only on the first field (see line 189 of GroupBy.java)
|
230
|
+
def union(*args, &block)
|
231
|
+
options = args.extract_options!
|
232
|
+
group_fields = fields(options[:on])
|
233
|
+
sort_fields = fields(options[:sort_by])
|
234
|
+
reverse = options[:reverse]
|
221
235
|
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
236
|
+
pipes, _ = populate_incoming_scopes(args)
|
237
|
+
|
238
|
+
# Must provide group_fields to ensure field name propagation
|
239
|
+
group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields
|
240
|
+
|
241
|
+
# FIXME: GroupBy is missing a constructor for union in wip-255
|
242
|
+
sort_fields = group_fields if !sort_fields && !reverse.nil?
|
243
|
+
|
244
|
+
parameters = [pipes.to_java(Java::CascadingPipe::Pipe), group_fields, sort_fields, reverse].compact
|
245
|
+
apply_aggregations(Java::CascadingPipe::GroupBy.new(*parameters), @incoming_scopes, &block)
|
227
246
|
end
|
247
|
+
alias :union_pipes :union
|
228
248
|
|
229
|
-
#
|
230
|
-
|
231
|
-
|
249
|
+
# Allows you to plug c.p.SubAssemblies into a cascading.jruby Assembly
|
250
|
+
# under certain assumptions. Note the default is to extend the tail pipe
|
251
|
+
# of this Assembly using a linear SubAssembly. See SubAssembly class for
|
252
|
+
# details.
|
253
|
+
def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
|
254
|
+
sub_assembly = SubAssembly.new(self, sub_assembly)
|
255
|
+
sub_assembly.finalize(pipes, incoming_scopes)
|
232
256
|
|
233
|
-
|
234
|
-
|
235
|
-
operation = options[:aggregator] || options[:buffer]
|
257
|
+
@tail_pipe = sub_assembly.tail_pipe
|
258
|
+
@outgoing_scopes[name] = sub_assembly.scope
|
236
259
|
|
237
|
-
|
238
|
-
make_every(Java::CascadingPipe::Every, *parameters)
|
260
|
+
sub_assembly
|
239
261
|
end
|
240
262
|
|
241
263
|
# Builds a basic _each_ pipe, and adds it to the current assembly.
|
242
264
|
# --
|
243
265
|
# Example:
|
244
|
-
# each
|
245
|
-
# :pattern => /[.,]*\s+/),
|
246
|
-
# :output=>["id", "name", "val1", "val2"]
|
266
|
+
# each 'line', :function => regex_splitter(['name', 'val1', 'val2', 'id'], :pattern => /[.,]*\s+/), :output => ['id', 'name', 'val1', 'val2']
|
247
267
|
def each(*args)
|
248
268
|
options = args.extract_options!
|
249
269
|
|
250
270
|
in_fields = fields(args)
|
251
271
|
out_fields = fields(options[:output])
|
272
|
+
|
252
273
|
operation = options[:filter] || options[:function]
|
274
|
+
raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output]
|
275
|
+
|
276
|
+
parameters = [tail_pipe, in_fields, operation, out_fields].compact
|
277
|
+
each = make_pipe(Java::CascadingPipe::Each, parameters)
|
278
|
+
raise ':function specified but c.o.Filter provided' if options[:function] && each.is_filter
|
279
|
+
raise ':filter specified but c.o.Function provided' if options[:filter] && each.is_function
|
253
280
|
|
254
|
-
|
255
|
-
make_each(Java::CascadingPipe::Each, *parameters)
|
281
|
+
each
|
256
282
|
end
|
257
283
|
|
258
284
|
# Restricts the current assembly to the specified fields.
|
@@ -273,14 +299,6 @@ module Cascading
|
|
273
299
|
project(*keep_fields.to_a)
|
274
300
|
end
|
275
301
|
|
276
|
-
# Assign new names to initial fields in positional order.
|
277
|
-
# --
|
278
|
-
# Example:
|
279
|
-
# bind_names "field1", "field2"
|
280
|
-
def bind_names(*new_names)
|
281
|
-
each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
|
282
|
-
end
|
283
|
-
|
284
302
|
# Renames fields according to the mapping provided.
|
285
303
|
# --
|
286
304
|
# Example:
|
@@ -318,14 +336,9 @@ module Cascading
|
|
318
336
|
options = args.extract_options!
|
319
337
|
assertion = args[0]
|
320
338
|
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
|
321
|
-
make_each(Java::CascadingPipe::Each, @tail_pipe, assertion_level, assertion)
|
322
|
-
end
|
323
339
|
|
324
|
-
|
325
|
-
|
326
|
-
assertion = args[0]
|
327
|
-
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
|
328
|
-
make_every(Java::CascadingPipe::Every, @tail_pipe, assertion_level, assertion)
|
340
|
+
parameters = [tail_pipe, assertion_level, assertion]
|
341
|
+
make_pipe(Java::CascadingPipe::Each, parameters)
|
329
342
|
end
|
330
343
|
|
331
344
|
# Builds a debugging pipe.
|
@@ -362,67 +375,6 @@ module Cascading
|
|
362
375
|
assert(assertion, options)
|
363
376
|
end
|
364
377
|
|
365
|
-
def assert_group_size_equals(*args)
|
366
|
-
options = args.extract_options!
|
367
|
-
assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(args[0])
|
368
|
-
assert_group(assertion, options)
|
369
|
-
end
|
370
|
-
|
371
|
-
# Builds a series of every pipes for aggregation.
|
372
|
-
#
|
373
|
-
# Args can either be a list of fields to aggregate and an options hash or
|
374
|
-
# a hash that maps input field name to output field name (similar to
|
375
|
-
# insert) and an options hash.
|
376
|
-
#
|
377
|
-
# Options include:
|
378
|
-
# * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
|
379
|
-
# (for first and last) of values for the aggregator to ignore
|
380
|
-
#
|
381
|
-
# <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
|
382
|
-
def composite_aggregator(args, function)
|
383
|
-
if !args.empty? && args.first.kind_of?(Hash)
|
384
|
-
field_map = args.shift.sort
|
385
|
-
options = args.extract_options!
|
386
|
-
else
|
387
|
-
options = args.extract_options!
|
388
|
-
field_map = args.zip(args)
|
389
|
-
end
|
390
|
-
field_map.each do |in_field, out_field|
|
391
|
-
agg = self.send(function, out_field, options)
|
392
|
-
every(in_field, :aggregator => agg, :output => all_fields)
|
393
|
-
end
|
394
|
-
puts "WARNING: composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields; will be ignored" if field_map.empty?
|
395
|
-
end
|
396
|
-
|
397
|
-
def min(*args); composite_aggregator(args, :min_function); end
|
398
|
-
def max(*args); composite_aggregator(args, :max_function); end
|
399
|
-
def first(*args); composite_aggregator(args, :first_function); end
|
400
|
-
def last(*args); composite_aggregator(args, :last_function); end
|
401
|
-
def average(*args); composite_aggregator(args, :average_function); end
|
402
|
-
|
403
|
-
# Counts elements of a group. First unnamed parameter is the name of the
|
404
|
-
# output count field (defaults to 'count' if it is not provided).
|
405
|
-
def count(*args)
|
406
|
-
options = args.extract_options!
|
407
|
-
name = args[0] || 'count'
|
408
|
-
every(last_grouping_fields, :aggregator => count_function(name, options), :output => all_fields)
|
409
|
-
end
|
410
|
-
|
411
|
-
# Fields to be summed may either be provided as an array, in which case
|
412
|
-
# they will be aggregated into the same field in the given order, or as a
|
413
|
-
# hash, in which case they will be aggregated from the field named by the
|
414
|
-
# key into the field named by the value after being sorted.
|
415
|
-
def sum(*args)
|
416
|
-
options = args.extract_options!
|
417
|
-
type = JAVA_TYPE_MAP[options[:type]]
|
418
|
-
raise "No type specified for sum" unless type
|
419
|
-
|
420
|
-
mapping = options[:mapping] ? options[:mapping].sort : args.zip(args)
|
421
|
-
mapping.each do |in_field, out_field|
|
422
|
-
every(in_field, :aggregator => sum_function(out_field, :type => type), :output => all_fields)
|
423
|
-
end
|
424
|
-
end
|
425
|
-
|
426
378
|
# Builds a _parse_ pipe. This pipe will parse the fields specified in input (first unnamed arguments),
|
427
379
|
# using a specified regex pattern.
|
428
380
|
#
|
@@ -437,7 +389,7 @@ module Cascading
|
|
437
389
|
fields = args || all_fields
|
438
390
|
pattern = options[:pattern]
|
439
391
|
output = options[:output] || all_fields
|
440
|
-
each(fields, :
|
392
|
+
each(fields, :function => regex_parser(pattern, options), :output => output)
|
441
393
|
end
|
442
394
|
|
443
395
|
# Builds a pipe that splits a field into other fields, using a specified regular expression.
|
@@ -692,14 +644,6 @@ module Cascading
|
|
692
644
|
pass
|
693
645
|
end
|
694
646
|
|
695
|
-
# Builds a pipe that will unify (merge) pipes. The method accepts the list of pipes as argument.
|
696
|
-
# Tuples unified must share the same fields.
|
697
|
-
def union(*args)
|
698
|
-
options = args.extract_options!
|
699
|
-
pipes = args
|
700
|
-
union_pipes pipes
|
701
|
-
end
|
702
|
-
|
703
647
|
def join_fields(*args)
|
704
648
|
options = args.extract_options!
|
705
649
|
output = options[:output] || all_fields
|