cascading.jruby 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/HACKING.md +1 -1
- data/History.txt +6 -0
- data/README.md +1 -1
- data/ivy.xml +3 -3
- data/lib/cascading/aggregations.rb +14 -13
- data/lib/cascading/base.rb +27 -9
- data/lib/cascading/cascade.rb +12 -3
- data/lib/cascading/cascading.rb +35 -44
- data/lib/cascading/cascading_exception.rb +5 -5
- data/lib/cascading/flow.rb +23 -32
- data/lib/cascading/mode.rb +78 -0
- data/lib/cascading/operations.rb +10 -4
- data/lib/cascading/scope.rb +8 -2
- data/lib/cascading/sub_assembly.rb +6 -6
- data/lib/cascading/tap.rb +81 -0
- data/lib/cascading.rb +3 -1
- data/samples/branch.rb +2 -2
- data/samples/copy.rb +2 -2
- data/samples/group_by.rb +5 -5
- data/samples/join.rb +2 -2
- data/samples/logwordcount.rb +3 -4
- data/samples/project.rb +2 -2
- data/samples/rename.rb +2 -2
- data/samples/scorenames.rb +2 -2
- data/samples/splitter.rb +2 -2
- data/samples/sub_assembly.rb +2 -2
- data/samples/union.rb +2 -2
- data/spec/cascading_spec.rb +17 -17
- data/spec/spec_util.rb +9 -9
- data/tags +92 -41
- data/tasks/ant.rake +6 -1
- data/test/test_assembly.rb +14 -7
- data/test/test_cascade.rb +55 -0
- data/test/test_cascading.rb +12 -15
- data/test/test_flow.rb +53 -36
- data/test/test_local_execution.rb +7 -7
- data/test/test_operations.rb +61 -0
- metadata +4 -2
data/HACKING.md
CHANGED
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
0.0.9 - Cascading local mode and upgrade to Cascading 2.0.0
|
2
|
+
|
3
|
+
This release upgrades to Cascading 2.0.0 (final) and introduces Cascading local
|
4
|
+
mode. Ambiguous node names are now prohibited and the insert helper correctly
|
5
|
+
supports constants.
|
6
|
+
|
1
7
|
0.0.8 - AggregateBy and upgrade to Cascading 2.0.0 wip-286
|
2
8
|
|
3
9
|
This release upgrades to Cascading 2.0.0 wip-286, but again does not implement
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
`cascading.jruby` is a small DSL above [Cascading](http://www.cascading.org/).
|
4
4
|
|
5
|
-
It requires Hadoop (>= 0.20.2) and [Cascading 2.0.0
|
5
|
+
It requires Hadoop (>= 0.20.2) and [Cascading 2.0.0](http://files.cascading.org/cascading/2.0/cascading-2.0.0.tgz) to be set via the environment variables: `HADOOP_HOME` and `CASCADING_HOME`
|
6
6
|
|
7
7
|
It has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, and 1.6.5.
|
8
8
|
|
data/ivy.xml
CHANGED
@@ -17,9 +17,9 @@
|
|
17
17
|
</configurations>
|
18
18
|
|
19
19
|
<dependencies>
|
20
|
-
<dependency org="cascading" name="cascading-core" rev="2.0.0
|
21
|
-
<dependency org="cascading" name="cascading-local" rev="2.0.0
|
22
|
-
<dependency org="cascading" name="cascading-hadoop" rev="2.0.0
|
20
|
+
<dependency org="cascading" name="cascading-core" rev="2.0.0" conf="default" />
|
21
|
+
<dependency org="cascading" name="cascading-local" rev="2.0.0" conf="default" />
|
22
|
+
<dependency org="cascading" name="cascading-hadoop" rev="2.0.0" conf="default" />
|
23
23
|
<dependency org="org.jruby" name="jruby" rev="1.6.5" conf="default" />
|
24
24
|
</dependencies>
|
25
25
|
</ivy-module>
|
@@ -4,19 +4,20 @@ require 'cascading/ext/array'
|
|
4
4
|
|
5
5
|
module Cascading
|
6
6
|
# Rules enforced by Aggregations:
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
7
|
+
# * Contains either 1 Buffer or >= 1 Aggregator (explicitly checked)
|
8
|
+
# * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not
|
9
|
+
# exist on Aggregations)
|
10
|
+
# * No Eaches (Aggregations#each does not exist)
|
11
|
+
# * Aggregations may not branch (Aggregations#branch does not exist)
|
11
12
|
#
|
12
13
|
# Externally enforced rules:
|
13
|
-
#
|
14
|
-
#
|
14
|
+
# * May be empty (in which case, Aggregations is not instantiated)
|
15
|
+
# * Must follow a GroupBy or CoGroup (not a Join or Merge)
|
15
16
|
#
|
16
17
|
# Optimizations:
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
18
|
+
# * If the leading Group is a GroupBy and all subsequent Everies are
|
19
|
+
# Aggregators that have a corresponding AggregateBy, Aggregations can replace
|
20
|
+
# the GroupBy/Aggregator pipe with a single composite AggregateBy
|
20
21
|
class Aggregations
|
21
22
|
include Operations
|
22
23
|
|
@@ -110,10 +111,10 @@ module Cascading
|
|
110
111
|
# insert) and an options hash.
|
111
112
|
#
|
112
113
|
# Options include:
|
113
|
-
#
|
114
|
-
#
|
115
|
-
#
|
116
|
-
#
|
114
|
+
# * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
|
115
|
+
# (for first and last) of values for the aggregator to ignore
|
116
|
+
# * <tt>function</tt> is a symbol that is the method to call to construct
|
117
|
+
# the Cascading Aggregator.
|
117
118
|
def composite_aggregator(args, function)
|
118
119
|
field_map, options = extract_field_map(args)
|
119
120
|
|
data/lib/cascading/base.rb
CHANGED
@@ -14,9 +14,12 @@ module Cascading
|
|
14
14
|
@last_child = nil
|
15
15
|
end
|
16
16
|
|
17
|
+
# Children must be uniquely named within the scope of each Node. This
|
18
|
+
# ensures, for example, two assemblies are not created within the same flow
|
19
|
+
# with the same name, causing joins, unions, and sinks on them to be
|
20
|
+
# ambiguous.
|
17
21
|
def add_child(node)
|
18
|
-
|
19
|
-
warn "WARNING: adding '#{node.qualified_name}', but node named '#{node.name}' already exists at '#{child.qualified_name}'" if child
|
22
|
+
raise AmbiguousNodeNameException.new("Attempted to add '#{node.qualified_name}', but node named '#{node.name}' already exists") if @children[node.name]
|
20
23
|
|
21
24
|
@children[node.name] = node
|
22
25
|
@child_names << node.name
|
@@ -33,21 +36,36 @@ module Cascading
|
|
33
36
|
end
|
34
37
|
alias desc describe
|
35
38
|
|
39
|
+
# In order to find a child, we require it to be uniquely named within this
|
40
|
+
# Node and its children. This ensures, for example, branches in peer
|
41
|
+
# assemblies or branches and assemblies do not conflict in joins, unions,
|
42
|
+
# and sinks.
|
36
43
|
def find_child(name)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
nil
|
44
|
+
all_children_with_name = find_all_children_with_name(name)
|
45
|
+
qualified_names = all_children_with_name.map{ |child| child.qualified_name }
|
46
|
+
raise AmbiguousNodeNameException.new("Ambiguous lookup of child by name '#{name}'; found '#{qualified_names.join("', '")}'") if all_children_with_name.size > 1
|
47
|
+
|
48
|
+
all_children_with_name.first
|
43
49
|
end
|
44
50
|
|
45
51
|
def root
|
46
52
|
return self unless parent
|
47
53
|
parent.root
|
48
54
|
end
|
55
|
+
|
56
|
+
protected
|
57
|
+
|
58
|
+
def find_all_children_with_name(name)
|
59
|
+
child_names.map do |child_name|
|
60
|
+
children[child_name] if child_name == name
|
61
|
+
end.compact + child_names.map do |child_name|
|
62
|
+
children[child_name].find_all_children_with_name(name)
|
63
|
+
end.flatten
|
64
|
+
end
|
49
65
|
end
|
50
66
|
|
67
|
+
class AmbiguousNodeNameException < StandardError; end
|
68
|
+
|
51
69
|
# A module to add auto-registration capability
|
52
70
|
module Registerable
|
53
71
|
def all
|
@@ -69,7 +87,7 @@ module Cascading
|
|
69
87
|
|
70
88
|
def add(name, instance)
|
71
89
|
@registered ||= {}
|
72
|
-
warn "WARNING:
|
90
|
+
warn "WARNING: Node named '#{name}' already registered in #{self}" if @registered[name]
|
73
91
|
@registered[name] = instance
|
74
92
|
end
|
75
93
|
|
data/lib/cascading/cascade.rb
CHANGED
@@ -9,14 +9,23 @@ module Cascading
|
|
9
9
|
class Cascade < Cascading::Node
|
10
10
|
extend Registerable
|
11
11
|
|
12
|
-
|
12
|
+
attr_reader :mode
|
13
|
+
|
14
|
+
# Builds a cascade given the specified name. Optionally accepts a :mode
|
15
|
+
# which will be used as the default mode for all child flows. See
|
16
|
+
# Cascading::Mode.parse for details.
|
17
|
+
def initialize(name, params = {})
|
18
|
+
@mode = params[:mode]
|
13
19
|
super(name, nil) # A Cascade cannot have a parent
|
14
20
|
self.class.add(name, self)
|
15
21
|
end
|
16
22
|
|
17
|
-
|
23
|
+
# Builds a child flow given a name and block. Optionally accepts a :mode,
|
24
|
+
# which will override the default mode stored in this cascade.
|
25
|
+
def flow(name, params = {}, &block)
|
18
26
|
raise "Could not build flow '#{name}'; block required" unless block_given?
|
19
|
-
|
27
|
+
params[:mode] ||= mode
|
28
|
+
flow = Flow.new(name, self, params)
|
20
29
|
add_child(flow)
|
21
30
|
flow.instance_eval(&block)
|
22
31
|
flow
|
data/lib/cascading/cascading.rb
CHANGED
@@ -12,17 +12,21 @@ module Cascading
|
|
12
12
|
:float => java.lang.Float.java_class, :string => java.lang.String.java_class,
|
13
13
|
}
|
14
14
|
|
15
|
-
|
15
|
+
# Builds a top-level cascade given a name and a block. Optionally accepts a
|
16
|
+
# :mode, as explained in Cascading::Cascade#initialize.
|
17
|
+
def cascade(name, params = {}, &block)
|
16
18
|
raise "Could not build cascade '#{name}'; block required" unless block_given?
|
17
|
-
cascade = Cascade.new(name)
|
19
|
+
cascade = Cascade.new(name, params)
|
18
20
|
cascade.instance_eval(&block)
|
19
21
|
cascade
|
20
22
|
end
|
21
23
|
|
22
|
-
#
|
23
|
-
|
24
|
+
# Builds a top-level flow given a name and block for applications built of
|
25
|
+
# flows with no cascades. Optionally accepts a :mode, as explained in
|
26
|
+
# Cascading::Flow#initialize.
|
27
|
+
def flow(name, params = {}, &block)
|
24
28
|
raise "Could not build flow '#{name}'; block required" unless block_given?
|
25
|
-
flow = Flow.new(name, nil)
|
29
|
+
flow = Flow.new(name, nil, params)
|
26
30
|
flow.instance_eval(&block)
|
27
31
|
flow
|
28
32
|
end
|
@@ -91,7 +95,9 @@ module Cascading
|
|
91
95
|
Java::CascadingTuple::Fields::RESULTS
|
92
96
|
end
|
93
97
|
|
94
|
-
# Creates a
|
98
|
+
# Creates a TextLine scheme (can be used in both Cascading local and hadoop
|
99
|
+
# modes). Positional args are used if <tt>:source_fields</tt> is not
|
100
|
+
# provided.
|
95
101
|
#
|
96
102
|
# The named options are:
|
97
103
|
# * <tt>:source_fields</tt> a string or array of strings. Specifies the
|
@@ -100,7 +106,7 @@ module Cascading
|
|
100
106
|
# to be written to a sink with this scheme. Defaults to all_fields.
|
101
107
|
# * <tt>:compression</tt> a symbol, either <tt>:enable</tt> or
|
102
108
|
# <tt>:disable</tt>, that governs the TextLine scheme's compression. Defaults
|
103
|
-
# to the default TextLine compression.
|
109
|
+
# to the default TextLine compression (only applies to c.s.h.TextLine).
|
104
110
|
def text_line_scheme(*args)
|
105
111
|
options = args.extract_options!
|
106
112
|
source_fields = fields(options[:source_fields] || (args.empty? ? ['offset', 'line'] : args))
|
@@ -111,55 +117,40 @@ module Cascading
|
|
111
117
|
else Java::CascadingSchemeHadoop::TextLine::Compress::DEFAULT
|
112
118
|
end
|
113
119
|
|
114
|
-
|
120
|
+
{
|
121
|
+
:local_scheme => Java::CascadingSchemeLocal::TextLine.new(source_fields, sink_fields),
|
122
|
+
:hadoop_scheme => Java::CascadingSchemeHadoop::TextLine.new(source_fields, sink_fields, sink_compression),
|
123
|
+
}
|
115
124
|
end
|
116
125
|
|
117
|
-
# Creates a c.s.h.SequenceFile scheme instance from the specified fields.
|
126
|
+
# Creates a c.s.h.SequenceFile scheme instance from the specified fields. A
|
127
|
+
# local SequenceFile scheme is not provided by Cascading, so this scheme
|
128
|
+
# cannot be used in Cascading local mode.
|
118
129
|
def sequence_file_scheme(*fields)
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
return Java::CascadingSchemeHadoop::SequenceFile.new(all_fields)
|
124
|
-
end
|
130
|
+
{
|
131
|
+
:local_scheme => nil,
|
132
|
+
:hadoop_scheme => Java::CascadingSchemeHadoop::SequenceFile.new(fields.empty? ? all_fields : fields(fields)),
|
133
|
+
}
|
125
134
|
end
|
126
135
|
|
127
136
|
def multi_source_tap(*taps)
|
128
|
-
|
137
|
+
MultiTap.multi_source_tap(taps)
|
129
138
|
end
|
130
139
|
|
131
140
|
def multi_sink_tap(*taps)
|
132
|
-
|
133
|
-
end
|
134
|
-
|
135
|
-
#
|
136
|
-
|
137
|
-
|
138
|
-
opts = args.extract_options!
|
139
|
-
path = args.empty? ? opts[:path] : args[0]
|
140
|
-
scheme = opts[:scheme] || text_line_scheme
|
141
|
-
sink_mode = opts[:sink_mode] || :keep
|
142
|
-
sink_mode = case sink_mode
|
143
|
-
when :keep, 'keep' then Java::CascadingTap::SinkMode::KEEP
|
144
|
-
when :replace, 'replace' then Java::CascadingTap::SinkMode::REPLACE
|
145
|
-
when :append, 'append' then Java::CascadingTap::SinkMode::APPEND
|
146
|
-
else raise "Unrecognized sink mode '#{sink_mode}'"
|
147
|
-
end
|
148
|
-
fs = opts[:kind] || :hfs
|
149
|
-
klass = case fs
|
150
|
-
when :hfs, 'hfs' then Java::CascadingTapHadoop::Hfs
|
151
|
-
when :dfs, 'dfs' then Java::CascadingTapHadoop::Dfs
|
152
|
-
when :lfs, 'lfs' then Java::CascadingTapHadoop::Lfs
|
153
|
-
else raise "Unrecognized kind of tap '#{fs}'"
|
154
|
-
end
|
155
|
-
parameters = [scheme, path, sink_mode]
|
156
|
-
klass.new(*parameters)
|
141
|
+
MultiTap.multi_sink_tap(taps)
|
142
|
+
end
|
143
|
+
|
144
|
+
# Creates a Cascading::Tap given a path and optional :scheme and :sink_mode.
|
145
|
+
def tap(path, params = {})
|
146
|
+
Tap.new(path, params)
|
157
147
|
end
|
158
148
|
|
159
149
|
# Constructs properties to be passed to Flow#complete or Cascade#complete
|
160
|
-
# which will locate temporary Hadoop files in base_dir. It is necessary
|
161
|
-
#
|
162
|
-
# method, which confuses Cascading's attempt to find the
|
150
|
+
# which will locate temporary Hadoop files in base_dir. It is necessary to
|
151
|
+
# pass these properties only when executing scripts in Hadoop local mode via
|
152
|
+
# JRuby's main method, which confuses Cascading's attempt to find the
|
153
|
+
# containing jar. When using Cascading local mode, these are unnecessary.
|
163
154
|
def local_properties(base_dir)
|
164
155
|
dirs = {
|
165
156
|
'test.build.data' => "#{base_dir}/build",
|
@@ -1,9 +1,9 @@
|
|
1
|
-
# NativeException wrapper that prints the full nested stack trace of the Java
|
2
|
-
# exception and all of its causes wrapped by the NativeException.
|
3
|
-
# NativeException by default reveals only the first cause, which is
|
4
|
-
# insufficient for tracing cascading.jruby errors into JRuby code or revealing
|
5
|
-
# underlying Janino expression problems.
|
6
1
|
module Cascading
|
2
|
+
# NativeException wrapper that prints the full nested stack trace of the Java
|
3
|
+
# exception and all of its causes wrapped by the NativeException.
|
4
|
+
# NativeException by default reveals only the first cause, which is
|
5
|
+
# insufficient for tracing cascading.jruby errors into JRuby code or
|
6
|
+
# revealing underlying Janino expression problems.
|
7
7
|
class CascadingException < StandardError
|
8
8
|
attr_accessor :ne, :depth
|
9
9
|
|
data/lib/cascading/flow.rb
CHANGED
@@ -10,9 +10,15 @@ module Cascading
|
|
10
10
|
extend Registerable
|
11
11
|
|
12
12
|
attr_accessor :properties, :sources, :sinks, :incoming_scopes, :outgoing_scopes, :listeners
|
13
|
+
attr_reader :mode
|
13
14
|
|
14
|
-
|
15
|
+
# Builds a flow given a name and a parent node (a cascade or nil).
|
16
|
+
# Optionally accepts a :mode which will determine the execution mode of
|
17
|
+
# this flow. See Cascading::Mode.parse for details.
|
18
|
+
def initialize(name, parent, params = {})
|
15
19
|
@properties, @sources, @sinks, @incoming_scopes, @outgoing_scopes, @listeners = {}, {}, {}, {}, {}, []
|
20
|
+
@mode = Mode.parse(params[:mode])
|
21
|
+
@flow_scope = Scope.flow_scope(name)
|
16
22
|
super(name, parent)
|
17
23
|
self.class.add(name, self)
|
18
24
|
end
|
@@ -25,30 +31,18 @@ module Cascading
|
|
25
31
|
assembly
|
26
32
|
end
|
27
33
|
|
28
|
-
# Create a new
|
29
|
-
#
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
elsif (args.size == 1)
|
35
|
-
sinks[name] = args[0]
|
36
|
-
end
|
34
|
+
# Create a new source for this flow, using the specified name and
|
35
|
+
# Cascading::Tap
|
36
|
+
def source(name, tap)
|
37
|
+
sources[name] = tap
|
38
|
+
incoming_scopes[name] = Scope.source_scope(name, mode.source_tap(name, tap), @flow_scope)
|
39
|
+
outgoing_scopes[name] = incoming_scopes[name]
|
37
40
|
end
|
38
41
|
|
39
|
-
# Create a new
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
if (args.size == 2)
|
44
|
-
sources[args[0]] = args[1]
|
45
|
-
incoming_scopes[args[0]] = Scope.tap_scope(args[1], args[0])
|
46
|
-
outgoing_scopes[args[0]] = incoming_scopes[args[0]]
|
47
|
-
elsif (args.size == 1)
|
48
|
-
sources[name] = args[0]
|
49
|
-
incoming_scopes[name] = Scope.empty_scope(name)
|
50
|
-
outgoing_scopes[name] = incoming_scopes[name]
|
51
|
-
end
|
42
|
+
# Create a new sink for this flow, using the specified name and
|
43
|
+
# Cascading::Tap
|
44
|
+
def sink(name, tap)
|
45
|
+
sinks[name] = tap
|
52
46
|
end
|
53
47
|
|
54
48
|
def describe(offset = '')
|
@@ -149,12 +143,10 @@ module Cascading
|
|
149
143
|
Java::CascadingProperty::AppProps.setApplicationName(properties, name)
|
150
144
|
Java::CascadingProperty::AppProps.setApplicationVersion(properties, '0.0.0')
|
151
145
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
make_pipes
|
157
|
-
)
|
146
|
+
sources = make_tap_parameter(@sources, :head_pipe)
|
147
|
+
sinks = make_tap_parameter(@sinks, :tail_pipe)
|
148
|
+
pipes = make_pipes
|
149
|
+
mode.connect_flow(properties, name, sources, sinks, pipes)
|
158
150
|
end
|
159
151
|
|
160
152
|
def complete(properties = nil)
|
@@ -169,12 +161,11 @@ module Cascading
|
|
169
161
|
|
170
162
|
private
|
171
163
|
|
172
|
-
def make_tap_parameter(taps)
|
164
|
+
def make_tap_parameter(taps, pipe_accessor)
|
173
165
|
taps.inject({}) do |map, (name, tap)|
|
174
166
|
assembly = find_child(name)
|
175
167
|
raise "Could not find assembly '#{name}' to connect to tap: #{tap}" unless assembly
|
176
|
-
|
177
|
-
map[assembly.tail_pipe.name] = tap
|
168
|
+
map[assembly.send(pipe_accessor).name] = tap
|
178
169
|
map
|
179
170
|
end
|
180
171
|
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module Cascading
|
2
|
+
# A Cascading::Mode encapsulates the idea of the execution mode for your
|
3
|
+
# flows. The default is Hadoop mode, but you can request that your code run
|
4
|
+
# in Cascading local mode. If you subsequently use a tap or a scheme that
|
5
|
+
# has no local implementation, the mode will be converted back to Hadoop
|
6
|
+
# mode.
|
7
|
+
class Mode
|
8
|
+
attr_reader :local
|
9
|
+
|
10
|
+
# Hadoop mode is the default. You must explicitly request Cascading local
|
11
|
+
# mode with values 'local' or :local.
|
12
|
+
def self.parse(mode)
|
13
|
+
case mode
|
14
|
+
when 'local', :local then Mode.new(true)
|
15
|
+
else Mode.new(false)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def initialize(local)
|
20
|
+
@local = local
|
21
|
+
end
|
22
|
+
|
23
|
+
# Attempts to select the appropriate tap given the current mode. If that
|
24
|
+
# tap does not exist, it fails over to the other tap with a warning.
|
25
|
+
def source_tap(name, tap)
|
26
|
+
warn "WARNING: No local tap for source '#{name}' in tap #{tap}" if local && !tap.local?
|
27
|
+
warn "WARNING: No Hadoop tap for source '#{name}' in tap #{tap}" if !local && !tap.hadoop?
|
28
|
+
|
29
|
+
if local
|
30
|
+
tap.local_tap || tap.hadoop_tap
|
31
|
+
else
|
32
|
+
tap.hadoop_tap || tap.local_tap
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Builds a c.f.Flow given properties, name, sources, sinks, and pipes from
|
37
|
+
# a Cascading::Flow. The current mode is adjusted based on the taps and
|
38
|
+
# schemes of the sources and sinks, then the correct taps are selected
|
39
|
+
# before building the flow.
|
40
|
+
def connect_flow(properties, name, sources, sinks, pipes)
|
41
|
+
update_local_mode(sources, sinks)
|
42
|
+
sources = select_taps(sources)
|
43
|
+
sinks = select_taps(sinks)
|
44
|
+
flow_connector_class.new(properties).connect(name, sources, sinks, pipes)
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
# Updates this mode based upon your sources and sinks. It's possible that
|
50
|
+
# you asked for Cascading local mode, but that request cannot be fulfilled
|
51
|
+
# because you used taps or schemes which have no local implementation.
|
52
|
+
def update_local_mode(sources, sinks)
|
53
|
+
local_supported = sources.all?{ |name, tap| tap.local? } && sinks.all?{ |name, tap| tap.local? }
|
54
|
+
|
55
|
+
if local && !local_supported
|
56
|
+
non_local_sources = sources.reject{ |name, tap| tap.local? }
|
57
|
+
non_local_sinks = sinks.reject{ |name, tap| tap.local? }
|
58
|
+
warn "WARNING: Cascading local mode requested but these sources: #{non_local_sources.inspect} and these sinks: #{non_local_sinks.inspect} do not support it"
|
59
|
+
@local = false
|
60
|
+
end
|
61
|
+
|
62
|
+
local
|
63
|
+
end
|
64
|
+
|
65
|
+
# Given a tap map, extracts the correct taps for the current mode
|
66
|
+
def select_taps(tap_map)
|
67
|
+
tap_map.inject({}) do |map, (name, tap)|
|
68
|
+
map[name] = tap.send(local ? :local_tap : :hadoop_tap)
|
69
|
+
map
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Chooses the correct FlowConnector class for the current mode
|
74
|
+
def flow_connector_class
|
75
|
+
local ? Java::CascadingFlowLocal::LocalFlowConnector : Java::CascadingFlowHadoop::HadoopFlowConnector
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
data/lib/cascading/operations.rb
CHANGED
@@ -107,15 +107,21 @@ module Cascading
|
|
107
107
|
|
108
108
|
def to_java_comparable_array(arr)
|
109
109
|
(arr.map do |v|
|
110
|
-
|
110
|
+
coerce_to_java(v)
|
111
|
+
end).to_java(java.lang.Comparable)
|
112
|
+
end
|
113
|
+
|
114
|
+
def coerce_to_java(v)
|
115
|
+
case v
|
111
116
|
when Fixnum
|
112
|
-
java.lang.
|
117
|
+
java.lang.Long.new(v)
|
113
118
|
when Float
|
114
119
|
java.lang.Double.new(v)
|
120
|
+
when NilClass
|
121
|
+
nil
|
115
122
|
else
|
116
123
|
java.lang.String.new(v.to_s)
|
117
|
-
|
118
|
-
end).to_java(java.lang.Comparable)
|
124
|
+
end
|
119
125
|
end
|
120
126
|
|
121
127
|
def expression_filter(*args)
|
data/lib/cascading/scope.rb
CHANGED
@@ -10,12 +10,18 @@ module Cascading
|
|
10
10
|
Scope.new(Java::CascadingFlowPlanner::Scope.new(@scope))
|
11
11
|
end
|
12
12
|
|
13
|
+
def self.flow_scope(name)
|
14
|
+
Java::CascadingFlowPlanner::Scope.new(name)
|
15
|
+
end
|
16
|
+
|
13
17
|
def self.empty_scope(name)
|
14
18
|
Scope.new(Java::CascadingFlowPlanner::Scope.new(name))
|
15
19
|
end
|
16
20
|
|
17
|
-
def self.
|
18
|
-
|
21
|
+
def self.source_scope(name, tap, flow_scope)
|
22
|
+
incoming_scopes = java.util.HashSet.new
|
23
|
+
incoming_scopes.add(flow_scope)
|
24
|
+
java_scope = outgoing_scope_for(tap, incoming_scopes)
|
19
25
|
# Taps and Pipes don't name their outgoing scopes like other FlowElements
|
20
26
|
java_scope.name = name
|
21
27
|
Scope.new(java_scope)
|
@@ -4,12 +4,12 @@ module Cascading
|
|
4
4
|
# Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly.
|
5
5
|
#
|
6
6
|
# Assumptions:
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
7
|
+
# * You will either use the tail_pipe of the calling Assembly, or overwrite
|
8
|
+
# its incoming_scopes (as do join and union)
|
9
|
+
# * Your subassembly will have only 1 tail pipe; branching is not
|
10
|
+
# supported. This allows you to continue operating upon the tail of the
|
11
|
+
# SubAssembly within the calling Assembly
|
12
|
+
# * You will not use nested c.p.SubAssemblies
|
13
13
|
#
|
14
14
|
# This is a low-level tool, so be careful.
|
15
15
|
class SubAssembly
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Cascading
|
2
|
+
# A Cascading::BaseTap wraps up a pair of Cascading taps, one for Cascading
|
3
|
+
# local mode and the other for Hadoop mode.
|
4
|
+
class BaseTap
|
5
|
+
attr_reader :local_tap, :hadoop_tap
|
6
|
+
|
7
|
+
def initialize(local_tap, hadoop_tap)
|
8
|
+
@local_tap = local_tap
|
9
|
+
@hadoop_tap = hadoop_tap
|
10
|
+
end
|
11
|
+
|
12
|
+
def local?
|
13
|
+
!local_tap.nil?
|
14
|
+
end
|
15
|
+
|
16
|
+
def hadoop?
|
17
|
+
!hadoop_tap.nil?
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# A Cascading::Tap represents a non-aggregate tap with a scheme, path, and
|
22
|
+
# optional sink_mode. c.t.l.FileTap is used in Cascading local mode and
|
23
|
+
# c.t.h.Hfs is used in Hadoop mode. Whether or not these can be created is
|
24
|
+
# governed by the :scheme parameter, which must contain at least one of
|
25
|
+
# :local_scheme or :hadoop_scheme. Schemes like TextLine are supported in
|
26
|
+
# both modes (by Cascading), but SequenceFile is only supported in Hadoop
|
27
|
+
# mode.
|
28
|
+
class Tap < BaseTap
|
29
|
+
attr_reader :scheme, :path, :sink_mode
|
30
|
+
|
31
|
+
def initialize(path, params = {})
|
32
|
+
@path = path
|
33
|
+
|
34
|
+
@scheme = params[:scheme] || text_line_scheme
|
35
|
+
raise "Scheme must provide one of :local_scheme or :hadoop_scheme; received: '#{scheme.inspect}'" unless scheme[:local_scheme] || scheme[:hadoop_scheme]
|
36
|
+
|
37
|
+
@sink_mode = case params[:sink_mode] || :keep
|
38
|
+
when :keep, 'keep' then Java::CascadingTap::SinkMode::KEEP
|
39
|
+
when :replace, 'replace' then Java::CascadingTap::SinkMode::REPLACE
|
40
|
+
when :append, 'append' then Java::CascadingTap::SinkMode::APPEND
|
41
|
+
else raise "Unrecognized sink mode '#{params[:sink_mode]}'"
|
42
|
+
end
|
43
|
+
|
44
|
+
local_scheme = scheme[:local_scheme]
|
45
|
+
@local_tap = local_scheme ? Java::CascadingTapLocal::FileTap.new(local_scheme, path, sink_mode) : nil
|
46
|
+
|
47
|
+
hadoop_scheme = scheme[:hadoop_scheme]
|
48
|
+
@hadoop_tap = hadoop_scheme ? Java::CascadingTapHadoop::Hfs.new(hadoop_scheme, path, sink_mode) : nil
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# A Cascading::MultiTap represents one of Cascading's aggregate taps and is
|
53
|
+
# built via static constructors that accept an array of Cascading::Taps. In
|
54
|
+
# order for a mode (Cascading local or Hadoop) to be supported, all provided
|
55
|
+
# taps must support it.
|
56
|
+
class MultiTap < BaseTap
|
57
|
+
def initialize(local_tap, hadoop_tap)
|
58
|
+
super(local_tap, hadoop_tap)
|
59
|
+
end
|
60
|
+
|
61
|
+
def self.multi_source_tap(taps)
|
62
|
+
multi_tap(taps, Java::CascadingTap::MultiSourceTap)
|
63
|
+
end
|
64
|
+
|
65
|
+
def self.multi_sink_tap(taps)
|
66
|
+
multi_tap(taps, Java::CascadingTap::MultiSinkTap)
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def self.multi_tap(taps, klass)
|
72
|
+
local_supported = taps.all?{ |tap| tap.local? }
|
73
|
+
local_tap = local_supported ? klass.new(taps.map{ |tap| tap.local_tap }.to_java('cascading.tap.Tap')) : nil
|
74
|
+
|
75
|
+
hadoop_supported = taps.all?{ |tap| tap.hadoop? }
|
76
|
+
hadoop_tap = hadoop_supported ? klass.new(taps.map{ |tap| tap.hadoop_tap }.to_java('cascading.tap.Tap')) : nil
|
77
|
+
|
78
|
+
MultiTap.new(local_tap, hadoop_tap)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/lib/cascading.rb
CHANGED
@@ -6,7 +6,7 @@ require 'java'
|
|
6
6
|
|
7
7
|
module Cascading
|
8
8
|
# :stopdoc:
|
9
|
-
VERSION = '0.0.8'
|
9
|
+
VERSION = '0.0.9'
|
10
10
|
LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
|
11
11
|
PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
|
12
12
|
CASCADING_HOME = ENV['CASCADING_HOME']
|
@@ -55,8 +55,10 @@ require 'cascading/cascading'
|
|
55
55
|
require 'cascading/cascading_exception'
|
56
56
|
require 'cascading/expr_stub'
|
57
57
|
require 'cascading/flow'
|
58
|
+
require 'cascading/mode'
|
58
59
|
require 'cascading/operations'
|
59
60
|
require 'cascading/scope'
|
61
|
+
require 'cascading/tap'
|
60
62
|
|
61
63
|
# include module to make them available at top package
|
62
64
|
include Cascading
|