cascading.jruby 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HACKING.md +1 -1
- data/History.txt +6 -0
- data/README.md +1 -1
- data/ivy.xml +3 -3
- data/lib/cascading/aggregations.rb +14 -13
- data/lib/cascading/base.rb +27 -9
- data/lib/cascading/cascade.rb +12 -3
- data/lib/cascading/cascading.rb +35 -44
- data/lib/cascading/cascading_exception.rb +5 -5
- data/lib/cascading/flow.rb +23 -32
- data/lib/cascading/mode.rb +78 -0
- data/lib/cascading/operations.rb +10 -4
- data/lib/cascading/scope.rb +8 -2
- data/lib/cascading/sub_assembly.rb +6 -6
- data/lib/cascading/tap.rb +81 -0
- data/lib/cascading.rb +3 -1
- data/samples/branch.rb +2 -2
- data/samples/copy.rb +2 -2
- data/samples/group_by.rb +5 -5
- data/samples/join.rb +2 -2
- data/samples/logwordcount.rb +3 -4
- data/samples/project.rb +2 -2
- data/samples/rename.rb +2 -2
- data/samples/scorenames.rb +2 -2
- data/samples/splitter.rb +2 -2
- data/samples/sub_assembly.rb +2 -2
- data/samples/union.rb +2 -2
- data/spec/cascading_spec.rb +17 -17
- data/spec/spec_util.rb +9 -9
- data/tags +92 -41
- data/tasks/ant.rake +6 -1
- data/test/test_assembly.rb +14 -7
- data/test/test_cascade.rb +55 -0
- data/test/test_cascading.rb +12 -15
- data/test/test_flow.rb +53 -36
- data/test/test_local_execution.rb +7 -7
- data/test/test_operations.rb +61 -0
- metadata +4 -2
data/HACKING.md
CHANGED
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
0.0.9 - Cascading local mode and upgrade to Cascading 2.0.0
|
2
|
+
|
3
|
+
This release upgrades to Cascading 2.0.0 (final) and introduces Cascading local
|
4
|
+
mode. Ambiguous node names are now prohibited and the insert helper correctly
|
5
|
+
supports constants.
|
6
|
+
|
1
7
|
0.0.8 - AggregateBy and upgrade to Cascading 2.0.0 wip-286
|
2
8
|
|
3
9
|
This release upgrades to Cascading 2.0.0 wip-286, but again does not implement
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
`cascading.jruby` is a small DSL above [Cascading](http://www.cascading.org/).
|
4
4
|
|
5
|
-
It requires Hadoop (>= 0.20.2) and [Cascading 2.0.0
|
5
|
+
It requires Hadoop (>= 0.20.2) and [Cascading 2.0.0](http://files.cascading.org/cascading/2.0/cascading-2.0.0.tgz); their locations must be set via the environment variables `HADOOP_HOME` and `CASCADING_HOME`.
|
6
6
|
|
7
7
|
It has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, and 1.6.5.
|
8
8
|
|
data/ivy.xml
CHANGED
@@ -17,9 +17,9 @@
|
|
17
17
|
</configurations>
|
18
18
|
|
19
19
|
<dependencies>
|
20
|
-
<dependency org="cascading" name="cascading-core" rev="2.0.0
|
21
|
-
<dependency org="cascading" name="cascading-local" rev="2.0.0
|
22
|
-
<dependency org="cascading" name="cascading-hadoop" rev="2.0.0
|
20
|
+
<dependency org="cascading" name="cascading-core" rev="2.0.0" conf="default" />
|
21
|
+
<dependency org="cascading" name="cascading-local" rev="2.0.0" conf="default" />
|
22
|
+
<dependency org="cascading" name="cascading-hadoop" rev="2.0.0" conf="default" />
|
23
23
|
<dependency org="org.jruby" name="jruby" rev="1.6.5" conf="default" />
|
24
24
|
</dependencies>
|
25
25
|
</ivy-module>
|
@@ -4,19 +4,20 @@ require 'cascading/ext/array'
|
|
4
4
|
|
5
5
|
module Cascading
|
6
6
|
# Rules enforced by Aggregations:
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
7
|
+
# * Contains either 1 Buffer or >= 1 Aggregator (explicitly checked)
|
8
|
+
# * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not
|
9
|
+
# exist on Aggregations)
|
10
|
+
# * No Eaches (Aggregations#each does not exist)
|
11
|
+
# * Aggregations may not branch (Aggregations#branch does not exist)
|
11
12
|
#
|
12
13
|
# Externally enforced rules:
|
13
|
-
#
|
14
|
-
#
|
14
|
+
# * May be empty (in which case, Aggregations is not instantiated)
|
15
|
+
# * Must follow a GroupBy or CoGroup (not a Join or Merge)
|
15
16
|
#
|
16
17
|
# Optimizations:
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
18
|
+
# * If the leading Group is a GroupBy and all subsequent Everies are
|
19
|
+
# Aggregators that have a corresponding AggregateBy, Aggregations can replace
|
20
|
+
# the GroupBy/Aggregator pipe with a single composite AggregateBy
|
20
21
|
class Aggregations
|
21
22
|
include Operations
|
22
23
|
|
@@ -110,10 +111,10 @@ module Cascading
|
|
110
111
|
# insert) and an options hash.
|
111
112
|
#
|
112
113
|
# Options include:
|
113
|
-
#
|
114
|
-
#
|
115
|
-
#
|
116
|
-
#
|
114
|
+
# * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
|
115
|
+
# (for first and last) of values for the aggregator to ignore
|
116
|
+
# * <tt>function</tt> is a symbol that is the method to call to construct
|
117
|
+
# the Cascading Aggregator.
|
117
118
|
def composite_aggregator(args, function)
|
118
119
|
field_map, options = extract_field_map(args)
|
119
120
|
|
data/lib/cascading/base.rb
CHANGED
@@ -14,9 +14,12 @@ module Cascading
|
|
14
14
|
@last_child = nil
|
15
15
|
end
|
16
16
|
|
17
|
+
# Children must be uniquely named within the scope of each Node. This
|
18
|
+
# ensures, for example, two assemblies are not created within the same flow
|
19
|
+
# with the same name, causing joins, unions, and sinks on them to be
|
20
|
+
# ambiguous.
|
17
21
|
def add_child(node)
|
18
|
-
|
19
|
-
warn "WARNING: adding '#{node.qualified_name}', but node named '#{node.name}' already exists at '#{child.qualified_name}'" if child
|
22
|
+
raise AmbiguousNodeNameException.new("Attempted to add '#{node.qualified_name}', but node named '#{node.name}' already exists") if @children[node.name]
|
20
23
|
|
21
24
|
@children[node.name] = node
|
22
25
|
@child_names << node.name
|
@@ -33,21 +36,36 @@ module Cascading
|
|
33
36
|
end
|
34
37
|
alias desc describe
|
35
38
|
|
39
|
+
# In order to find a child, we require it to be uniquely named within this
|
40
|
+
# Node and its children. This ensures, for example, branches in peer
|
41
|
+
# assemblies or branches and assemblies do not conflict in joins, unions,
|
42
|
+
# and sinks.
|
36
43
|
def find_child(name)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
nil
|
44
|
+
all_children_with_name = find_all_children_with_name(name)
|
45
|
+
qualified_names = all_children_with_name.map{ |child| child.qualified_name }
|
46
|
+
raise AmbiguousNodeNameException.new("Ambiguous lookup of child by name '#{name}'; found '#{qualified_names.join("', '")}'") if all_children_with_name.size > 1
|
47
|
+
|
48
|
+
all_children_with_name.first
|
43
49
|
end
|
44
50
|
|
45
51
|
def root
|
46
52
|
return self unless parent
|
47
53
|
parent.root
|
48
54
|
end
|
55
|
+
|
56
|
+
protected
|
57
|
+
|
58
|
+
def find_all_children_with_name(name)
|
59
|
+
child_names.map do |child_name|
|
60
|
+
children[child_name] if child_name == name
|
61
|
+
end.compact + child_names.map do |child_name|
|
62
|
+
children[child_name].find_all_children_with_name(name)
|
63
|
+
end.flatten
|
64
|
+
end
|
49
65
|
end
|
50
66
|
|
67
|
+
class AmbiguousNodeNameException < StandardError; end
|
68
|
+
|
51
69
|
# A module to add auto-registration capability
|
52
70
|
module Registerable
|
53
71
|
def all
|
@@ -69,7 +87,7 @@ module Cascading
|
|
69
87
|
|
70
88
|
def add(name, instance)
|
71
89
|
@registered ||= {}
|
72
|
-
warn "WARNING:
|
90
|
+
warn "WARNING: Node named '#{name}' already registered in #{self}" if @registered[name]
|
73
91
|
@registered[name] = instance
|
74
92
|
end
|
75
93
|
|
data/lib/cascading/cascade.rb
CHANGED
@@ -9,14 +9,23 @@ module Cascading
|
|
9
9
|
class Cascade < Cascading::Node
|
10
10
|
extend Registerable
|
11
11
|
|
12
|
-
|
12
|
+
attr_reader :mode
|
13
|
+
|
14
|
+
# Builds a cascade given the specified name. Optionally accepts a :mode
|
15
|
+
# which will be used as the default mode for all child flows. See
|
16
|
+
# Cascading::Mode.parse for details.
|
17
|
+
def initialize(name, params = {})
|
18
|
+
@mode = params[:mode]
|
13
19
|
super(name, nil) # A Cascade cannot have a parent
|
14
20
|
self.class.add(name, self)
|
15
21
|
end
|
16
22
|
|
17
|
-
|
23
|
+
# Builds a child flow given a name and block. Optionally accepts a :mode,
|
24
|
+
# which will override the default mode stored in this cascade.
|
25
|
+
def flow(name, params = {}, &block)
|
18
26
|
raise "Could not build flow '#{name}'; block required" unless block_given?
|
19
|
-
|
27
|
+
params[:mode] ||= mode
|
28
|
+
flow = Flow.new(name, self, params)
|
20
29
|
add_child(flow)
|
21
30
|
flow.instance_eval(&block)
|
22
31
|
flow
|
data/lib/cascading/cascading.rb
CHANGED
@@ -12,17 +12,21 @@ module Cascading
|
|
12
12
|
:float => java.lang.Float.java_class, :string => java.lang.String.java_class,
|
13
13
|
}
|
14
14
|
|
15
|
-
|
15
|
+
# Builds a top-level cascade given a name and a block. Optionally accepts a
|
16
|
+
# :mode, as explained in Cascading::Cascade#initialize.
|
17
|
+
def cascade(name, params = {}, &block)
|
16
18
|
raise "Could not build cascade '#{name}'; block required" unless block_given?
|
17
|
-
cascade = Cascade.new(name)
|
19
|
+
cascade = Cascade.new(name, params)
|
18
20
|
cascade.instance_eval(&block)
|
19
21
|
cascade
|
20
22
|
end
|
21
23
|
|
22
|
-
#
|
23
|
-
|
24
|
+
# Builds a top-level flow given a name and block for applications built of
|
25
|
+
# flows with no cascades. Optionally accepts a :mode, as explained in
|
26
|
+
# Cascading::Flow#initialize.
|
27
|
+
def flow(name, params = {}, &block)
|
24
28
|
raise "Could not build flow '#{name}'; block required" unless block_given?
|
25
|
-
flow = Flow.new(name, nil)
|
29
|
+
flow = Flow.new(name, nil, params)
|
26
30
|
flow.instance_eval(&block)
|
27
31
|
flow
|
28
32
|
end
|
@@ -91,7 +95,9 @@ module Cascading
|
|
91
95
|
Java::CascadingTuple::Fields::RESULTS
|
92
96
|
end
|
93
97
|
|
94
|
-
# Creates a
|
98
|
+
# Creates a TextLine scheme (can be used in both Cascading local and hadoop
|
99
|
+
# modes). Positional args are used if <tt>:source_fields</tt> is not
|
100
|
+
# provided.
|
95
101
|
#
|
96
102
|
# The named options are:
|
97
103
|
# * <tt>:source_fields</tt> a string or array of strings. Specifies the
|
@@ -100,7 +106,7 @@ module Cascading
|
|
100
106
|
# to be written to a sink with this scheme. Defaults to all_fields.
|
101
107
|
# * <tt>:compression</tt> a symbol, either <tt>:enable</tt> or
|
102
108
|
# <tt>:disable</tt>, that governs the TextLine scheme's compression. Defaults
|
103
|
-
# to the default TextLine compression.
|
109
|
+
# to the default TextLine compression (only applies to c.s.h.TextLine).
|
104
110
|
def text_line_scheme(*args)
|
105
111
|
options = args.extract_options!
|
106
112
|
source_fields = fields(options[:source_fields] || (args.empty? ? ['offset', 'line'] : args))
|
@@ -111,55 +117,40 @@ module Cascading
|
|
111
117
|
else Java::CascadingSchemeHadoop::TextLine::Compress::DEFAULT
|
112
118
|
end
|
113
119
|
|
114
|
-
|
120
|
+
{
|
121
|
+
:local_scheme => Java::CascadingSchemeLocal::TextLine.new(source_fields, sink_fields),
|
122
|
+
:hadoop_scheme => Java::CascadingSchemeHadoop::TextLine.new(source_fields, sink_fields, sink_compression),
|
123
|
+
}
|
115
124
|
end
|
116
125
|
|
117
|
-
# Creates a c.s.h.SequenceFile scheme instance from the specified fields.
|
126
|
+
# Creates a c.s.h.SequenceFile scheme instance from the specified fields. A
|
127
|
+
# local SequenceFile scheme is not provided by Cascading, so this scheme
|
128
|
+
# cannot be used in Cascading local mode.
|
118
129
|
def sequence_file_scheme(*fields)
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
return Java::CascadingSchemeHadoop::SequenceFile.new(all_fields)
|
124
|
-
end
|
130
|
+
{
|
131
|
+
:local_scheme => nil,
|
132
|
+
:hadoop_scheme => Java::CascadingSchemeHadoop::SequenceFile.new(fields.empty? ? all_fields : fields(fields)),
|
133
|
+
}
|
125
134
|
end
|
126
135
|
|
127
136
|
def multi_source_tap(*taps)
|
128
|
-
|
137
|
+
MultiTap.multi_source_tap(taps)
|
129
138
|
end
|
130
139
|
|
131
140
|
def multi_sink_tap(*taps)
|
132
|
-
|
133
|
-
end
|
134
|
-
|
135
|
-
#
|
136
|
-
|
137
|
-
|
138
|
-
opts = args.extract_options!
|
139
|
-
path = args.empty? ? opts[:path] : args[0]
|
140
|
-
scheme = opts[:scheme] || text_line_scheme
|
141
|
-
sink_mode = opts[:sink_mode] || :keep
|
142
|
-
sink_mode = case sink_mode
|
143
|
-
when :keep, 'keep' then Java::CascadingTap::SinkMode::KEEP
|
144
|
-
when :replace, 'replace' then Java::CascadingTap::SinkMode::REPLACE
|
145
|
-
when :append, 'append' then Java::CascadingTap::SinkMode::APPEND
|
146
|
-
else raise "Unrecognized sink mode '#{sink_mode}'"
|
147
|
-
end
|
148
|
-
fs = opts[:kind] || :hfs
|
149
|
-
klass = case fs
|
150
|
-
when :hfs, 'hfs' then Java::CascadingTapHadoop::Hfs
|
151
|
-
when :dfs, 'dfs' then Java::CascadingTapHadoop::Dfs
|
152
|
-
when :lfs, 'lfs' then Java::CascadingTapHadoop::Lfs
|
153
|
-
else raise "Unrecognized kind of tap '#{fs}'"
|
154
|
-
end
|
155
|
-
parameters = [scheme, path, sink_mode]
|
156
|
-
klass.new(*parameters)
|
141
|
+
MultiTap.multi_sink_tap(taps)
|
142
|
+
end
|
143
|
+
|
144
|
+
# Creates a Cascading::Tap given a path and optional :scheme and :sink_mode.
|
145
|
+
def tap(path, params = {})
|
146
|
+
Tap.new(path, params)
|
157
147
|
end
|
158
148
|
|
159
149
|
# Constructs properties to be passed to Flow#complete or Cascade#complete
|
160
|
-
# which will locate temporary Hadoop files in base_dir. It is necessary
|
161
|
-
#
|
162
|
-
# method, which confuses Cascading's attempt to find the
|
150
|
+
# which will locate temporary Hadoop files in base_dir. It is necessary to
|
151
|
+
# pass these properties only when executing scripts in Hadoop local mode via
|
152
|
+
# JRuby's main method, which confuses Cascading's attempt to find the
|
153
|
+
# containing jar. When using Cascading local mode, these are unnecessary.
|
163
154
|
def local_properties(base_dir)
|
164
155
|
dirs = {
|
165
156
|
'test.build.data' => "#{base_dir}/build",
|
@@ -1,9 +1,9 @@
|
|
1
|
-
# NativeException wrapper that prints the full nested stack trace of the Java
|
2
|
-
# exception and all of its causes wrapped by the NativeException.
|
3
|
-
# NativeException by default reveals only the first cause, which is
|
4
|
-
# insufficient for tracing cascading.jruby errors into JRuby code or revealing
|
5
|
-
# underlying Janino expression problems.
|
6
1
|
module Cascading
|
2
|
+
# NativeException wrapper that prints the full nested stack trace of the Java
|
3
|
+
# exception and all of its causes wrapped by the NativeException.
|
4
|
+
# NativeException by default reveals only the first cause, which is
|
5
|
+
# insufficient for tracing cascading.jruby errors into JRuby code or
|
6
|
+
# revealing underlying Janino expression problems.
|
7
7
|
class CascadingException < StandardError
|
8
8
|
attr_accessor :ne, :depth
|
9
9
|
|
data/lib/cascading/flow.rb
CHANGED
@@ -10,9 +10,15 @@ module Cascading
|
|
10
10
|
extend Registerable
|
11
11
|
|
12
12
|
attr_accessor :properties, :sources, :sinks, :incoming_scopes, :outgoing_scopes, :listeners
|
13
|
+
attr_reader :mode
|
13
14
|
|
14
|
-
|
15
|
+
# Builds a flow given a name and a parent node (a cascade or nil).
|
16
|
+
# Optionally accepts a :mode which will determine the execution mode of
|
17
|
+
# this flow. See Cascading::Mode.parse for details.
|
18
|
+
def initialize(name, parent, params = {})
|
15
19
|
@properties, @sources, @sinks, @incoming_scopes, @outgoing_scopes, @listeners = {}, {}, {}, {}, {}, []
|
20
|
+
@mode = Mode.parse(params[:mode])
|
21
|
+
@flow_scope = Scope.flow_scope(name)
|
16
22
|
super(name, parent)
|
17
23
|
self.class.add(name, self)
|
18
24
|
end
|
@@ -25,30 +31,18 @@ module Cascading
|
|
25
31
|
assembly
|
26
32
|
end
|
27
33
|
|
28
|
-
# Create a new
|
29
|
-
#
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
elsif (args.size == 1)
|
35
|
-
sinks[name] = args[0]
|
36
|
-
end
|
34
|
+
# Create a new source for this flow, using the specified name and
|
35
|
+
# Cascading::Tap
|
36
|
+
def source(name, tap)
|
37
|
+
sources[name] = tap
|
38
|
+
incoming_scopes[name] = Scope.source_scope(name, mode.source_tap(name, tap), @flow_scope)
|
39
|
+
outgoing_scopes[name] = incoming_scopes[name]
|
37
40
|
end
|
38
41
|
|
39
|
-
# Create a new
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
if (args.size == 2)
|
44
|
-
sources[args[0]] = args[1]
|
45
|
-
incoming_scopes[args[0]] = Scope.tap_scope(args[1], args[0])
|
46
|
-
outgoing_scopes[args[0]] = incoming_scopes[args[0]]
|
47
|
-
elsif (args.size == 1)
|
48
|
-
sources[name] = args[0]
|
49
|
-
incoming_scopes[name] = Scope.empty_scope(name)
|
50
|
-
outgoing_scopes[name] = incoming_scopes[name]
|
51
|
-
end
|
42
|
+
# Create a new sink for this flow, using the specified name and
|
43
|
+
# Cascading::Tap
|
44
|
+
def sink(name, tap)
|
45
|
+
sinks[name] = tap
|
52
46
|
end
|
53
47
|
|
54
48
|
def describe(offset = '')
|
@@ -149,12 +143,10 @@ module Cascading
|
|
149
143
|
Java::CascadingProperty::AppProps.setApplicationName(properties, name)
|
150
144
|
Java::CascadingProperty::AppProps.setApplicationVersion(properties, '0.0.0')
|
151
145
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
make_pipes
|
157
|
-
)
|
146
|
+
sources = make_tap_parameter(@sources, :head_pipe)
|
147
|
+
sinks = make_tap_parameter(@sinks, :tail_pipe)
|
148
|
+
pipes = make_pipes
|
149
|
+
mode.connect_flow(properties, name, sources, sinks, pipes)
|
158
150
|
end
|
159
151
|
|
160
152
|
def complete(properties = nil)
|
@@ -169,12 +161,11 @@ module Cascading
|
|
169
161
|
|
170
162
|
private
|
171
163
|
|
172
|
-
def make_tap_parameter(taps)
|
164
|
+
def make_tap_parameter(taps, pipe_accessor)
|
173
165
|
taps.inject({}) do |map, (name, tap)|
|
174
166
|
assembly = find_child(name)
|
175
167
|
raise "Could not find assembly '#{name}' to connect to tap: #{tap}" unless assembly
|
176
|
-
|
177
|
-
map[assembly.tail_pipe.name] = tap
|
168
|
+
map[assembly.send(pipe_accessor).name] = tap
|
178
169
|
map
|
179
170
|
end
|
180
171
|
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module Cascading
|
2
|
+
# A Cascading::Mode encapsulates the idea of the execution mode for your
|
3
|
+
# flows. The default is Hadoop mode, but you can request that your code run
|
4
|
+
# in Cascading local mode. If you subsequently use a tap or a scheme that
|
5
|
+
# has no local implementation, the mode will be converted back to Hadoop
|
6
|
+
# mode.
|
7
|
+
class Mode
|
8
|
+
attr_reader :local
|
9
|
+
|
10
|
+
# Hadoop mode is the default. You must explicitly request Cascading local
|
11
|
+
# mode with values 'local' or :local.
|
12
|
+
def self.parse(mode)
|
13
|
+
case mode
|
14
|
+
when 'local', :local then Mode.new(true)
|
15
|
+
else Mode.new(false)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def initialize(local)
|
20
|
+
@local = local
|
21
|
+
end
|
22
|
+
|
23
|
+
# Attempts to select the appropriate tap given the current mode. If that
|
24
|
+
# tap does not exist, it fails over to the other tap with a warning.
|
25
|
+
def source_tap(name, tap)
|
26
|
+
warn "WARNING: No local tap for source '#{name}' in tap #{tap}" if local && !tap.local?
|
27
|
+
warn "WARNING: No Hadoop tap for source '#{name}' in tap #{tap}" if !local && !tap.hadoop?
|
28
|
+
|
29
|
+
if local
|
30
|
+
tap.local_tap || tap.hadoop_tap
|
31
|
+
else
|
32
|
+
tap.hadoop_tap || tap.local_tap
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Builds a c.f.Flow given properties, name, sources, sinks, and pipes from
|
37
|
+
# a Cascading::Flow. The current mode is adjusted based on the taps and
|
38
|
+
# schemes of the sources and sinks, then the correct taps are selected
|
39
|
+
# before building the flow.
|
40
|
+
def connect_flow(properties, name, sources, sinks, pipes)
|
41
|
+
update_local_mode(sources, sinks)
|
42
|
+
sources = select_taps(sources)
|
43
|
+
sinks = select_taps(sinks)
|
44
|
+
flow_connector_class.new(properties).connect(name, sources, sinks, pipes)
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
# Updates this mode based upon your sources and sinks. It's possible that
|
50
|
+
# you asked for Cascading local mode, but that request cannot be fulfilled
|
51
|
+
# because you used taps or schemes which have no local implementation.
|
52
|
+
def update_local_mode(sources, sinks)
|
53
|
+
local_supported = sources.all?{ |name, tap| tap.local? } && sinks.all?{ |name, tap| tap.local? }
|
54
|
+
|
55
|
+
if local && !local_supported
|
56
|
+
non_local_sources = sources.reject{ |name, tap| tap.local? }
|
57
|
+
non_local_sinks = sinks.reject{ |name, tap| tap.local? }
|
58
|
+
warn "WARNING: Cascading local mode requested but these sources: #{non_local_sources.inspect} and these sinks: #{non_local_sinks.inspect} do not support it"
|
59
|
+
@local = false
|
60
|
+
end
|
61
|
+
|
62
|
+
local
|
63
|
+
end
|
64
|
+
|
65
|
+
# Given a tap map, extracts the correct taps for the current mode
|
66
|
+
def select_taps(tap_map)
|
67
|
+
tap_map.inject({}) do |map, (name, tap)|
|
68
|
+
map[name] = tap.send(local ? :local_tap : :hadoop_tap)
|
69
|
+
map
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Chooses the correct FlowConnector class for the current mode
|
74
|
+
def flow_connector_class
|
75
|
+
local ? Java::CascadingFlowLocal::LocalFlowConnector : Java::CascadingFlowHadoop::HadoopFlowConnector
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
data/lib/cascading/operations.rb
CHANGED
@@ -107,15 +107,21 @@ module Cascading
|
|
107
107
|
|
108
108
|
def to_java_comparable_array(arr)
|
109
109
|
(arr.map do |v|
|
110
|
-
|
110
|
+
coerce_to_java(v)
|
111
|
+
end).to_java(java.lang.Comparable)
|
112
|
+
end
|
113
|
+
|
114
|
+
def coerce_to_java(v)
|
115
|
+
case v
|
111
116
|
when Fixnum
|
112
|
-
java.lang.
|
117
|
+
java.lang.Long.new(v)
|
113
118
|
when Float
|
114
119
|
java.lang.Double.new(v)
|
120
|
+
when NilClass
|
121
|
+
nil
|
115
122
|
else
|
116
123
|
java.lang.String.new(v.to_s)
|
117
|
-
|
118
|
-
end).to_java(java.lang.Comparable)
|
124
|
+
end
|
119
125
|
end
|
120
126
|
|
121
127
|
def expression_filter(*args)
|
data/lib/cascading/scope.rb
CHANGED
@@ -10,12 +10,18 @@ module Cascading
|
|
10
10
|
Scope.new(Java::CascadingFlowPlanner::Scope.new(@scope))
|
11
11
|
end
|
12
12
|
|
13
|
+
def self.flow_scope(name)
|
14
|
+
Java::CascadingFlowPlanner::Scope.new(name)
|
15
|
+
end
|
16
|
+
|
13
17
|
def self.empty_scope(name)
|
14
18
|
Scope.new(Java::CascadingFlowPlanner::Scope.new(name))
|
15
19
|
end
|
16
20
|
|
17
|
-
def self.
|
18
|
-
|
21
|
+
def self.source_scope(name, tap, flow_scope)
|
22
|
+
incoming_scopes = java.util.HashSet.new
|
23
|
+
incoming_scopes.add(flow_scope)
|
24
|
+
java_scope = outgoing_scope_for(tap, incoming_scopes)
|
19
25
|
# Taps and Pipes don't name their outgoing scopes like other FlowElements
|
20
26
|
java_scope.name = name
|
21
27
|
Scope.new(java_scope)
|
@@ -4,12 +4,12 @@ module Cascading
|
|
4
4
|
# Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly.
|
5
5
|
#
|
6
6
|
# Assumptions:
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
7
|
+
# * You will either use the tail_pipe of the calling Assembly, or overwrite
|
8
|
+
# its incoming_scopes (as do join and union)
|
9
|
+
# * Your subassembly will have only 1 tail pipe; branching is not
|
10
|
+
# supported. This allows you to continue operating upon the tail of the
|
11
|
+
# SubAssembly within the calling Assembly
|
12
|
+
# * You will not use nested c.p.SubAssemblies
|
13
13
|
#
|
14
14
|
# This is a low-level tool, so be careful.
|
15
15
|
class SubAssembly
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Cascading
|
2
|
+
# A Cascading::BaseTap wraps up a pair of Cascading taps, one for Cascading
|
3
|
+
# local mode and the other for Hadoop mode.
|
4
|
+
class BaseTap
|
5
|
+
attr_reader :local_tap, :hadoop_tap
|
6
|
+
|
7
|
+
def initialize(local_tap, hadoop_tap)
|
8
|
+
@local_tap = local_tap
|
9
|
+
@hadoop_tap = hadoop_tap
|
10
|
+
end
|
11
|
+
|
12
|
+
def local?
|
13
|
+
!local_tap.nil?
|
14
|
+
end
|
15
|
+
|
16
|
+
def hadoop?
|
17
|
+
!hadoop_tap.nil?
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# A Cascading::Tap represents a non-aggregate tap with a scheme, path, and
|
22
|
+
# optional sink_mode. c.t.l.FileTap is used in Cascading local mode and
|
23
|
+
# c.t.h.Hfs is used in Hadoop mode. Whether or not these can be created is
|
24
|
+
# governed by the :scheme parameter, which must contain at least one of
|
25
|
+
# :local_scheme or :hadoop_scheme. Schemes like TextLine are supported in
|
26
|
+
# both modes (by Cascading), but SequenceFile is only supported in Hadoop
|
27
|
+
# mode.
|
28
|
+
class Tap < BaseTap
|
29
|
+
attr_reader :scheme, :path, :sink_mode
|
30
|
+
|
31
|
+
def initialize(path, params = {})
|
32
|
+
@path = path
|
33
|
+
|
34
|
+
@scheme = params[:scheme] || text_line_scheme
|
35
|
+
raise "Scheme must provide one of :local_scheme or :hadoop_scheme; received: '#{scheme.inspect}'" unless scheme[:local_scheme] || scheme[:hadoop_scheme]
|
36
|
+
|
37
|
+
@sink_mode = case params[:sink_mode] || :keep
|
38
|
+
when :keep, 'keep' then Java::CascadingTap::SinkMode::KEEP
|
39
|
+
when :replace, 'replace' then Java::CascadingTap::SinkMode::REPLACE
|
40
|
+
when :append, 'append' then Java::CascadingTap::SinkMode::APPEND
|
41
|
+
else raise "Unrecognized sink mode '#{params[:sink_mode]}'"
|
42
|
+
end
|
43
|
+
|
44
|
+
local_scheme = scheme[:local_scheme]
|
45
|
+
@local_tap = local_scheme ? Java::CascadingTapLocal::FileTap.new(local_scheme, path, sink_mode) : nil
|
46
|
+
|
47
|
+
hadoop_scheme = scheme[:hadoop_scheme]
|
48
|
+
@hadoop_tap = hadoop_scheme ? Java::CascadingTapHadoop::Hfs.new(hadoop_scheme, path, sink_mode) : nil
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# A Cascading::MultiTap represents one of Cascading's aggregate taps and is
|
53
|
+
# built via static constructors that accept an array of Cascading::Taps. In
|
54
|
+
# order for a mode (Cascading local or Hadoop) to be supported, all provided
|
55
|
+
# taps must support it.
|
56
|
+
class MultiTap < BaseTap
|
57
|
+
def initialize(local_tap, hadoop_tap)
|
58
|
+
super(local_tap, hadoop_tap)
|
59
|
+
end
|
60
|
+
|
61
|
+
def self.multi_source_tap(taps)
|
62
|
+
multi_tap(taps, Java::CascadingTap::MultiSourceTap)
|
63
|
+
end
|
64
|
+
|
65
|
+
def self.multi_sink_tap(taps)
|
66
|
+
multi_tap(taps, Java::CascadingTap::MultiSinkTap)
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def self.multi_tap(taps, klass)
|
72
|
+
local_supported = taps.all?{ |tap| tap.local? }
|
73
|
+
local_tap = local_supported ? klass.new(taps.map{ |tap| tap.local_tap }.to_java('cascading.tap.Tap')) : nil
|
74
|
+
|
75
|
+
hadoop_supported = taps.all?{ |tap| tap.hadoop? }
|
76
|
+
hadoop_tap = hadoop_supported ? klass.new(taps.map{ |tap| tap.hadoop_tap }.to_java('cascading.tap.Tap')) : nil
|
77
|
+
|
78
|
+
MultiTap.new(local_tap, hadoop_tap)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/lib/cascading.rb
CHANGED
@@ -6,7 +6,7 @@ require 'java'
|
|
6
6
|
|
7
7
|
module Cascading
|
8
8
|
# :stopdoc:
|
9
|
-
VERSION = '0.0.
|
9
|
+
VERSION = '0.0.9'
|
10
10
|
LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
|
11
11
|
PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
|
12
12
|
CASCADING_HOME = ENV['CASCADING_HOME']
|
@@ -55,8 +55,10 @@ require 'cascading/cascading'
|
|
55
55
|
require 'cascading/cascading_exception'
|
56
56
|
require 'cascading/expr_stub'
|
57
57
|
require 'cascading/flow'
|
58
|
+
require 'cascading/mode'
|
58
59
|
require 'cascading/operations'
|
59
60
|
require 'cascading/scope'
|
61
|
+
require 'cascading/tap'
|
60
62
|
|
61
63
|
# include module to make them available at top package
|
62
64
|
include Cascading
|