cascading.jruby 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/HACKING.md +1 -1
- data/History.txt +6 -0
- data/README.md +1 -1
- data/TODO +14 -0
- data/lib/cascading/assembly.rb +6 -37
- data/lib/cascading/base.rb +14 -1
- data/lib/cascading/cascading.rb +1 -0
- data/lib/cascading/flow.rb +10 -2
- data/lib/cascading/operations.rb +8 -2
- data/lib/cascading/scope.rb +4 -88
- data/lib/cascading.rb +1 -1
- data/spec/jruby_version_spec.rb +1 -1
- data/spec/spec_util.rb +0 -61
- data/tags +250 -0
- data/tasks/ant.rake +5 -2
- data/test/test_assembly.rb +42 -7
- data/test/test_cascade.rb +47 -0
- data/test/test_flow.rb +90 -6
- data/test/test_operations.rb +23 -0
- metadata +123 -117
- data/spec/primary_key_spec.rb +0 -119
data/HACKING.md
CHANGED
@@ -4,7 +4,7 @@ Some hacking info on `cascading.jruby`:
|
|
4
4
|
|
5
5
|
`cascading.jruby` can be packaged as a gem. To do so, you must generate the necessary packaging files:
|
6
6
|
|
7
|
-
|
7
|
+
jruby -S rake gem
|
8
8
|
|
9
9
|
will produce the gem in the pkg/ sub-directory. After that, just cd to this directory and:
|
10
10
|
|
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
0.0.6 - Removing primary key
|
2
|
+
|
3
|
+
The primary key feature was a source of great confusion at Etsy, so it's been
|
4
|
+
removed. This release also closes several issues, warns about potential node
|
5
|
+
name conflicts, and updates the tests to work under JRuby 1.6.5.
|
6
|
+
|
1
7
|
0.0.5 - Addressing Janino pain
|
2
8
|
|
3
9
|
This release expands upon the ExprStub class adding composition time compilation
|
data/README.md
CHANGED
@@ -4,6 +4,6 @@
|
|
4
4
|
|
5
5
|
It requires Hadoop (>= 0.18.3) and Cascading (>= 1.0.1); their install locations must be set via the environment variables `HADOOP_HOME` and `CASCADING_HOME`.
|
6
6
|
|
7
|
-
It has been tested on JRuby versions 1.2.0, 1.4.0, and 1.5.
|
7
|
+
It has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, and 1.6.5.
|
8
8
|
|
9
9
|
Copyright 2009, Grégoire Marabout.
|
data/TODO
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Documentation
|
2
|
+
|
3
|
+
Assembly -> Each/Every refactor
|
4
|
+
Look into totally eliminating registries
|
5
|
+
|
6
|
+
Bug fixes on github
|
7
|
+
Enforce more runtime rules at composition time
|
8
|
+
Standardize helper contracts
|
9
|
+
Upgrade Cascading (already upgraded JRuby)
|
10
|
+
Possibly combine the RSpec specs into the unit tests, because RSpec sucks and swallows stack traces
|
11
|
+
|
12
|
+
Split out runner
|
13
|
+
Make runner implement Tool
|
14
|
+
Create build tool for job jar
|
data/lib/cascading/assembly.rb
CHANGED
@@ -51,16 +51,6 @@ module Cascading
|
|
51
51
|
puts "Current scope for '#{name}':\n #{scope}\n----------\n"
|
52
52
|
end
|
53
53
|
|
54
|
-
def primary(*args)
|
55
|
-
options = args.extract_options!
|
56
|
-
if args.size > 0 && args[0] != nil
|
57
|
-
scope.primary_key_fields = fields(args)
|
58
|
-
else
|
59
|
-
scope.primary_key_fields = nil
|
60
|
-
end
|
61
|
-
scope.grouping_primary_key_fields = scope.primary_key_fields
|
62
|
-
end
|
63
|
-
|
64
54
|
def make_each(type, *parameters)
|
65
55
|
make_pipe(type, parameters)
|
66
56
|
@every_applied = false
|
@@ -82,26 +72,6 @@ module Cascading
|
|
82
72
|
# that only allows aggregation and buffer operations.
|
83
73
|
instance_eval &block
|
84
74
|
|
85
|
-
# First all non-primary key fields from each pipe if its primary key is a
|
86
|
-
# subset of the grouping primary key
|
87
|
-
first_fields = incoming_scopes.map do |scope|
|
88
|
-
if scope.primary_key_fields
|
89
|
-
primary_key = scope.primary_key_fields.to_a
|
90
|
-
grouping_primary_key = scope.grouping_primary_key_fields.to_a
|
91
|
-
if (primary_key & grouping_primary_key) == primary_key
|
92
|
-
difference_fields(scope.values_fields, scope.primary_key_fields).to_a
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end.compact.flatten
|
96
|
-
# assert first_fields == first_fields.uniq
|
97
|
-
|
98
|
-
# Do not first any fields explicitly aggregated over
|
99
|
-
first_fields = first_fields - scope.grouping_fields.to_a
|
100
|
-
if first_fields.size > 0
|
101
|
-
first *first_fields
|
102
|
-
puts "Firsting: #{first_fields.inspect} in assembly: #{name}"
|
103
|
-
end
|
104
|
-
|
105
75
|
bind_names scope.grouping_fields.to_a if every_applied?
|
106
76
|
end
|
107
77
|
|
@@ -321,11 +291,7 @@ module Cascading
|
|
321
291
|
invalid = name_map.keys.sort - old_names
|
322
292
|
raise "invalid names: #{invalid.inspect}" unless invalid.empty?
|
323
293
|
|
324
|
-
old_key = scope.primary_key_fields.to_a
|
325
|
-
new_key = old_key.map{ |name| name_map[name] || name }
|
326
|
-
|
327
294
|
each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
|
328
|
-
primary(*new_key)
|
329
295
|
end
|
330
296
|
|
331
297
|
def cast(type_map)
|
@@ -409,7 +375,8 @@ module Cascading
|
|
409
375
|
# insert) and an options hash.
|
410
376
|
#
|
411
377
|
# Options include:
|
412
|
-
# * <tt>:
|
378
|
+
# * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
|
379
|
+
# (for first and last) of values for the aggregator to ignore
|
413
380
|
#
|
414
381
|
# <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
|
415
382
|
def composite_aggregator(args, function)
|
@@ -685,9 +652,11 @@ module Cascading
|
|
685
652
|
raise "Regex not allowed" if options && options[:pattern]
|
686
653
|
|
687
654
|
if options[:expression]
|
688
|
-
|
655
|
+
_, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
656
|
+
options[:expression] = "#{imports}!(#{expr})"
|
689
657
|
elsif args[0]
|
690
|
-
|
658
|
+
_, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
659
|
+
args[0] = "#{imports}!(#{expr})"
|
691
660
|
end
|
692
661
|
|
693
662
|
filter(*args)
|
data/lib/cascading/base.rb
CHANGED
@@ -15,12 +15,19 @@ module Cascading
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def add_child(node)
|
18
|
+
child = root.find_child(node.name)
|
19
|
+
warn "WARNING: adding '#{node.qualified_name}', but node named '#{node.name}' already exists at '#{child.qualified_name}'" if child
|
20
|
+
|
18
21
|
@children[node.name] = node
|
19
22
|
@child_names << node.name
|
20
23
|
@last_child = node
|
21
24
|
node
|
22
25
|
end
|
23
26
|
|
27
|
+
def qualified_name
|
28
|
+
parent ? "#{parent.qualified_name}.#{name}" : name
|
29
|
+
end
|
30
|
+
|
24
31
|
def describe(offset = '')
|
25
32
|
"#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
|
26
33
|
end
|
@@ -32,7 +39,12 @@ module Cascading
|
|
32
39
|
result = child.find_child(name)
|
33
40
|
return result if result
|
34
41
|
end
|
35
|
-
|
42
|
+
nil
|
43
|
+
end
|
44
|
+
|
45
|
+
def root
|
46
|
+
return self unless parent
|
47
|
+
parent.root
|
36
48
|
end
|
37
49
|
end
|
38
50
|
|
@@ -57,6 +69,7 @@ module Cascading
|
|
57
69
|
|
58
70
|
def add(name, instance)
|
59
71
|
@registered ||= {}
|
72
|
+
warn "WARNING: node named '#{name}' already registered in #{self}" if @registered[name]
|
60
73
|
@registered[name] = instance
|
61
74
|
end
|
62
75
|
|
data/lib/cascading/cascading.rb
CHANGED
data/lib/cascading/flow.rb
CHANGED
@@ -75,7 +75,6 @@ module Cascading
|
|
75
75
|
raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name]
|
76
76
|
sink_metadata[sink_name] = {
|
77
77
|
:field_names => @outgoing_scopes[sink_name].values_fields.to_a,
|
78
|
-
:primary_key => @outgoing_scopes[sink_name].primary_key_fields.to_a
|
79
78
|
}
|
80
79
|
sink_metadata
|
81
80
|
end
|
@@ -135,7 +134,16 @@ module Cascading
|
|
135
134
|
end
|
136
135
|
|
137
136
|
def connect(properties = nil)
|
138
|
-
|
137
|
+
# This ensures we have a hash, and that it is a Ruby Hash (because we
|
138
|
+
# also accept java.util.HashMap), then merges it with Flow properties
|
139
|
+
properties ||= {}
|
140
|
+
properties = java.util.HashMap.new(@properties.merge(Hash[*properties.to_a.flatten]))
|
141
|
+
|
142
|
+
puts "Connecting flow '#{name}' with properties:"
|
143
|
+
properties.key_set.to_a.sort.each do |key|
|
144
|
+
puts "#{key}=#{properties[key]}"
|
145
|
+
end
|
146
|
+
|
139
147
|
Java::CascadingFlow::FlowConnector.new(properties).connect(
|
140
148
|
name,
|
141
149
|
make_tap_parameter(@sources),
|
data/lib/cascading/operations.rb
CHANGED
@@ -19,8 +19,14 @@ module Cascading
|
|
19
19
|
|
20
20
|
def aggregator_function(args, aggregator_klass)
|
21
21
|
options = args.extract_options!
|
22
|
-
|
23
|
-
|
22
|
+
ignore = options[:ignore]
|
23
|
+
raise "Option 'ignore' is only supported by min, max, first, and last" if ignore && ![
|
24
|
+
Java::CascadingOperationAggregator::First,
|
25
|
+
Java::CascadingOperationAggregator::Min,
|
26
|
+
Java::CascadingOperationAggregator::Max,
|
27
|
+
Java::CascadingOperationAggregator::Last,
|
28
|
+
].include?(aggregator_klass)
|
29
|
+
parameters = [Cascading.fields(args), ignore].compact
|
24
30
|
aggregator_klass.new(*parameters)
|
25
31
|
end
|
26
32
|
|
data/lib/cascading/scope.rb
CHANGED
@@ -1,25 +1,14 @@
|
|
1
1
|
module Cascading
|
2
2
|
class Scope
|
3
|
-
attr_accessor :scope, :grouping_key_fields
|
4
|
-
@@scheme_keys = {}
|
3
|
+
attr_accessor :scope, :grouping_key_fields
|
5
4
|
|
6
5
|
def initialize(scope, params = {})
|
7
6
|
@scope = scope
|
8
7
|
@grouping_key_fields = fields(params[:grouping_key_fields] || [])
|
9
|
-
@primary_key_fields = fields(params[:primary_key_fields])
|
10
|
-
@grouping_primary_key_fields = fields(params[:grouping_primary_key_fields])
|
11
8
|
end
|
12
9
|
|
13
10
|
def copy
|
14
|
-
Scope.new(Java::CascadingFlow::Scope.new(@scope),
|
15
|
-
:grouping_key_fields => @grouping_key_fields,
|
16
|
-
:primary_key_fields => @primary_key_fields,
|
17
|
-
:grouping_primary_key_fields => @grouping_primary_key_fields
|
18
|
-
)
|
19
|
-
end
|
20
|
-
|
21
|
-
def self.register_scheme_key(scheme, primary_key)
|
22
|
-
@@scheme_keys[scheme] = primary_key
|
11
|
+
Scope.new(Java::CascadingFlow::Scope.new(@scope), :grouping_key_fields => @grouping_key_fields)
|
23
12
|
end
|
24
13
|
|
25
14
|
def self.empty_scope(name)
|
@@ -30,26 +19,14 @@ module Cascading
|
|
30
19
|
java_scope = outgoing_scope_for(tap, java.util.HashSet.new)
|
31
20
|
# Taps and Pipes don't name their outgoing scopes like other FlowElements
|
32
21
|
java_scope.name = name
|
33
|
-
|
34
|
-
:primary_key_fields => @@scheme_keys[tap.scheme.class],
|
35
|
-
:grouping_primary_key_fields => @@scheme_keys[tap.scheme.class]
|
36
|
-
)
|
37
|
-
vf, gf = scope.values_fields.to_a, scope.grouping_fields.to_a
|
38
|
-
pk, gpk = scope.primary_key_fields.to_a, scope.grouping_primary_key_fields.to_a
|
39
|
-
raise "Primary key must be a subset of available fields (primary key: #{pk.inspect}, values fields: #{vf.inspect})" unless vf & pk == pk
|
40
|
-
raise "Grouping primary key must be a subset of available fields (grouping primary key: #{gpk.inspect}, grouping fields: #{gf.inspect})" unless gf & gpk == gpk
|
41
|
-
scope
|
22
|
+
Scope.new(java_scope)
|
42
23
|
end
|
43
24
|
|
44
25
|
def self.outgoing_scope(flow_element, incoming_scopes, grouping_key_fields, every_applied)
|
45
26
|
java_scopes = incoming_scopes.compact.map{ |s| s.scope }
|
46
|
-
|
27
|
+
Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
|
47
28
|
:grouping_key_fields => grouping_key_fields
|
48
29
|
)
|
49
|
-
scope.grouping_primary_key_fields = fields(grouping_primary_key_fields(flow_element, incoming_scopes, scope))
|
50
|
-
scope.primary_key_fields = scope.grouping_primary_key_fields if every_applied
|
51
|
-
scope.primary_key_fields = fields(primary_key_fields(flow_element, incoming_scopes, scope)) unless every_applied
|
52
|
-
scope
|
53
30
|
end
|
54
31
|
|
55
32
|
def values_fields
|
@@ -80,11 +57,9 @@ Scope name: #{@scope.name}
|
|
80
57
|
selector: #{@scope.out_grouping_selector}
|
81
58
|
fields: #{grouping_fields}
|
82
59
|
key fields: #{@grouping_key_fields}
|
83
|
-
primary key fields: #{@grouping_primary_key_fields}
|
84
60
|
Out values
|
85
61
|
selector: #{@scope.out_values_selector}
|
86
62
|
fields: #{values_fields}
|
87
|
-
primary key fields: #{@primary_key_fields}
|
88
63
|
END
|
89
64
|
end
|
90
65
|
|
@@ -97,64 +72,5 @@ END
|
|
97
72
|
raise CascadingException.new(e, 'Exception computing outgoing scope')
|
98
73
|
end
|
99
74
|
end
|
100
|
-
|
101
|
-
def self.primary_key_fields(flow_element, incoming_scopes, scope)
|
102
|
-
case flow_element
|
103
|
-
when Java::CascadingPipe::Each
|
104
|
-
# assert incoming_scopes.size == 1
|
105
|
-
project_primary_key(incoming_scopes.first.primary_key_fields,
|
106
|
-
incoming_scopes.first.values_fields.to_a,
|
107
|
-
scope.values_fields.to_a)
|
108
|
-
when Java::CascadingPipe::Every
|
109
|
-
# assert incoming_scopes.size == 1
|
110
|
-
incoming_scopes.first.primary_key_fields
|
111
|
-
when Java::CascadingPipe::GroupBy
|
112
|
-
if incoming_scopes.size == 1
|
113
|
-
incoming_scopes.first.primary_key_fields
|
114
|
-
else
|
115
|
-
# We must clear the primary key when unioning multiple inputs. If
|
116
|
-
# the programmer wants to preserve the primary key, they must use
|
117
|
-
# the primary override.
|
118
|
-
nil
|
119
|
-
end
|
120
|
-
when Java::CascadingPipe::CoGroup
|
121
|
-
# FIXME: assume grouping_key_fields are the same for all
|
122
|
-
# incoming_scopes. Need join to give me names from all incoming
|
123
|
-
# scopes to perform rename on primary key fields.
|
124
|
-
union_fields(*incoming_scopes.map{ |s| s.primary_key_fields })
|
125
|
-
else raise "No primary key rules for FlowElement of type #{flow_element}"
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
def self.project_primary_key(primary_key, old_fields, new_fields)
|
130
|
-
return nil if primary_key.nil?
|
131
|
-
primary_key = primary_key.to_a
|
132
|
-
primary_key if (primary_key & new_fields) == primary_key
|
133
|
-
end
|
134
|
-
|
135
|
-
def self.grouping_primary_key_fields(flow_element, incoming_scopes, scope)
|
136
|
-
case flow_element
|
137
|
-
when Java::CascadingPipe::Each
|
138
|
-
# assert incoming_scopes.size == 1
|
139
|
-
project_primary_key(incoming_scopes.first.grouping_primary_key_fields,
|
140
|
-
incoming_scopes.first.grouping_fields.to_a,
|
141
|
-
scope.grouping_fields.to_a)
|
142
|
-
when Java::CascadingPipe::Every
|
143
|
-
# assert incoming_scopes.size == 1
|
144
|
-
incoming_scopes.first.grouping_primary_key_fields
|
145
|
-
when Java::CascadingPipe::GroupBy
|
146
|
-
scope.grouping_key_fields
|
147
|
-
when Java::CascadingPipe::CoGroup
|
148
|
-
scope.grouping_key_fields
|
149
|
-
else raise "No primary key rules for FlowElement of type #{flow_element}"
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
# Register default primary keys
|
155
|
-
begin
|
156
|
-
Scope.register_scheme_key(Java::CascadingScheme::TextLine, ['offset'])
|
157
|
-
rescue NameError => ne
|
158
|
-
puts 'WARNING: Could not register primary key for TextLine Scheme as it was not on the class path'
|
159
75
|
end
|
160
76
|
end
|
data/lib/cascading.rb
CHANGED
data/spec/jruby_version_spec.rb
CHANGED
data/spec/spec_util.rb
CHANGED
@@ -7,22 +7,12 @@ module ScopeTests
|
|
7
7
|
scope = scope(*name_params)
|
8
8
|
values_fields = params[:values_fields]
|
9
9
|
grouping_fields = params[:grouping_fields] || values_fields
|
10
|
-
primary_key_fields = params[:primary_key_fields]
|
11
|
-
grouping_primary_key_fields = primary_key_fields
|
12
|
-
grouping_primary_key_fields = params[:grouping_primary_key_fields] if params.has_key?(:grouping_primary_key_fields)
|
13
10
|
|
14
11
|
debug = params[:debug]
|
15
12
|
debug_scope(*name_params) if debug
|
16
13
|
|
17
14
|
scope.values_fields.to_a.should == values_fields
|
18
15
|
scope.grouping_fields.to_a.should == grouping_fields
|
19
|
-
if params.has_key?(:primary_key_fields) # Must support nil values
|
20
|
-
scope.primary_key_fields.should == nil if primary_key_fields.nil?
|
21
|
-
scope.primary_key_fields.to_a.should == primary_key_fields unless primary_key_fields.nil?
|
22
|
-
|
23
|
-
scope.grouping_primary_key_fields.should == nil if grouping_primary_key_fields.nil?
|
24
|
-
scope.grouping_primary_key_fields.to_a.should == grouping_primary_key_fields unless grouping_primary_key_fields.nil?
|
25
|
-
end
|
26
16
|
end
|
27
17
|
end
|
28
18
|
|
@@ -135,54 +125,3 @@ def cascading_properties
|
|
135
125
|
Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
|
136
126
|
properties
|
137
127
|
end
|
138
|
-
|
139
|
-
def verify_assembly_output(assembly_name, params, &block)
|
140
|
-
`rm -rf spec_output`
|
141
|
-
|
142
|
-
Cascade.new("foo") do
|
143
|
-
flow("bar") do
|
144
|
-
source assembly_name, tap(params[:source], params.slice(:scheme))
|
145
|
-
assembly = assembly(assembly_name)
|
146
|
-
sink assembly_name, tap("spec_output", :kind => :lfs, :sink_mode => :replace)
|
147
|
-
end
|
148
|
-
end.complete(@properties)
|
149
|
-
|
150
|
-
output_data = nil
|
151
|
-
|
152
|
-
File.open("spec_output/part-00000") do |f|
|
153
|
-
output_data = f.readlines
|
154
|
-
end
|
155
|
-
|
156
|
-
if params[:length]
|
157
|
-
output_data.size.should == params[:length]
|
158
|
-
end
|
159
|
-
|
160
|
-
keys = assembly.scope.values_fields
|
161
|
-
if block_given?
|
162
|
-
output_data.each do |line|
|
163
|
-
values = line.chomp.split(/\t/)
|
164
|
-
|
165
|
-
yield(keys.zip(values).inject({}) do |map, kv|
|
166
|
-
map[kv[0].to_sym] = kv[1]
|
167
|
-
map
|
168
|
-
end)
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
def describe_job(job_file, &block)
|
174
|
-
context Object do
|
175
|
-
before(:each) do
|
176
|
-
@properties = cascading_properties
|
177
|
-
# Must artificially fill ARGV to prevent errors when creating multi-taps
|
178
|
-
# in ETL cascade
|
179
|
-
ARGV.clear
|
180
|
-
10.times do
|
181
|
-
ARGV << 'text_line_scheme' # Dummy value, required for 3rd arg
|
182
|
-
end
|
183
|
-
load "lib/jobs/#{job_file}/#{job_file}.rb"
|
184
|
-
end
|
185
|
-
|
186
|
-
self.class_eval(&block)
|
187
|
-
end
|
188
|
-
end
|