cascading.jruby 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HACKING.md +1 -1
- data/History.txt +6 -0
- data/README.md +1 -1
- data/TODO +14 -0
- data/lib/cascading/assembly.rb +6 -37
- data/lib/cascading/base.rb +14 -1
- data/lib/cascading/cascading.rb +1 -0
- data/lib/cascading/flow.rb +10 -2
- data/lib/cascading/operations.rb +8 -2
- data/lib/cascading/scope.rb +4 -88
- data/lib/cascading.rb +1 -1
- data/spec/jruby_version_spec.rb +1 -1
- data/spec/spec_util.rb +0 -61
- data/tags +250 -0
- data/tasks/ant.rake +5 -2
- data/test/test_assembly.rb +42 -7
- data/test/test_cascade.rb +47 -0
- data/test/test_flow.rb +90 -6
- data/test/test_operations.rb +23 -0
- metadata +123 -117
- data/spec/primary_key_spec.rb +0 -119
data/HACKING.md
CHANGED
@@ -4,7 +4,7 @@ Some hacking info on `cascading.jruby`:
|
|
4
4
|
|
5
5
|
`cascading.jruby` can be packaged as a gem. To do so, you must generate the necessary packaging files:
|
6
6
|
|
7
|
-
|
7
|
+
jruby -S rake gem
|
8
8
|
|
9
9
|
will produce the gem in the pkg/ sub-directory. After that, just cd to this directory and:
|
10
10
|
|
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
0.0.6 - Removing primary key
|
2
|
+
|
3
|
+
The primary key feature was a source of great confusion at Etsy, so it's been
|
4
|
+
removed. This release also closes several issues, warns about potential node
|
5
|
+
name conflicts, and updates the tests to work under JRuby 1.6.5.
|
6
|
+
|
1
7
|
0.0.5 - Addressing Janino pain
|
2
8
|
|
3
9
|
This release expands upon the ExprStub class adding composition time compilation
|
data/README.md
CHANGED
@@ -4,6 +4,6 @@
|
|
4
4
|
|
5
5
|
It requires Hadoop (>= 0.18.3) and Cascading (>=1.0.1) to be set via the environment variables: `HADOOP_HOME` and `CASCADING_HOME`
|
6
6
|
|
7
|
-
It has been tested on JRuby versions 1.2.0, 1.4.0, and 1.5.
|
7
|
+
It has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, and 1.6.5.
|
8
8
|
|
9
9
|
Copyright 2009, Grégoire Marabout.
|
data/TODO
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Documentation
|
2
|
+
|
3
|
+
Assembly -> Each/Every refactor
|
4
|
+
Look into totally eliminating registries
|
5
|
+
|
6
|
+
Bug fixes on github
|
7
|
+
Enforce more runtime rules at composition time
|
8
|
+
Standardize helper contracts
|
9
|
+
Upgrade Cascading (already upgraded JRuby)
|
10
|
+
Possibly combine unit tests...into unit tests because RSpec sucks and swallows stack traces
|
11
|
+
|
12
|
+
Split out runner
|
13
|
+
Make runner implement Tool
|
14
|
+
Create build tool for job jar
|
data/lib/cascading/assembly.rb
CHANGED
@@ -51,16 +51,6 @@ module Cascading
|
|
51
51
|
puts "Current scope for '#{name}':\n #{scope}\n----------\n"
|
52
52
|
end
|
53
53
|
|
54
|
-
def primary(*args)
|
55
|
-
options = args.extract_options!
|
56
|
-
if args.size > 0 && args[0] != nil
|
57
|
-
scope.primary_key_fields = fields(args)
|
58
|
-
else
|
59
|
-
scope.primary_key_fields = nil
|
60
|
-
end
|
61
|
-
scope.grouping_primary_key_fields = scope.primary_key_fields
|
62
|
-
end
|
63
|
-
|
64
54
|
def make_each(type, *parameters)
|
65
55
|
make_pipe(type, parameters)
|
66
56
|
@every_applied = false
|
@@ -82,26 +72,6 @@ module Cascading
|
|
82
72
|
# that only allows aggregation and buffer operations.
|
83
73
|
instance_eval &block
|
84
74
|
|
85
|
-
# First all non-primary key fields from each pipe if its primary key is a
|
86
|
-
# subset of the grouping primary key
|
87
|
-
first_fields = incoming_scopes.map do |scope|
|
88
|
-
if scope.primary_key_fields
|
89
|
-
primary_key = scope.primary_key_fields.to_a
|
90
|
-
grouping_primary_key = scope.grouping_primary_key_fields.to_a
|
91
|
-
if (primary_key & grouping_primary_key) == primary_key
|
92
|
-
difference_fields(scope.values_fields, scope.primary_key_fields).to_a
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end.compact.flatten
|
96
|
-
# assert first_fields == first_fields.uniq
|
97
|
-
|
98
|
-
# Do no first any fields explicitly aggregated over
|
99
|
-
first_fields = first_fields - scope.grouping_fields.to_a
|
100
|
-
if first_fields.size > 0
|
101
|
-
first *first_fields
|
102
|
-
puts "Firsting: #{first_fields.inspect} in assembly: #{name}"
|
103
|
-
end
|
104
|
-
|
105
75
|
bind_names scope.grouping_fields.to_a if every_applied?
|
106
76
|
end
|
107
77
|
|
@@ -321,11 +291,7 @@ module Cascading
|
|
321
291
|
invalid = name_map.keys.sort - old_names
|
322
292
|
raise "invalid names: #{invalid.inspect}" unless invalid.empty?
|
323
293
|
|
324
|
-
old_key = scope.primary_key_fields.to_a
|
325
|
-
new_key = old_key.map{ |name| name_map[name] || name }
|
326
|
-
|
327
294
|
each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
|
328
|
-
primary(*new_key)
|
329
295
|
end
|
330
296
|
|
331
297
|
def cast(type_map)
|
@@ -409,7 +375,8 @@ module Cascading
|
|
409
375
|
# insert) and an options hash.
|
410
376
|
#
|
411
377
|
# Options include:
|
412
|
-
# * <tt>:
|
378
|
+
# * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
|
379
|
+
# (for first and last) of values for the aggregator to ignore
|
413
380
|
#
|
414
381
|
# <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
|
415
382
|
def composite_aggregator(args, function)
|
@@ -685,9 +652,11 @@ module Cascading
|
|
685
652
|
raise "Regex not allowed" if options && options[:pattern]
|
686
653
|
|
687
654
|
if options[:expression]
|
688
|
-
|
655
|
+
_, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
656
|
+
options[:expression] = "#{imports}!(#{expr})"
|
689
657
|
elsif args[0]
|
690
|
-
|
658
|
+
_, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
|
659
|
+
args[0] = "#{imports}!(#{expr})"
|
691
660
|
end
|
692
661
|
|
693
662
|
filter(*args)
|
data/lib/cascading/base.rb
CHANGED
@@ -15,12 +15,19 @@ module Cascading
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def add_child(node)
|
18
|
+
child = root.find_child(node.name)
|
19
|
+
warn "WARNING: adding '#{node.qualified_name}', but node named '#{node.name}' already exists at '#{child.qualified_name}'" if child
|
20
|
+
|
18
21
|
@children[node.name] = node
|
19
22
|
@child_names << node.name
|
20
23
|
@last_child = node
|
21
24
|
node
|
22
25
|
end
|
23
26
|
|
27
|
+
def qualified_name
|
28
|
+
parent ? "#{parent.qualified_name}.#{name}" : name
|
29
|
+
end
|
30
|
+
|
24
31
|
def describe(offset = '')
|
25
32
|
"#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
|
26
33
|
end
|
@@ -32,7 +39,12 @@ module Cascading
|
|
32
39
|
result = child.find_child(name)
|
33
40
|
return result if result
|
34
41
|
end
|
35
|
-
|
42
|
+
nil
|
43
|
+
end
|
44
|
+
|
45
|
+
def root
|
46
|
+
return self unless parent
|
47
|
+
parent.root
|
36
48
|
end
|
37
49
|
end
|
38
50
|
|
@@ -57,6 +69,7 @@ module Cascading
|
|
57
69
|
|
58
70
|
def add(name, instance)
|
59
71
|
@registered ||= {}
|
72
|
+
warn "WARNING: node named '#{name}' already registered in #{self}" if @registered[name]
|
60
73
|
@registered[name] = instance
|
61
74
|
end
|
62
75
|
|
data/lib/cascading/cascading.rb
CHANGED
data/lib/cascading/flow.rb
CHANGED
@@ -75,7 +75,6 @@ module Cascading
|
|
75
75
|
raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name]
|
76
76
|
sink_metadata[sink_name] = {
|
77
77
|
:field_names => @outgoing_scopes[sink_name].values_fields.to_a,
|
78
|
-
:primary_key => @outgoing_scopes[sink_name].primary_key_fields.to_a
|
79
78
|
}
|
80
79
|
sink_metadata
|
81
80
|
end
|
@@ -135,7 +134,16 @@ module Cascading
|
|
135
134
|
end
|
136
135
|
|
137
136
|
def connect(properties = nil)
|
138
|
-
|
137
|
+
# This ensures we have a hash, and that it is a Ruby Hash (because we
|
138
|
+
# also accept java.util.HashMap), then merges it with Flow properties
|
139
|
+
properties ||= {}
|
140
|
+
properties = java.util.HashMap.new(@properties.merge(Hash[*properties.to_a.flatten]))
|
141
|
+
|
142
|
+
puts "Connecting flow '#{name}' with properties:"
|
143
|
+
properties.key_set.to_a.sort.each do |key|
|
144
|
+
puts "#{key}=#{properties[key]}"
|
145
|
+
end
|
146
|
+
|
139
147
|
Java::CascadingFlow::FlowConnector.new(properties).connect(
|
140
148
|
name,
|
141
149
|
make_tap_parameter(@sources),
|
data/lib/cascading/operations.rb
CHANGED
@@ -19,8 +19,14 @@ module Cascading
|
|
19
19
|
|
20
20
|
def aggregator_function(args, aggregator_klass)
|
21
21
|
options = args.extract_options!
|
22
|
-
|
23
|
-
|
22
|
+
ignore = options[:ignore]
|
23
|
+
raise "Option 'ignore' is only supported by min, max, first, and last" if ignore && ![
|
24
|
+
Java::CascadingOperationAggregator::First,
|
25
|
+
Java::CascadingOperationAggregator::Min,
|
26
|
+
Java::CascadingOperationAggregator::Max,
|
27
|
+
Java::CascadingOperationAggregator::Last,
|
28
|
+
].include?(aggregator_klass)
|
29
|
+
parameters = [Cascading.fields(args), ignore].compact
|
24
30
|
aggregator_klass.new(*parameters)
|
25
31
|
end
|
26
32
|
|
data/lib/cascading/scope.rb
CHANGED
@@ -1,25 +1,14 @@
|
|
1
1
|
module Cascading
|
2
2
|
class Scope
|
3
|
-
attr_accessor :scope, :grouping_key_fields
|
4
|
-
@@scheme_keys = {}
|
3
|
+
attr_accessor :scope, :grouping_key_fields
|
5
4
|
|
6
5
|
def initialize(scope, params = {})
|
7
6
|
@scope = scope
|
8
7
|
@grouping_key_fields = fields(params[:grouping_key_fields] || [])
|
9
|
-
@primary_key_fields = fields(params[:primary_key_fields])
|
10
|
-
@grouping_primary_key_fields = fields(params[:grouping_primary_key_fields])
|
11
8
|
end
|
12
9
|
|
13
10
|
def copy
|
14
|
-
Scope.new(Java::CascadingFlow::Scope.new(@scope),
|
15
|
-
:grouping_key_fields => @grouping_key_fields,
|
16
|
-
:primary_key_fields => @primary_key_fields,
|
17
|
-
:grouping_primary_key_fields => @grouping_primary_key_fields
|
18
|
-
)
|
19
|
-
end
|
20
|
-
|
21
|
-
def self.register_scheme_key(scheme, primary_key)
|
22
|
-
@@scheme_keys[scheme] = primary_key
|
11
|
+
Scope.new(Java::CascadingFlow::Scope.new(@scope), :grouping_key_fields => @grouping_key_fields)
|
23
12
|
end
|
24
13
|
|
25
14
|
def self.empty_scope(name)
|
@@ -30,26 +19,14 @@ module Cascading
|
|
30
19
|
java_scope = outgoing_scope_for(tap, java.util.HashSet.new)
|
31
20
|
# Taps and Pipes don't name their outgoing scopes like other FlowElements
|
32
21
|
java_scope.name = name
|
33
|
-
|
34
|
-
:primary_key_fields => @@scheme_keys[tap.scheme.class],
|
35
|
-
:grouping_primary_key_fields => @@scheme_keys[tap.scheme.class]
|
36
|
-
)
|
37
|
-
vf, gf = scope.values_fields.to_a, scope.grouping_fields.to_a
|
38
|
-
pk, gpk = scope.primary_key_fields.to_a, scope.grouping_primary_key_fields.to_a
|
39
|
-
raise "Primary key must be a subset of available fields (primary key: #{pk.inspect}, values fields: #{vf.inspect})" unless vf & pk == pk
|
40
|
-
raise "Grouping primary key must be a subset of available fields (grouping primary key: #{gpk.inspect}, grouping fields: #{gf.inspect})" unless gf & gpk == gpk
|
41
|
-
scope
|
22
|
+
Scope.new(java_scope)
|
42
23
|
end
|
43
24
|
|
44
25
|
def self.outgoing_scope(flow_element, incoming_scopes, grouping_key_fields, every_applied)
|
45
26
|
java_scopes = incoming_scopes.compact.map{ |s| s.scope }
|
46
|
-
|
27
|
+
Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
|
47
28
|
:grouping_key_fields => grouping_key_fields
|
48
29
|
)
|
49
|
-
scope.grouping_primary_key_fields = fields(grouping_primary_key_fields(flow_element, incoming_scopes, scope))
|
50
|
-
scope.primary_key_fields = scope.grouping_primary_key_fields if every_applied
|
51
|
-
scope.primary_key_fields = fields(primary_key_fields(flow_element, incoming_scopes, scope)) unless every_applied
|
52
|
-
scope
|
53
30
|
end
|
54
31
|
|
55
32
|
def values_fields
|
@@ -80,11 +57,9 @@ Scope name: #{@scope.name}
|
|
80
57
|
selector: #{@scope.out_grouping_selector}
|
81
58
|
fields: #{grouping_fields}
|
82
59
|
key fields: #{@grouping_key_fields}
|
83
|
-
primary key fields: #{@grouping_primary_key_fields}
|
84
60
|
Out values
|
85
61
|
selector: #{@scope.out_values_selector}
|
86
62
|
fields: #{values_fields}
|
87
|
-
primary key fields: #{@primary_key_fields}
|
88
63
|
END
|
89
64
|
end
|
90
65
|
|
@@ -97,64 +72,5 @@ END
|
|
97
72
|
raise CascadingException.new(e, 'Exception computing outgoing scope')
|
98
73
|
end
|
99
74
|
end
|
100
|
-
|
101
|
-
def self.primary_key_fields(flow_element, incoming_scopes, scope)
|
102
|
-
case flow_element
|
103
|
-
when Java::CascadingPipe::Each
|
104
|
-
# assert incoming_scopes.size == 1
|
105
|
-
project_primary_key(incoming_scopes.first.primary_key_fields,
|
106
|
-
incoming_scopes.first.values_fields.to_a,
|
107
|
-
scope.values_fields.to_a)
|
108
|
-
when Java::CascadingPipe::Every
|
109
|
-
# assert incoming_scopes.size == 1
|
110
|
-
incoming_scopes.first.primary_key_fields
|
111
|
-
when Java::CascadingPipe::GroupBy
|
112
|
-
if incoming_scopes.size == 1
|
113
|
-
incoming_scopes.first.primary_key_fields
|
114
|
-
else
|
115
|
-
# We must clear the primary key when unioning multiple inputs. If
|
116
|
-
# the programmer wants to preserve the primary key, they must use
|
117
|
-
# the primary override.
|
118
|
-
nil
|
119
|
-
end
|
120
|
-
when Java::CascadingPipe::CoGroup
|
121
|
-
# FIXME: assume grouping_key_fields are the same for all
|
122
|
-
# incoming_scopes. Need join to give me names from all incoming
|
123
|
-
# scopes to perform rename on primary key fields.
|
124
|
-
union_fields(*incoming_scopes.map{ |s| s.primary_key_fields })
|
125
|
-
else raise "No primary key rules for FlowElement of type #{flow_element}"
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
def self.project_primary_key(primary_key, old_fields, new_fields)
|
130
|
-
return nil if primary_key.nil?
|
131
|
-
primary_key = primary_key.to_a
|
132
|
-
primary_key if (primary_key & new_fields) == primary_key
|
133
|
-
end
|
134
|
-
|
135
|
-
def self.grouping_primary_key_fields(flow_element, incoming_scopes, scope)
|
136
|
-
case flow_element
|
137
|
-
when Java::CascadingPipe::Each
|
138
|
-
# assert incoming_scopes.size == 1
|
139
|
-
project_primary_key(incoming_scopes.first.grouping_primary_key_fields,
|
140
|
-
incoming_scopes.first.grouping_fields.to_a,
|
141
|
-
scope.grouping_fields.to_a)
|
142
|
-
when Java::CascadingPipe::Every
|
143
|
-
# assert incoming_scopes.size == 1
|
144
|
-
incoming_scopes.first.grouping_primary_key_fields
|
145
|
-
when Java::CascadingPipe::GroupBy
|
146
|
-
scope.grouping_key_fields
|
147
|
-
when Java::CascadingPipe::CoGroup
|
148
|
-
scope.grouping_key_fields
|
149
|
-
else raise "No primary key rules for FlowElement of type #{flow_element}"
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
# Register default primary keys
|
155
|
-
begin
|
156
|
-
Scope.register_scheme_key(Java::CascadingScheme::TextLine, ['offset'])
|
157
|
-
rescue NameError => ne
|
158
|
-
puts 'WARNING: Could not register primary key for TextLine Scheme as it was not on the class path'
|
159
75
|
end
|
160
76
|
end
|
data/lib/cascading.rb
CHANGED
data/spec/jruby_version_spec.rb
CHANGED
data/spec/spec_util.rb
CHANGED
@@ -7,22 +7,12 @@ module ScopeTests
|
|
7
7
|
scope = scope(*name_params)
|
8
8
|
values_fields = params[:values_fields]
|
9
9
|
grouping_fields = params[:grouping_fields] || values_fields
|
10
|
-
primary_key_fields = params[:primary_key_fields]
|
11
|
-
grouping_primary_key_fields = primary_key_fields
|
12
|
-
grouping_primary_key_fields = params[:grouping_primary_key_fields] if params.has_key?(:grouping_primary_key_fields)
|
13
10
|
|
14
11
|
debug = params[:debug]
|
15
12
|
debug_scope(*name_params) if debug
|
16
13
|
|
17
14
|
scope.values_fields.to_a.should == values_fields
|
18
15
|
scope.grouping_fields.to_a.should == grouping_fields
|
19
|
-
if params.has_key?(:primary_key_fields) # Must support nil values
|
20
|
-
scope.primary_key_fields.should == nil if primary_key_fields.nil?
|
21
|
-
scope.primary_key_fields.to_a.should == primary_key_fields unless primary_key_fields.nil?
|
22
|
-
|
23
|
-
scope.grouping_primary_key_fields.should == nil if grouping_primary_key_fields.nil?
|
24
|
-
scope.grouping_primary_key_fields.to_a.should == grouping_primary_key_fields unless grouping_primary_key_fields.nil?
|
25
|
-
end
|
26
16
|
end
|
27
17
|
end
|
28
18
|
|
@@ -135,54 +125,3 @@ def cascading_properties
|
|
135
125
|
Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
|
136
126
|
properties
|
137
127
|
end
|
138
|
-
|
139
|
-
def verify_assembly_output(assembly_name, params, &block)
|
140
|
-
`rm -rf spec_output`
|
141
|
-
|
142
|
-
Cascade.new("foo") do
|
143
|
-
flow("bar") do
|
144
|
-
source assembly_name, tap(params[:source], params.slice(:scheme))
|
145
|
-
assembly = assembly(assembly_name)
|
146
|
-
sink assembly_name, tap("spec_output", :kind => :lfs, :sink_mode => :replace)
|
147
|
-
end
|
148
|
-
end.complete(@properties)
|
149
|
-
|
150
|
-
output_data = nil
|
151
|
-
|
152
|
-
File.open("spec_output/part-00000") do |f|
|
153
|
-
output_data = f.readlines
|
154
|
-
end
|
155
|
-
|
156
|
-
if params[:length]
|
157
|
-
output_data.size.should == params[:length]
|
158
|
-
end
|
159
|
-
|
160
|
-
keys = assembly.scope.values_fields
|
161
|
-
if block_given?
|
162
|
-
output_data.each do |line|
|
163
|
-
values = line.chomp.split(/\t/)
|
164
|
-
|
165
|
-
yield(keys.zip(values).inject({}) do |map, kv|
|
166
|
-
map[kv[0].to_sym] = kv[1]
|
167
|
-
map
|
168
|
-
end)
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
def describe_job(job_file, &block)
|
174
|
-
context Object do
|
175
|
-
before(:each) do
|
176
|
-
@properties = cascading_properties
|
177
|
-
# Must artificially fill ARGV to prevent errors when creating multi-taps
|
178
|
-
# in ETL cascade
|
179
|
-
ARGV.clear
|
180
|
-
10.times do
|
181
|
-
ARGV << 'text_line_scheme' # Dummy value, required for 3rd arg
|
182
|
-
end
|
183
|
-
load "lib/jobs/#{job_file}/#{job_file}.rb"
|
184
|
-
end
|
185
|
-
|
186
|
-
self.class_eval(&block)
|
187
|
-
end
|
188
|
-
end
|