cascading.jruby 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/HACKING.md CHANGED
@@ -4,7 +4,7 @@ Some hacking info on `cascading.jruby`:
4
4
 
5
5
  `cascading.jruby` can be packaged as a gem. To do so, you must generate the necessary packaging files:
6
6
 
7
- ant build; jruby -S rake gem
7
+ jruby -S rake gem
8
8
 
9
9
  will produce the gem in the pkg/ sub-directory. After that, just cd to this directory and:
10
10
 
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ 0.0.6 - Removing primary key
2
+
3
+ The primary key feature was a source of great confusion at Etsy, so it's been
4
+ removed. This release also closes several issues, warns about potential node
5
+ name conflicts, and updates the tests to work under JRuby 1.6.5.
6
+
1
7
  0.0.5 - Addressing Janino pain
2
8
 
3
9
  This release expands upon the ExprStub class adding composition time compilation
data/README.md CHANGED
@@ -4,6 +4,6 @@
4
4
 
5
5
  It requires Hadoop (>= 0.18.3) and Cascading (>= 1.0.1), whose install locations must be set via the environment variables `HADOOP_HOME` and `CASCADING_HOME`.
6
6
 
7
- It has been tested on JRuby versions 1.2.0, 1.4.0, and 1.5.3.
7
+ It has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, and 1.6.5.
8
8
 
9
9
  Copyright 2009, Grégoire Marabout.
data/TODO ADDED
@@ -0,0 +1,14 @@
1
+ Documentation
2
+
3
+ Assembly -> Each/Every refactor
4
+ Look into totally eliminating registries
5
+
6
+ Bug fixes on github
7
+ Enforce more runtime rules at composition time
8
+ Standardize helper contracts
9
+ Upgrade Cascading (already upgraded JRuby)
10
+ Possibly combine unit tests...into unit tests because RSpec sucks and swallows stack traces
11
+
12
+ Split out runner
13
+ Make runner implement Tool
14
+ Create build tool for job jar
@@ -51,16 +51,6 @@ module Cascading
51
51
  puts "Current scope for '#{name}':\n #{scope}\n----------\n"
52
52
  end
53
53
 
54
- def primary(*args)
55
- options = args.extract_options!
56
- if args.size > 0 && args[0] != nil
57
- scope.primary_key_fields = fields(args)
58
- else
59
- scope.primary_key_fields = nil
60
- end
61
- scope.grouping_primary_key_fields = scope.primary_key_fields
62
- end
63
-
64
54
  def make_each(type, *parameters)
65
55
  make_pipe(type, parameters)
66
56
  @every_applied = false
@@ -82,26 +72,6 @@ module Cascading
82
72
  # that only allows aggregation and buffer operations.
83
73
  instance_eval &block
84
74
 
85
- # First all non-primary key fields from each pipe if its primary key is a
86
- # subset of the grouping primary key
87
- first_fields = incoming_scopes.map do |scope|
88
- if scope.primary_key_fields
89
- primary_key = scope.primary_key_fields.to_a
90
- grouping_primary_key = scope.grouping_primary_key_fields.to_a
91
- if (primary_key & grouping_primary_key) == primary_key
92
- difference_fields(scope.values_fields, scope.primary_key_fields).to_a
93
- end
94
- end
95
- end.compact.flatten
96
- # assert first_fields == first_fields.uniq
97
-
98
- # Do not first any fields explicitly aggregated over
99
- first_fields = first_fields - scope.grouping_fields.to_a
100
- if first_fields.size > 0
101
- first *first_fields
102
- puts "Firsting: #{first_fields.inspect} in assembly: #{name}"
103
- end
104
-
105
75
  bind_names scope.grouping_fields.to_a if every_applied?
106
76
  end
107
77
 
@@ -321,11 +291,7 @@ module Cascading
321
291
  invalid = name_map.keys.sort - old_names
322
292
  raise "invalid names: #{invalid.inspect}" unless invalid.empty?
323
293
 
324
- old_key = scope.primary_key_fields.to_a
325
- new_key = old_key.map{ |name| name_map[name] || name }
326
-
327
294
  each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
328
- primary(*new_key)
329
295
  end
330
296
 
331
297
  def cast(type_map)
@@ -409,7 +375,8 @@ module Cascading
409
375
  # insert) and an options hash.
410
376
  #
411
377
  # Options include:
412
- # * <tt>:sql</tt> a boolean indicating whether the operation should act like the SQL equivalent
378
+ # * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
379
+ # (for first and last) of values for the aggregator to ignore
413
380
  #
414
381
  # <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
415
382
  def composite_aggregator(args, function)
@@ -685,9 +652,11 @@ module Cascading
685
652
  raise "Regex not allowed" if options && options[:pattern]
686
653
 
687
654
  if options[:expression]
688
- options[:expression] = "!(#{options[:expression]})"
655
+ _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
656
+ options[:expression] = "#{imports}!(#{expr})"
689
657
  elsif args[0]
690
- args[0] = "!(#{args[0]})"
658
+ _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
659
+ args[0] = "#{imports}!(#{expr})"
691
660
  end
692
661
 
693
662
  filter(*args)
@@ -15,12 +15,19 @@ module Cascading
15
15
  end
16
16
 
17
17
  def add_child(node)
18
+ child = root.find_child(node.name)
19
+ warn "WARNING: adding '#{node.qualified_name}', but node named '#{node.name}' already exists at '#{child.qualified_name}'" if child
20
+
18
21
  @children[node.name] = node
19
22
  @child_names << node.name
20
23
  @last_child = node
21
24
  node
22
25
  end
23
26
 
27
+ def qualified_name
28
+ parent ? "#{parent.qualified_name}.#{name}" : name
29
+ end
30
+
24
31
  def describe(offset = '')
25
32
  "#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
26
33
  end
@@ -32,7 +39,12 @@ module Cascading
32
39
  result = child.find_child(name)
33
40
  return result if result
34
41
  end
35
- return nil
42
+ nil
43
+ end
44
+
45
+ def root
46
+ return self unless parent
47
+ parent.root
36
48
  end
37
49
  end
38
50
 
@@ -57,6 +69,7 @@ module Cascading
57
69
 
58
70
  def add(name, instance)
59
71
  @registered ||= {}
72
+ warn "WARNING: node named '#{name}' already registered in #{self}" if @registered[name]
60
73
  @registered[name] = instance
61
74
  end
62
75
 
@@ -20,6 +20,7 @@ module Cascading
20
20
 
21
21
  # For applications built of Flows with no Cascades
22
22
  def flow(name, &block)
23
+ raise "Could not build flow '#{name}'; block required" unless block_given?
23
24
  flow = Flow.new(name, nil)
24
25
  flow.instance_eval(&block)
25
26
  flow
@@ -75,7 +75,6 @@ module Cascading
75
75
  raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name]
76
76
  sink_metadata[sink_name] = {
77
77
  :field_names => @outgoing_scopes[sink_name].values_fields.to_a,
78
- :primary_key => @outgoing_scopes[sink_name].primary_key_fields.to_a
79
78
  }
80
79
  sink_metadata
81
80
  end
@@ -135,7 +134,16 @@ module Cascading
135
134
  end
136
135
 
137
136
  def connect(properties = nil)
138
- properties = java.util.HashMap.new(properties || @properties)
137
+ # This ensures we have a hash, and that it is a Ruby Hash (because we
138
+ # also accept java.util.HashMap), then merges it with Flow properties
139
+ properties ||= {}
140
+ properties = java.util.HashMap.new(@properties.merge(Hash[*properties.to_a.flatten]))
141
+
142
+ puts "Connecting flow '#{name}' with properties:"
143
+ properties.key_set.to_a.sort.each do |key|
144
+ puts "#{key}=#{properties[key]}"
145
+ end
146
+
139
147
  Java::CascadingFlow::FlowConnector.new(properties).connect(
140
148
  name,
141
149
  make_tap_parameter(@sources),
@@ -19,8 +19,14 @@ module Cascading
19
19
 
20
20
  def aggregator_function(args, aggregator_klass)
21
21
  options = args.extract_options!
22
- ignore_values = options[:sql] ? [nil].to_java(java.lang.Object) : nil
23
- parameters = [Cascading.fields(args), ignore_values].compact
22
+ ignore = options[:ignore]
23
+ raise "Option 'ignore' is only supported by min, max, first, and last" if ignore && ![
24
+ Java::CascadingOperationAggregator::First,
25
+ Java::CascadingOperationAggregator::Min,
26
+ Java::CascadingOperationAggregator::Max,
27
+ Java::CascadingOperationAggregator::Last,
28
+ ].include?(aggregator_klass)
29
+ parameters = [Cascading.fields(args), ignore].compact
24
30
  aggregator_klass.new(*parameters)
25
31
  end
26
32
 
@@ -1,25 +1,14 @@
1
1
  module Cascading
2
2
  class Scope
3
- attr_accessor :scope, :grouping_key_fields, :primary_key_fields, :grouping_primary_key_fields
4
- @@scheme_keys = {}
3
+ attr_accessor :scope, :grouping_key_fields
5
4
 
6
5
  def initialize(scope, params = {})
7
6
  @scope = scope
8
7
  @grouping_key_fields = fields(params[:grouping_key_fields] || [])
9
- @primary_key_fields = fields(params[:primary_key_fields])
10
- @grouping_primary_key_fields = fields(params[:grouping_primary_key_fields])
11
8
  end
12
9
 
13
10
  def copy
14
- Scope.new(Java::CascadingFlow::Scope.new(@scope),
15
- :grouping_key_fields => @grouping_key_fields,
16
- :primary_key_fields => @primary_key_fields,
17
- :grouping_primary_key_fields => @grouping_primary_key_fields
18
- )
19
- end
20
-
21
- def self.register_scheme_key(scheme, primary_key)
22
- @@scheme_keys[scheme] = primary_key
11
+ Scope.new(Java::CascadingFlow::Scope.new(@scope), :grouping_key_fields => @grouping_key_fields)
23
12
  end
24
13
 
25
14
  def self.empty_scope(name)
@@ -30,26 +19,14 @@ module Cascading
30
19
  java_scope = outgoing_scope_for(tap, java.util.HashSet.new)
31
20
  # Taps and Pipes don't name their outgoing scopes like other FlowElements
32
21
  java_scope.name = name
33
- scope = Scope.new(java_scope,
34
- :primary_key_fields => @@scheme_keys[tap.scheme.class],
35
- :grouping_primary_key_fields => @@scheme_keys[tap.scheme.class]
36
- )
37
- vf, gf = scope.values_fields.to_a, scope.grouping_fields.to_a
38
- pk, gpk = scope.primary_key_fields.to_a, scope.grouping_primary_key_fields.to_a
39
- raise "Primary key must be a subset of available fields (primary key: #{pk.inspect}, values fields: #{vf.inspect})" unless vf & pk == pk
40
- raise "Grouping primary key must be a subset of available fields (grouping primary key: #{gpk.inspect}, grouping fields: #{gf.inspect})" unless gf & gpk == gpk
41
- scope
22
+ Scope.new(java_scope)
42
23
  end
43
24
 
44
25
  def self.outgoing_scope(flow_element, incoming_scopes, grouping_key_fields, every_applied)
45
26
  java_scopes = incoming_scopes.compact.map{ |s| s.scope }
46
- scope = Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
27
+ Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
47
28
  :grouping_key_fields => grouping_key_fields
48
29
  )
49
- scope.grouping_primary_key_fields = fields(grouping_primary_key_fields(flow_element, incoming_scopes, scope))
50
- scope.primary_key_fields = scope.grouping_primary_key_fields if every_applied
51
- scope.primary_key_fields = fields(primary_key_fields(flow_element, incoming_scopes, scope)) unless every_applied
52
- scope
53
30
  end
54
31
 
55
32
  def values_fields
@@ -80,11 +57,9 @@ Scope name: #{@scope.name}
80
57
  selector: #{@scope.out_grouping_selector}
81
58
  fields: #{grouping_fields}
82
59
  key fields: #{@grouping_key_fields}
83
- primary key fields: #{@grouping_primary_key_fields}
84
60
  Out values
85
61
  selector: #{@scope.out_values_selector}
86
62
  fields: #{values_fields}
87
- primary key fields: #{@primary_key_fields}
88
63
  END
89
64
  end
90
65
 
@@ -97,64 +72,5 @@ END
97
72
  raise CascadingException.new(e, 'Exception computing outgoing scope')
98
73
  end
99
74
  end
100
-
101
- def self.primary_key_fields(flow_element, incoming_scopes, scope)
102
- case flow_element
103
- when Java::CascadingPipe::Each
104
- # assert incoming_scopes.size == 1
105
- project_primary_key(incoming_scopes.first.primary_key_fields,
106
- incoming_scopes.first.values_fields.to_a,
107
- scope.values_fields.to_a)
108
- when Java::CascadingPipe::Every
109
- # assert incoming_scopes.size == 1
110
- incoming_scopes.first.primary_key_fields
111
- when Java::CascadingPipe::GroupBy
112
- if incoming_scopes.size == 1
113
- incoming_scopes.first.primary_key_fields
114
- else
115
- # We must clear the primary key when unioning multiple inputs. If
116
- # the programmer wants to preserve the primary key, they must use
117
- # the primary override.
118
- nil
119
- end
120
- when Java::CascadingPipe::CoGroup
121
- # FIXME: assume grouping_key_fields are the same for all
122
- # incoming_scopes. Need join to give me names from all incoming
123
- # scopes to perform rename on primary key fields.
124
- union_fields(*incoming_scopes.map{ |s| s.primary_key_fields })
125
- else raise "No primary key rules for FlowElement of type #{flow_element}"
126
- end
127
- end
128
-
129
- def self.project_primary_key(primary_key, old_fields, new_fields)
130
- return nil if primary_key.nil?
131
- primary_key = primary_key.to_a
132
- primary_key if (primary_key & new_fields) == primary_key
133
- end
134
-
135
- def self.grouping_primary_key_fields(flow_element, incoming_scopes, scope)
136
- case flow_element
137
- when Java::CascadingPipe::Each
138
- # assert incoming_scopes.size == 1
139
- project_primary_key(incoming_scopes.first.grouping_primary_key_fields,
140
- incoming_scopes.first.grouping_fields.to_a,
141
- scope.grouping_fields.to_a)
142
- when Java::CascadingPipe::Every
143
- # assert incoming_scopes.size == 1
144
- incoming_scopes.first.grouping_primary_key_fields
145
- when Java::CascadingPipe::GroupBy
146
- scope.grouping_key_fields
147
- when Java::CascadingPipe::CoGroup
148
- scope.grouping_key_fields
149
- else raise "No primary key rules for FlowElement of type #{flow_element}"
150
- end
151
- end
152
- end
153
-
154
- # Register default primary keys
155
- begin
156
- Scope.register_scheme_key(Java::CascadingScheme::TextLine, ['offset'])
157
- rescue NameError => ne
158
- puts 'WARNING: Could not register primary key for TextLine Scheme as it was not on the class path'
159
75
  end
160
76
  end
data/lib/cascading.rb CHANGED
@@ -6,7 +6,7 @@ require 'java'
6
6
 
7
7
  module Cascading
8
8
  # :stopdoc:
9
- VERSION = '0.0.5'
9
+ VERSION = '0.0.6'
10
10
  LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
11
11
  PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
12
12
  CASCADING_HOME = ENV['CASCADING_HOME']
@@ -54,7 +54,7 @@ context Object do
54
54
  result = e.validate
55
55
  result.should == 0
56
56
  end
57
- when '1.5.3'
57
+ when '1.5.3', '1.6.5'
58
58
  it 'should handle Fixnum -> Integer for ExprStub#eval' do
59
59
  e = ExprStub.new('x:int + y:int')
60
60
  result = e.eval(:x => 2, :y => 3)
data/spec/spec_util.rb CHANGED
@@ -7,22 +7,12 @@ module ScopeTests
7
7
  scope = scope(*name_params)
8
8
  values_fields = params[:values_fields]
9
9
  grouping_fields = params[:grouping_fields] || values_fields
10
- primary_key_fields = params[:primary_key_fields]
11
- grouping_primary_key_fields = primary_key_fields
12
- grouping_primary_key_fields = params[:grouping_primary_key_fields] if params.has_key?(:grouping_primary_key_fields)
13
10
 
14
11
  debug = params[:debug]
15
12
  debug_scope(*name_params) if debug
16
13
 
17
14
  scope.values_fields.to_a.should == values_fields
18
15
  scope.grouping_fields.to_a.should == grouping_fields
19
- if params.has_key?(:primary_key_fields) # Must support nil values
20
- scope.primary_key_fields.should == nil if primary_key_fields.nil?
21
- scope.primary_key_fields.to_a.should == primary_key_fields unless primary_key_fields.nil?
22
-
23
- scope.grouping_primary_key_fields.should == nil if grouping_primary_key_fields.nil?
24
- scope.grouping_primary_key_fields.to_a.should == grouping_primary_key_fields unless grouping_primary_key_fields.nil?
25
- end
26
16
  end
27
17
  end
28
18
 
@@ -135,54 +125,3 @@ def cascading_properties
135
125
  Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
136
126
  properties
137
127
  end
138
-
139
- def verify_assembly_output(assembly_name, params, &block)
140
- `rm -rf spec_output`
141
-
142
- Cascade.new("foo") do
143
- flow("bar") do
144
- source assembly_name, tap(params[:source], params.slice(:scheme))
145
- assembly = assembly(assembly_name)
146
- sink assembly_name, tap("spec_output", :kind => :lfs, :sink_mode => :replace)
147
- end
148
- end.complete(@properties)
149
-
150
- output_data = nil
151
-
152
- File.open("spec_output/part-00000") do |f|
153
- output_data = f.readlines
154
- end
155
-
156
- if params[:length]
157
- output_data.size.should == params[:length]
158
- end
159
-
160
- keys = assembly.scope.values_fields
161
- if block_given?
162
- output_data.each do |line|
163
- values = line.chomp.split(/\t/)
164
-
165
- yield(keys.zip(values).inject({}) do |map, kv|
166
- map[kv[0].to_sym] = kv[1]
167
- map
168
- end)
169
- end
170
- end
171
- end
172
-
173
- def describe_job(job_file, &block)
174
- context Object do
175
- before(:each) do
176
- @properties = cascading_properties
177
- # Must artificially fill ARGV to prevent errors when creating multi-taps
178
- # in ETL cascade
179
- ARGV.clear
180
- 10.times do
181
- ARGV << 'text_line_scheme' # Dummy value, required for 3rd arg
182
- end
183
- load "lib/jobs/#{job_file}/#{job_file}.rb"
184
- end
185
-
186
- self.class_eval(&block)
187
- end
188
- end