cascading.jruby 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/HACKING.md CHANGED
@@ -4,7 +4,7 @@ Some hacking info on `cascading.jruby`:
4
4
 
5
5
  `cascading.jruby` can be packaged as a gem. To do so, you must generate the necessary packaging files:
6
6
 
7
- ant build; jruby -S rake gem
7
+ jruby -S rake gem
8
8
 
9
9
  will produce the gem in the pkg/ sub-directory. After that, just cd to this directory and:
10
10
 
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ 0.0.6 - Removing primary key
2
+
3
+ The primary key feature was a source of great confusion at Etsy, so it's been
4
+ removed. This release also closes several issues, warns about potential node
5
+ name conflicts, and updates the tests to work under JRuby 1.6.5.
6
+
1
7
  0.0.5 - Addressing Janino pain
2
8
 
3
9
  This release expands upon the ExprStub class adding composition time compilation
data/README.md CHANGED
@@ -4,6 +4,6 @@
4
4
 
5
5
  It requires Hadoop (>= 0.18.3) and Cascading (>=1.0.1) to be set via the environment variables: `HADOOP_HOME` and `CASCADING_HOME`
6
6
 
7
- It has been tested on JRuby versions 1.2.0, 1.4.0, and 1.5.3.
7
+ It has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, and 1.6.5.
8
8
 
9
9
  Copyright 2009, Grégoire Marabout.
data/TODO ADDED
@@ -0,0 +1,14 @@
1
+ Documentation
2
+
3
+ Assembly -> Each/Every refactor
4
+ Look into totally eliminating registries
5
+
6
+ Bug fixes on github
7
+ Enforce more runtime rules at composition time
8
+ Standardize helper contracts
9
+ Upgrade Cascading (already upgraded JRuby)
10
+ Possibly combine unit tests...into unit tests because RSpec sucks and swallows stack traces
11
+
12
+ Split out runner
13
+ Make runner implement Tool
14
+ Create build tool for job jar
@@ -51,16 +51,6 @@ module Cascading
51
51
  puts "Current scope for '#{name}':\n #{scope}\n----------\n"
52
52
  end
53
53
 
54
- def primary(*args)
55
- options = args.extract_options!
56
- if args.size > 0 && args[0] != nil
57
- scope.primary_key_fields = fields(args)
58
- else
59
- scope.primary_key_fields = nil
60
- end
61
- scope.grouping_primary_key_fields = scope.primary_key_fields
62
- end
63
-
64
54
  def make_each(type, *parameters)
65
55
  make_pipe(type, parameters)
66
56
  @every_applied = false
@@ -82,26 +72,6 @@ module Cascading
82
72
  # that only allows aggregation and buffer operations.
83
73
  instance_eval &block
84
74
 
85
- # First all non-primary key fields from each pipe if its primary key is a
86
- # subset of the grouping primary key
87
- first_fields = incoming_scopes.map do |scope|
88
- if scope.primary_key_fields
89
- primary_key = scope.primary_key_fields.to_a
90
- grouping_primary_key = scope.grouping_primary_key_fields.to_a
91
- if (primary_key & grouping_primary_key) == primary_key
92
- difference_fields(scope.values_fields, scope.primary_key_fields).to_a
93
- end
94
- end
95
- end.compact.flatten
96
- # assert first_fields == first_fields.uniq
97
-
98
- # Do not first any fields explicitly aggregated over
99
- first_fields = first_fields - scope.grouping_fields.to_a
100
- if first_fields.size > 0
101
- first *first_fields
102
- puts "Firsting: #{first_fields.inspect} in assembly: #{name}"
103
- end
104
-
105
75
  bind_names scope.grouping_fields.to_a if every_applied?
106
76
  end
107
77
 
@@ -321,11 +291,7 @@ module Cascading
321
291
  invalid = name_map.keys.sort - old_names
322
292
  raise "invalid names: #{invalid.inspect}" unless invalid.empty?
323
293
 
324
- old_key = scope.primary_key_fields.to_a
325
- new_key = old_key.map{ |name| name_map[name] || name }
326
-
327
294
  each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
328
- primary(*new_key)
329
295
  end
330
296
 
331
297
  def cast(type_map)
@@ -409,7 +375,8 @@ module Cascading
409
375
  # insert) and an options hash.
410
376
  #
411
377
  # Options include:
412
- # * <tt>:sql</tt> a boolean indicating whether the operation should act like the SQL equivalent
378
+ # * <tt>:ignore</tt> a Java Array of Objects (for min and max) or Tuples
379
+ # (for first and last) of values for the aggregator to ignore
413
380
  #
414
381
  # <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
415
382
  def composite_aggregator(args, function)
@@ -685,9 +652,11 @@ module Cascading
685
652
  raise "Regex not allowed" if options && options[:pattern]
686
653
 
687
654
  if options[:expression]
688
- options[:expression] = "!(#{options[:expression]})"
655
+ _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
656
+ options[:expression] = "#{imports}!(#{expr})"
689
657
  elsif args[0]
690
- args[0] = "!(#{args[0]})"
658
+ _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
659
+ args[0] = "#{imports}!(#{expr})"
691
660
  end
692
661
 
693
662
  filter(*args)
@@ -15,12 +15,19 @@ module Cascading
15
15
  end
16
16
 
17
17
  def add_child(node)
18
+ child = root.find_child(node.name)
19
+ warn "WARNING: adding '#{node.qualified_name}', but node named '#{node.name}' already exists at '#{child.qualified_name}'" if child
20
+
18
21
  @children[node.name] = node
19
22
  @child_names << node.name
20
23
  @last_child = node
21
24
  node
22
25
  end
23
26
 
27
+ def qualified_name
28
+ parent ? "#{parent.qualified_name}.#{name}" : name
29
+ end
30
+
24
31
  def describe(offset = '')
25
32
  "#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
26
33
  end
@@ -32,7 +39,12 @@ module Cascading
32
39
  result = child.find_child(name)
33
40
  return result if result
34
41
  end
35
- return nil
42
+ nil
43
+ end
44
+
45
+ def root
46
+ return self unless parent
47
+ parent.root
36
48
  end
37
49
  end
38
50
 
@@ -57,6 +69,7 @@ module Cascading
57
69
 
58
70
  def add(name, instance)
59
71
  @registered ||= {}
72
+ warn "WARNING: node named '#{name}' already registered in #{self}" if @registered[name]
60
73
  @registered[name] = instance
61
74
  end
62
75
 
@@ -20,6 +20,7 @@ module Cascading
20
20
 
21
21
  # For applications built of Flows with no Cascades
22
22
  def flow(name, &block)
23
+ raise "Could not build flow '#{name}'; block required" unless block_given?
23
24
  flow = Flow.new(name, nil)
24
25
  flow.instance_eval(&block)
25
26
  flow
@@ -75,7 +75,6 @@ module Cascading
75
75
  raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name]
76
76
  sink_metadata[sink_name] = {
77
77
  :field_names => @outgoing_scopes[sink_name].values_fields.to_a,
78
- :primary_key => @outgoing_scopes[sink_name].primary_key_fields.to_a
79
78
  }
80
79
  sink_metadata
81
80
  end
@@ -135,7 +134,16 @@ module Cascading
135
134
  end
136
135
 
137
136
  def connect(properties = nil)
138
- properties = java.util.HashMap.new(properties || @properties)
137
+ # This ensures we have a hash, and that it is a Ruby Hash (because we
138
+ # also accept java.util.HashMap), then merges it with Flow properties
139
+ properties ||= {}
140
+ properties = java.util.HashMap.new(@properties.merge(Hash[*properties.to_a.flatten]))
141
+
142
+ puts "Connecting flow '#{name}' with properties:"
143
+ properties.key_set.to_a.sort.each do |key|
144
+ puts "#{key}=#{properties[key]}"
145
+ end
146
+
139
147
  Java::CascadingFlow::FlowConnector.new(properties).connect(
140
148
  name,
141
149
  make_tap_parameter(@sources),
@@ -19,8 +19,14 @@ module Cascading
19
19
 
20
20
  def aggregator_function(args, aggregator_klass)
21
21
  options = args.extract_options!
22
- ignore_values = options[:sql] ? [nil].to_java(java.lang.Object) : nil
23
- parameters = [Cascading.fields(args), ignore_values].compact
22
+ ignore = options[:ignore]
23
+ raise "Option 'ignore' is only supported by min, max, first, and last" if ignore && ![
24
+ Java::CascadingOperationAggregator::First,
25
+ Java::CascadingOperationAggregator::Min,
26
+ Java::CascadingOperationAggregator::Max,
27
+ Java::CascadingOperationAggregator::Last,
28
+ ].include?(aggregator_klass)
29
+ parameters = [Cascading.fields(args), ignore].compact
24
30
  aggregator_klass.new(*parameters)
25
31
  end
26
32
 
@@ -1,25 +1,14 @@
1
1
  module Cascading
2
2
  class Scope
3
- attr_accessor :scope, :grouping_key_fields, :primary_key_fields, :grouping_primary_key_fields
4
- @@scheme_keys = {}
3
+ attr_accessor :scope, :grouping_key_fields
5
4
 
6
5
  def initialize(scope, params = {})
7
6
  @scope = scope
8
7
  @grouping_key_fields = fields(params[:grouping_key_fields] || [])
9
- @primary_key_fields = fields(params[:primary_key_fields])
10
- @grouping_primary_key_fields = fields(params[:grouping_primary_key_fields])
11
8
  end
12
9
 
13
10
  def copy
14
- Scope.new(Java::CascadingFlow::Scope.new(@scope),
15
- :grouping_key_fields => @grouping_key_fields,
16
- :primary_key_fields => @primary_key_fields,
17
- :grouping_primary_key_fields => @grouping_primary_key_fields
18
- )
19
- end
20
-
21
- def self.register_scheme_key(scheme, primary_key)
22
- @@scheme_keys[scheme] = primary_key
11
+ Scope.new(Java::CascadingFlow::Scope.new(@scope), :grouping_key_fields => @grouping_key_fields)
23
12
  end
24
13
 
25
14
  def self.empty_scope(name)
@@ -30,26 +19,14 @@ module Cascading
30
19
  java_scope = outgoing_scope_for(tap, java.util.HashSet.new)
31
20
  # Taps and Pipes don't name their outgoing scopes like other FlowElements
32
21
  java_scope.name = name
33
- scope = Scope.new(java_scope,
34
- :primary_key_fields => @@scheme_keys[tap.scheme.class],
35
- :grouping_primary_key_fields => @@scheme_keys[tap.scheme.class]
36
- )
37
- vf, gf = scope.values_fields.to_a, scope.grouping_fields.to_a
38
- pk, gpk = scope.primary_key_fields.to_a, scope.grouping_primary_key_fields.to_a
39
- raise "Primary key must be a subset of available fields (primary key: #{pk.inspect}, values fields: #{vf.inspect})" unless vf & pk == pk
40
- raise "Grouping primary key must be a subset of available fields (grouping primary key: #{gpk.inspect}, grouping fields: #{gf.inspect})" unless gf & gpk == gpk
41
- scope
22
+ Scope.new(java_scope)
42
23
  end
43
24
 
44
25
  def self.outgoing_scope(flow_element, incoming_scopes, grouping_key_fields, every_applied)
45
26
  java_scopes = incoming_scopes.compact.map{ |s| s.scope }
46
- scope = Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
27
+ Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
47
28
  :grouping_key_fields => grouping_key_fields
48
29
  )
49
- scope.grouping_primary_key_fields = fields(grouping_primary_key_fields(flow_element, incoming_scopes, scope))
50
- scope.primary_key_fields = scope.grouping_primary_key_fields if every_applied
51
- scope.primary_key_fields = fields(primary_key_fields(flow_element, incoming_scopes, scope)) unless every_applied
52
- scope
53
30
  end
54
31
 
55
32
  def values_fields
@@ -80,11 +57,9 @@ Scope name: #{@scope.name}
80
57
  selector: #{@scope.out_grouping_selector}
81
58
  fields: #{grouping_fields}
82
59
  key fields: #{@grouping_key_fields}
83
- primary key fields: #{@grouping_primary_key_fields}
84
60
  Out values
85
61
  selector: #{@scope.out_values_selector}
86
62
  fields: #{values_fields}
87
- primary key fields: #{@primary_key_fields}
88
63
  END
89
64
  end
90
65
 
@@ -97,64 +72,5 @@ END
97
72
  raise CascadingException.new(e, 'Exception computing outgoing scope')
98
73
  end
99
74
  end
100
-
101
- def self.primary_key_fields(flow_element, incoming_scopes, scope)
102
- case flow_element
103
- when Java::CascadingPipe::Each
104
- # assert incoming_scopes.size == 1
105
- project_primary_key(incoming_scopes.first.primary_key_fields,
106
- incoming_scopes.first.values_fields.to_a,
107
- scope.values_fields.to_a)
108
- when Java::CascadingPipe::Every
109
- # assert incoming_scopes.size == 1
110
- incoming_scopes.first.primary_key_fields
111
- when Java::CascadingPipe::GroupBy
112
- if incoming_scopes.size == 1
113
- incoming_scopes.first.primary_key_fields
114
- else
115
- # We must clear the primary key when unioning multiple inputs. If
116
- # the programmer wants to preserve the primary key, they must use
117
- # the primary override.
118
- nil
119
- end
120
- when Java::CascadingPipe::CoGroup
121
- # FIXME: assume grouping_key_fields are the same for all
122
- # incoming_scopes. Need join to give me names from all incoming
123
- # scopes to perform rename on primary key fields.
124
- union_fields(*incoming_scopes.map{ |s| s.primary_key_fields })
125
- else raise "No primary key rules for FlowElement of type #{flow_element}"
126
- end
127
- end
128
-
129
- def self.project_primary_key(primary_key, old_fields, new_fields)
130
- return nil if primary_key.nil?
131
- primary_key = primary_key.to_a
132
- primary_key if (primary_key & new_fields) == primary_key
133
- end
134
-
135
- def self.grouping_primary_key_fields(flow_element, incoming_scopes, scope)
136
- case flow_element
137
- when Java::CascadingPipe::Each
138
- # assert incoming_scopes.size == 1
139
- project_primary_key(incoming_scopes.first.grouping_primary_key_fields,
140
- incoming_scopes.first.grouping_fields.to_a,
141
- scope.grouping_fields.to_a)
142
- when Java::CascadingPipe::Every
143
- # assert incoming_scopes.size == 1
144
- incoming_scopes.first.grouping_primary_key_fields
145
- when Java::CascadingPipe::GroupBy
146
- scope.grouping_key_fields
147
- when Java::CascadingPipe::CoGroup
148
- scope.grouping_key_fields
149
- else raise "No primary key rules for FlowElement of type #{flow_element}"
150
- end
151
- end
152
- end
153
-
154
- # Register default primary keys
155
- begin
156
- Scope.register_scheme_key(Java::CascadingScheme::TextLine, ['offset'])
157
- rescue NameError => ne
158
- puts 'WARNING: Could not register primary key for TextLine Scheme as it was not on the class path'
159
75
  end
160
76
  end
data/lib/cascading.rb CHANGED
@@ -6,7 +6,7 @@ require 'java'
6
6
 
7
7
  module Cascading
8
8
  # :stopdoc:
9
- VERSION = '0.0.5'
9
+ VERSION = '0.0.6'
10
10
  LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
11
11
  PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
12
12
  CASCADING_HOME = ENV['CASCADING_HOME']
@@ -54,7 +54,7 @@ context Object do
54
54
  result = e.validate
55
55
  result.should == 0
56
56
  end
57
- when '1.5.3'
57
+ when '1.5.3', '1.6.5'
58
58
  it 'should handle Fixnum -> Integer for ExprStub#eval' do
59
59
  e = ExprStub.new('x:int + y:int')
60
60
  result = e.eval(:x => 2, :y => 3)
data/spec/spec_util.rb CHANGED
@@ -7,22 +7,12 @@ module ScopeTests
7
7
  scope = scope(*name_params)
8
8
  values_fields = params[:values_fields]
9
9
  grouping_fields = params[:grouping_fields] || values_fields
10
- primary_key_fields = params[:primary_key_fields]
11
- grouping_primary_key_fields = primary_key_fields
12
- grouping_primary_key_fields = params[:grouping_primary_key_fields] if params.has_key?(:grouping_primary_key_fields)
13
10
 
14
11
  debug = params[:debug]
15
12
  debug_scope(*name_params) if debug
16
13
 
17
14
  scope.values_fields.to_a.should == values_fields
18
15
  scope.grouping_fields.to_a.should == grouping_fields
19
- if params.has_key?(:primary_key_fields) # Must support nil values
20
- scope.primary_key_fields.should == nil if primary_key_fields.nil?
21
- scope.primary_key_fields.to_a.should == primary_key_fields unless primary_key_fields.nil?
22
-
23
- scope.grouping_primary_key_fields.should == nil if grouping_primary_key_fields.nil?
24
- scope.grouping_primary_key_fields.to_a.should == grouping_primary_key_fields unless grouping_primary_key_fields.nil?
25
- end
26
16
  end
27
17
  end
28
18
 
@@ -135,54 +125,3 @@ def cascading_properties
135
125
  Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
136
126
  properties
137
127
  end
138
-
139
- def verify_assembly_output(assembly_name, params, &block)
140
- `rm -rf spec_output`
141
-
142
- Cascade.new("foo") do
143
- flow("bar") do
144
- source assembly_name, tap(params[:source], params.slice(:scheme))
145
- assembly = assembly(assembly_name)
146
- sink assembly_name, tap("spec_output", :kind => :lfs, :sink_mode => :replace)
147
- end
148
- end.complete(@properties)
149
-
150
- output_data = nil
151
-
152
- File.open("spec_output/part-00000") do |f|
153
- output_data = f.readlines
154
- end
155
-
156
- if params[:length]
157
- output_data.size.should == params[:length]
158
- end
159
-
160
- keys = assembly.scope.values_fields
161
- if block_given?
162
- output_data.each do |line|
163
- values = line.chomp.split(/\t/)
164
-
165
- yield(keys.zip(values).inject({}) do |map, kv|
166
- map[kv[0].to_sym] = kv[1]
167
- map
168
- end)
169
- end
170
- end
171
- end
172
-
173
- def describe_job(job_file, &block)
174
- context Object do
175
- before(:each) do
176
- @properties = cascading_properties
177
- # Must artificially fill ARGV to prevent errors when creating multi-taps
178
- # in ETL cascade
179
- ARGV.clear
180
- 10.times do
181
- ARGV << 'text_line_scheme' # Dummy value, required for 3rd arg
182
- end
183
- load "lib/jobs/#{job_file}/#{job_file}.rb"
184
- end
185
-
186
- self.class_eval(&block)
187
- end
188
- end