cascading.jruby 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/HACKING.md +15 -0
  2. data/History.txt +0 -0
  3. data/LICENSE.txt +165 -0
  4. data/README.md +7 -0
  5. data/Rakefile +45 -0
  6. data/bin/make_job +81 -0
  7. data/lib/cascading/assembly.rb +726 -0
  8. data/lib/cascading/base.rb +63 -0
  9. data/lib/cascading/cascade.rb +63 -0
  10. data/lib/cascading/cascading.rb +134 -0
  11. data/lib/cascading/cascading_exception.rb +30 -0
  12. data/lib/cascading/expr_stub.rb +33 -0
  13. data/lib/cascading/ext/array.rb +15 -0
  14. data/lib/cascading/flow.rb +168 -0
  15. data/lib/cascading/operations.rb +204 -0
  16. data/lib/cascading/scope.rb +160 -0
  17. data/lib/cascading.rb +63 -0
  18. data/samples/branch.rb +31 -0
  19. data/samples/cascading.rb +41 -0
  20. data/samples/copy.rb +18 -0
  21. data/samples/data/data2.txt +88799 -0
  22. data/samples/data/data_join1.txt +3 -0
  23. data/samples/data/data_join2.txt +3 -0
  24. data/samples/data/data_join3.txt +3 -0
  25. data/samples/join.rb +32 -0
  26. data/samples/logwordcount.rb +22 -0
  27. data/samples/project.rb +24 -0
  28. data/samples/rename.rb +21 -0
  29. data/samples/scorenames.rb +20 -0
  30. data/samples/splitter.rb +20 -0
  31. data/samples/union.rb +35 -0
  32. data/spec/cascading_spec.rb +100 -0
  33. data/spec/expr_spec.rb +10 -0
  34. data/spec/primary_key_spec.rb +119 -0
  35. data/spec/resource/join_input.txt +3 -0
  36. data/spec/resource/test_input.txt +4 -0
  37. data/spec/scope_spec.rb +174 -0
  38. data/spec/spec.opts +6 -0
  39. data/spec/spec_helper.rb +5 -0
  40. data/spec/spec_util.rb +188 -0
  41. data/src/cascading/jruby/Main.java +38 -0
  42. data/src/cascading/jruby/runner.rb +6 -0
  43. data/tags +238 -0
  44. data/tasks/ann.rake +80 -0
  45. data/tasks/ant.rake +11 -0
  46. data/tasks/bones.rake +20 -0
  47. data/tasks/gem.rake +206 -0
  48. data/tasks/git.rake +40 -0
  49. data/tasks/notes.rake +27 -0
  50. data/tasks/post_load.rake +34 -0
  51. data/tasks/rdoc.rake +50 -0
  52. data/tasks/rubyforge.rake +55 -0
  53. data/tasks/samples.rake +13 -0
  54. data/tasks/setup.rb +300 -0
  55. data/tasks/spec.rake +59 -0
  56. data/tasks/svn.rake +47 -0
  57. data/tasks/test.rake +42 -0
  58. data/test/data/data1.txt +14 -0
  59. data/test/data/data2.txt +14 -0
  60. data/test/test_assembly.rb +321 -0
  61. data/test/test_cascading.rb +49 -0
  62. data/test/test_flow.rb +15 -0
  63. metadata +137 -0
@@ -0,0 +1,204 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
module Cascading
  # Factory methods that turn Ruby arguments (field names plus a trailing
  # options hash) into instances of the Java Cascading operation classes.
  #
  # Most factories collect their constructor arguments positionally and drop
  # nil entries with +compact+ before calling the Java constructor, so an
  # omitted optional argument selects a shorter constructor.
  module Operations
    # Builds an Identity operation (no constructor arguments).
    def identity
      Java::CascadingOperation::Identity.new
    end

    # Builds a Sum aggregator.
    #
    # args    - field names, optionally followed by an options hash.
    # :type   - Java class passed as the second constructor argument
    #           (defaults to java.lang.Double).
    #
    # Raises RuntimeError when no field names are given.
    def sum_function(*args)
      options = args.extract_options!
      raise "Need to specify args" if args.empty?
      type = options[:type] || java.lang.Double.java_class
      parameters = [Cascading.fields(args), type].compact.to_java

      Java::CascadingOperationAggregator::Sum.new(*parameters)
    end

    # Shared constructor logic for the simple aggregators below.
    #
    # args             - Array of field names, optionally ending in an options
    #                    hash (the hash is removed from +args+ in place).
    # aggregator_klass - Java aggregator class to instantiate.
    # :sql             - when truthy, an Object[] holding a single nil is
    #                    passed as the second constructor argument.
    def aggregator_function(args, aggregator_klass)
      options = args.extract_options!
      ignore_values = options[:sql] ? [nil].to_java(java.lang.Object) : nil
      parameters = [Cascading.fields(args), ignore_values].compact
      aggregator_klass.new(*parameters)
    end

    # Builds a Count aggregator; see aggregator_function for the arguments.
    def count_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Count)
    end

    # Builds an Average aggregator; see aggregator_function for the arguments.
    def average_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Average)
    end

    # Builds a First aggregator; see aggregator_function for the arguments.
    def first_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::First)
    end

    # Builds a Min aggregator; see aggregator_function for the arguments.
    def min_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Min)
    end

    # Builds a Max aggregator; see aggregator_function for the arguments.
    def max_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Max)
    end

    # Builds a Last aggregator; see aggregator_function for the arguments.
    def last_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Last)
    end

    # Builds a RegexParser.
    #
    # args[0] - the pattern (converted with to_s).
    # :fields - field names handed to Cascading.fields.
    # :groups - optional list of group indexes, converted to a Java int[].
    def regex_parser(*args)
      options = args.extract_options!

      pattern = args[0].to_s
      fields = Cascading.fields(options[:fields])
      groups = options[:groups].to_java(:int) if options[:groups]
      parameters = [fields, pattern, groups].compact

      Java::CascadingOperationRegex::RegexParser.new(*parameters)
    end

    # Builds a RegexSplitter from field names and a :pattern option.
    def regex_splitter(*args)
      pattern_operation(Java::CascadingOperationRegex::RegexSplitter, args)
    end

    # Builds a RegexSplitGenerator from field names and a :pattern option.
    def regex_split_generator(*args)
      pattern_operation(Java::CascadingOperationRegex::RegexSplitGenerator, args)
    end

    # Builds a RegexGenerator from field names and a :pattern option.
    def regex_generator(*args)
      pattern_operation(Java::CascadingOperationRegex::RegexGenerator, args)
    end

    # Builds an ExpressionFunction.
    #
    # args        - field names, optionally followed by an options hash.
    # :expression - the expression source (converted with to_s).
    # :parameters - either a Hash of parameter name => Java class, or a single
    #               object whose java_class is used; when omitted,
    #               java.lang.String is used.
    def expression_function(*args)
      options = args.extract_options!

      fields = Cascading.fields(args)
      expression = options[:expression].to_s
      arguments = ([fields] + expression_arguments(expression, options[:parameters])).compact

      Java::CascadingOperationExpression::ExpressionFunction.new(*arguments)
    end

    # Builds an Insert operation.
    #
    # args    - field names, optionally followed by an options hash.
    # :values - Array of values to insert (required); converted with
    #           to_java_comparable_array.
    #
    # Raises RuntimeError when :values is missing.
    def insert_function(*args)
      options = args.extract_options!
      fields = Cascading.fields(args)
      values = options[:values]
      # Fail with a clear message instead of a NoMethodError on nil.
      raise "Need to specify :values" if values.nil?

      parameters = [fields, to_java_comparable_array(values)].compact
      Java::CascadingOperation::Insert.new(*parameters)
    end

    # Converts a Ruby array into a Java Comparable[].
    #
    # Integers become java.lang.Integer, Floats become java.lang.Double and
    # everything else becomes a java.lang.String built from to_s.
    def to_java_comparable_array(arr)
      boxed = arr.map do |v|
        # Match on the value itself: `case v.class` compares a Class object
        # against Integer/Float, which never matches, so every value used to
        # fall through to the String branch.
        case v
        when Integer
          java.lang.Integer.new(v)
        when Float
          java.lang.Double.new(v)
        else
          java.lang.String.new(v.to_s)
        end
      end
      boxed.to_java(java.lang.Comparable)
    end

    # Builds an ExpressionFilter.
    #
    # args[0]     - the expression; falls back to the :expression option.
    # :parameters - same forms as for expression_function.
    def expression_filter(*args)
      options = args.extract_options!
      expression = (args[0] || options[:expression]).to_s
      arguments = expression_arguments(expression, options[:parameters]).compact

      Java::CascadingOperationExpression::ExpressionFilter.new(*arguments)
    end

    # Builds a DateParser for the given field and format string.
    def date_parser(field, format)
      fields = Cascading.fields(field)
      Java::CascadingOperationText::DateParser.new(fields, format)
    end

    # Builds a DateFormatter.
    #
    # timezone - optional time zone id, resolved through
    #            java.util.TimeZone.getTimeZone; omitted from the constructor
    #            call when nil.
    def date_formatter(fields, format, timezone=nil)
      fields = Cascading.fields(fields)
      timezone = Java::JavaUtil::TimeZone.get_time_zone(timezone) if timezone
      arguments = [fields, format, timezone].compact
      Java::CascadingOperationText::DateFormatter.new(*arguments)
    end

    # Builds a RegexFilter.
    #
    # args[0]             - the pattern (converted with to_s).
    # :remove_match       - optional boolean, second constructor argument.
    # :match_each_element - optional boolean, third constructor argument.
    def regex_filter(*args)
      options = args.extract_options!

      pattern = args[0]
      remove_match = options[:remove_match]
      match_each_element = options[:match_each_element]
      # The constructor arguments are positional. Without this, compacting a
      # nil :remove_match would shift :match_each_element into its slot.
      # NOTE(review): false is presumably what the shorter constructors
      # default removeMatch to - confirm against the Cascading Javadoc.
      remove_match = false if remove_match.nil? && !match_each_element.nil?
      parameters = [pattern.to_s, remove_match, match_each_element].compact
      Java::CascadingOperationRegex::RegexFilter.new(*parameters)
    end

    # Builds a RegexReplace.
    #
    # args[0]      - field name(s).
    # args[1]      - the pattern (converted with to_s).
    # args[2]      - the replacement (converted with to_s).
    # :replace_all - optional boolean, appended only when given.
    def regex_replace(*args)
      options = args.extract_options!

      fields = Cascading.fields(args[0])
      pattern = args[1]
      replacement = args[2]
      replace_all = options[:replace_all]

      parameters = [fields, pattern.to_s, replacement.to_s, replace_all].compact
      Java::CascadingOperationRegex::RegexReplace.new(*parameters)
    end

    # Builds a FieldJoiner.
    #
    # :delimiter - join delimiter, defaults to ','.
    # :into      - field name(s) handed to Cascading.fields.
    def field_joiner(*args)
      options = args.extract_options!
      delimiter = options[:delimiter] || ','
      fields = Cascading.fields(options[:into])

      parameters = [fields, delimiter].compact
      Java::CascadingOperationText::FieldJoiner.new(*parameters)
    end

    private

    # Instantiates a regex operation class from field names plus a :pattern
    # option (shared by the splitter and generator factories).
    def pattern_operation(operation_klass, args)
      options = args.extract_options!

      fields = Cascading.fields(args)
      pattern = options[:pattern].to_s
      operation_klass.new(*[fields, pattern].compact)
    end

    # Returns the constructor arguments that follow the fields for the
    # expression operations: the expression plus either parallel name/type
    # arrays (Hash parameters) or a single parameter class.
    def expression_arguments(expression, parameters)
      if parameters.is_a?(::Hash)
        names = []
        types = []
        # Single pass so each name stays aligned with its type.
        parameters.each do |name, type|
          names << name.to_s # allow symbol keys; the target is a String[]
          types << type
        end
        [expression, names.to_java(java.lang.String), types.to_java(java.lang.Class)]
      elsif !parameters.nil?
        [expression, parameters.java_class]
      else
        [expression, java.lang.String.java_class]
      end
    end
  end
end
@@ -0,0 +1,160 @@
1
module Cascading
  # Ruby-side companion of a Java::CascadingFlow::Scope that additionally
  # tracks grouping key fields and primary key fields for both the values
  # stream and the grouping stream.
  class Scope
    attr_accessor :scope, :grouping_key_fields, :primary_key_fields, :grouping_primary_key_fields
    # Maps a scheme class to its default primary key; see register_scheme_key.
    @@scheme_keys = {}

    # scope  - the wrapped Java scope.
    # params - optional :grouping_key_fields (defaults to no fields),
    #          :primary_key_fields and :grouping_primary_key_fields; each is
    #          passed through +fields+.
    def initialize(scope, params = {})
      @scope = scope
      @grouping_key_fields = fields(params[:grouping_key_fields] || [])
      @primary_key_fields = fields(params[:primary_key_fields])
      @grouping_primary_key_fields = fields(params[:grouping_primary_key_fields])
    end

    # Returns a new Scope wrapping a copy-constructed Java scope and carrying
    # the same key field settings.
    def copy
      Scope.new(Java::CascadingFlow::Scope.new(@scope),
          :grouping_key_fields => @grouping_key_fields,
          :primary_key_fields => @primary_key_fields,
          :grouping_primary_key_fields => @grouping_primary_key_fields
      )
    end

    # Registers the primary key used by tap_scope for taps whose scheme is of
    # class +scheme+.
    def self.register_scheme_key(scheme, primary_key)
      @@scheme_keys[scheme] = primary_key
    end

    # Returns a Scope around a fresh Java scope with the given name.
    def self.empty_scope(name)
      Scope.new(Java::CascadingFlow::Scope.new(name))
    end

    # Builds the Scope for a tap, named +name+, using the primary key
    # registered for the tap's scheme class (if any).
    #
    # Raises RuntimeError when a primary key is not a subset of the fields
    # available in the scope.
    def self.tap_scope(tap, name)
      java_scope = outgoing_scope_for(tap, java.util.HashSet.new)
      # Taps and Pipes don't name their outgoing scopes like other FlowElements
      java_scope.name = name
      scope = Scope.new(java_scope,
          :primary_key_fields => @@scheme_keys[tap.scheme.class],
          :grouping_primary_key_fields => @@scheme_keys[tap.scheme.class]
      )
      vf, gf = scope.values_fields.to_a, scope.grouping_fields.to_a
      pk, gpk = scope.primary_key_fields.to_a, scope.grouping_primary_key_fields.to_a
      raise "Primary key must be a subset of available fields (primary key: #{pk.inspect}, values fields: #{vf.inspect})" unless vf & pk == pk
      raise "Grouping primary key must be a subset of available fields (grouping primary key: #{gpk.inspect}, grouping fields: #{gf.inspect})" unless gf & gpk == gpk
      scope
    end

    # Computes the Scope leaving +flow_element+ given its incoming scopes.
    #
    # incoming_scopes     - Array of Scope (nil entries are skipped when
    #                       collecting the Java scopes).
    # grouping_key_fields - key fields recorded on the new scope.
    # every_applied       - when truthy the values primary key is taken from
    #                       the grouping primary key; otherwise it is derived
    #                       with primary_key_fields.
    def self.outgoing_scope(flow_element, incoming_scopes, grouping_key_fields, every_applied)
      java_scopes = incoming_scopes.compact.map{ |s| s.scope }
      scope = Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
          :grouping_key_fields => grouping_key_fields
      )
      scope.grouping_primary_key_fields = fields(grouping_primary_key_fields(flow_element, incoming_scopes, scope))
      if every_applied
        scope.primary_key_fields = scope.grouping_primary_key_fields
      else
        scope.primary_key_fields = fields(primary_key_fields(flow_element, incoming_scopes, scope))
      end
      scope
    end

    # Fields of the outgoing values stream, as reported by the Java scope.
    def values_fields
      @scope.out_values_fields
    end

    # Fields of the outgoing grouping stream, with the leading entries
    # replaced by the recorded grouping key fields.
    def grouping_fields
      keys = @grouping_key_fields.to_a
      out_fields = @scope.out_grouping_fields.to_a
      # Overwrite key fields only. Slicing returns nil once the start index
      # is past the end of the array, so fall back to "nothing left".
      remainder = out_fields[keys.size..-1] || []
      fields(keys + remainder)
    end

    # Multi-line, human-readable dump of the scope.
    def to_s
      kind = 'Unknown'
      kind = 'Tap'   if @scope.tap?
      kind = 'Group' if @scope.group?
      kind = 'Each'  if @scope.each?
      kind = 'Every' if @scope.every?
      <<-END
Scope name: #{@scope.name}
Kind: #{kind}
Argument selector: #{@scope.argument_selector}
Declared fields: #{@scope.declared_fields}
Grouping selectors: #{@scope.grouping_selectors}
Sorting selectors: #{@scope.sorting_selectors}
Out grouping
selector: #{@scope.out_grouping_selector}
fields: #{grouping_fields}
key fields: #{@grouping_key_fields}
primary key fields: #{@grouping_primary_key_fields}
Out values
selector: #{@scope.out_values_selector}
fields: #{values_fields}
primary key fields: #{@primary_key_fields}
      END
    end

    # Internal helpers. They are class methods, which a bare `private` does
    # not affect, so they have always been publicly callable; they are left
    # public here to stay compatible with any existing callers.

    # Asks +flow_element+ for its outgoing Java scope, wrapping Java
    # exceptions in a CascadingException.
    def self.outgoing_scope_for(flow_element, incoming_scopes)
      begin
        flow_element.outgoing_scope_for(incoming_scopes)
      rescue NativeException => e
        raise CascadingException.new(e, 'Exception computing outgoing scope')
      end
    end

    # Derives the values primary key leaving +flow_element+.
    #
    # Raises RuntimeError for flow element types without a rule.
    def self.primary_key_fields(flow_element, incoming_scopes, scope)
      case flow_element
      when Java::CascadingPipe::Each
        # assert incoming_scopes.size == 1
        project_primary_key(incoming_scopes.first.primary_key_fields,
                            incoming_scopes.first.values_fields.to_a,
                            scope.values_fields.to_a)
      when Java::CascadingPipe::Every
        # assert incoming_scopes.size == 1
        incoming_scopes.first.primary_key_fields
      when Java::CascadingPipe::GroupBy
        if incoming_scopes.size == 1
          incoming_scopes.first.primary_key_fields
        else
          # We must clear the primary key when unioning multiple inputs.  If
          # the programmer wants to preserve the primary key, they must use
          # the primary override.
          nil
        end
      when Java::CascadingPipe::CoGroup
        # FIXME: assume grouping_key_fields are the same for all
        # incoming_scopes.  Need join to give me names from all incoming
        # scopes to perform rename on primary key fields.
        union_fields(*incoming_scopes.map{ |s| s.primary_key_fields })
      else raise "No primary key rules for FlowElement of type #{flow_element}"
      end
    end

    # Returns the primary key as an Array when every key field is still
    # present in +new_fields+, otherwise nil. +old_fields+ is accepted for
    # interface compatibility but not consulted.
    def self.project_primary_key(primary_key, old_fields, new_fields)
      return nil if primary_key.nil?
      primary_key = primary_key.to_a
      primary_key if (primary_key & new_fields) == primary_key
    end

    # Derives the grouping primary key leaving +flow_element+.
    #
    # Raises RuntimeError for flow element types without a rule.
    def self.grouping_primary_key_fields(flow_element, incoming_scopes, scope)
      case flow_element
      when Java::CascadingPipe::Each
        # assert incoming_scopes.size == 1
        project_primary_key(incoming_scopes.first.grouping_primary_key_fields,
                            incoming_scopes.first.grouping_fields.to_a,
                            scope.grouping_fields.to_a)
      when Java::CascadingPipe::Every
        # assert incoming_scopes.size == 1
        incoming_scopes.first.grouping_primary_key_fields
      when Java::CascadingPipe::GroupBy
        scope.grouping_key_fields
      when Java::CascadingPipe::CoGroup
        scope.grouping_key_fields
      else raise "No primary key rules for FlowElement of type #{flow_element}"
      end
    end
  end

  # Register default primary keys
  begin
    Scope.register_scheme_key(Java::CascadingScheme::TextLine, ['offset'])
  rescue NameError
    puts 'WARNING: Could not register primary key for TextLine Scheme as it was not on the class path'
  end
end
data/lib/cascading.rb ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
+ require 'java'
6
+
7
module Cascading
  # :stopdoc:
  VERSION = '0.0.4'
  LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
  PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
  CASCADING_HOME = ENV['CASCADING_HOME']
  HADOOP_HOME = ENV['HADOOP_HOME']

  # :startdoc:

  # Returns the version string for the library.
  #
  def self.version
    VERSION
  end

  # Returns the library path for the module. If any arguments are given,
  # they will be joined to the end of the library path using
  # <tt>File.join</tt>.
  #
  def self.libpath( *args )
    args.empty? ? LIBPATH : ::File.join(LIBPATH, args.flatten)
  end

  # Returns the path for the module. If any arguments are given,
  # they will be joined to the end of the path using
  # <tt>File.join</tt>.
  #
  def self.path( *args )
    args.empty? ? PATH : ::File.join(PATH, args.flatten)
  end

  # Requires every <tt>*.jar</tt> found anywhere below +from+, in sorted
  # order, and returns the list of jar paths. +from+ defaults to the
  # <tt>jars</tt> directory next to this file's directory.
  #
  # A nil or blank +from+ loads nothing: an empty directory name would
  # otherwise expand to <tt>/**/*.jar</tt> and pull in every jar on the
  # filesystem (for example when an environment variable is set but empty).
  #
  def self.require_all_jars(from = ::File.join(::File.dirname(__FILE__), "..", "jars"))
    return [] if from.to_s.strip.empty?

    search_me = ::File.expand_path(
            ::File.join(from, '**', '*.jar'))
    Dir.glob(search_me).sort.each do |jar|
      #puts "required: #{jar}"
      require jar
    end
  end
end
48
+
49
# Load the jars under HADOOP_HOME first, then those under CASCADING_HOME,
# skipping whichever environment variable is unset.
[Cascading::HADOOP_HOME, Cascading::CASCADING_HOME].each do |home|
  Cascading.require_all_jars(home) if home
end
51
+
52
+ require 'cascading/assembly'
53
+ require 'cascading/base'
54
+ require 'cascading/cascade'
55
+ require 'cascading/cascading'
56
+ require 'cascading/cascading_exception'
57
+ require 'cascading/expr_stub'
58
+ require 'cascading/flow'
59
+ require 'cascading/operations'
60
+ require 'cascading/scope'
61
+
62
+ # Include the Cascading module at the top level so that its methods are available without qualification
63
+ include Cascading
data/samples/branch.rb ADDED
@@ -0,0 +1,31 @@
1
#! /usr/bin/env jruby

# Sample: reads samples/data/data2.txt, splits each line into named fields and
# then branches the assembly twice, counting per 'score1' in one branch and
# per 'score2' in the other. Each branch is written to its own output tap.

# Make the library in ../lib loadable when the sample is run from a checkout.
$: << File.join(File.dirname(__FILE__), '..', 'lib')

require 'cascading'
# Provides sample_properties, used at the bottom of this script.
require 'samples/cascading'

cascade 'branch' do
  flow 'branch' do
    source 'input', tap('samples/data/data2.txt')

    assembly 'input' do
      # Presumably breaks the 'line' field into the four listed fields using
      # the pattern as the delimiter - confirm against the split helper.
      split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/

      branch 'branch1' do
        group_by 'score1' do
          count
        end
      end

      branch 'branch2' do
        group_by 'score2' do
          count
        end
      end
    end

    # One sink per branch name.
    sink 'branch1', tap('output/branch1', :sink_mode => :replace)
    sink 'branch2', tap('output/branch2', :sink_mode => :replace)
  end
end.complete(sample_properties)
@@ -0,0 +1,41 @@
1
require 'fileutils'

module Cascading
  # Constructs properties to be passed to Flow#complete or Cascade#complete
  # which will locate temporary Hadoop files in build/sample.  It is necessary
  # to pass these properties only because the sample apps are invoked using
  # JRuby's main method, which confuses the JobConf's attempt to find the
  # containing jar.
  #
  # Creates build/sample/build, build/sample/tmp and build/sample/log below
  # the current working directory as a side effect, and returns a
  # java.util.HashMap carrying a JobConf that points at them and requests
  # 4 map tasks and 1 reduce task.
  def sample_properties
    build_dir = 'build/sample/build'
    tmp_dir = "build/sample/tmp"
    log_dir = "build/sample/log"
    # FileUtils instead of shelling out to `mkdir -p`: works without a POSIX
    # shell and raises if a directory cannot be created rather than failing
    # silently.
    FileUtils.mkdir_p([build_dir, tmp_dir, log_dir])

    # Local cluster settings
    #java.lang.System.set_property("test.build.data", build_dir)
    #java.lang.System.set_property("hadoop.tmp.dir", tmp_dir)
    #java.lang.System.set_property("hadoop.log.dir", log_dir)
    #conf = Java::OrgApacheHadoopConf::Configuration.new
    #dfs = Java::OrgApacheHadoopDfs::MiniDFSCluster.new(conf, 4, true, nil);
    #file_sys = dfs.file_system
    #mr = Java::OrgApacheHadoopMapred::MiniMRCluster.new(4, file_sys.uri.to_string, 1)
    #job_conf = mr.create_job_conf
    #job_conf.set("mapred.child.java.opts", "-Xmx512m")
    #job_conf.set("mapred.map.tasks.speculative.execution", "false")
    #job_conf.set("mapred.reduce.tasks.speculative.execution", "false")

    job_conf = Java::OrgApacheHadoopMapred::JobConf.new
    job_conf.jar = build_dir
    job_conf.set("test.build.data", build_dir)
    job_conf.set("hadoop.tmp.dir", tmp_dir)
    job_conf.set("hadoop.log.dir", log_dir)

    job_conf.num_map_tasks = 4
    job_conf.num_reduce_tasks = 1

    properties = java.util.HashMap.new({})
    Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
    properties
  end
end
data/samples/copy.rb ADDED
@@ -0,0 +1,18 @@
1
#! /usr/bin/env jruby
# Sample: reads lines from a remote text file, renames the 'line' field to
# 'value', applies a reject expression and writes the result to output/copy.

# Make the library in ../lib loadable when the sample is run from a checkout.
$: << File.join(File.dirname(__FILE__), '..', 'lib')

require 'cascading'
# Provides sample_properties, used at the bottom of this script.
require 'samples/cascading'

cascade 'copy' do
  flow 'copy' do
    # The source tap is an HTTP URL, so running this sample needs network access.
    source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')

    assembly 'input' do
      rename 'line' => 'value'
      # Presumably drops the rows for which the expression is true, i.e.
      # values that do not contain "R" - confirm against the reject helper.
      reject 'value:string.indexOf("R") == -1'
    end

    sink 'input', tap('output/copy', :sink_mode => :replace)
  end
end.complete(sample_properties)