cascading.jruby 0.0.4

Files changed (63)
  1. data/HACKING.md +15 -0
  2. data/History.txt +0 -0
  3. data/LICENSE.txt +165 -0
  4. data/README.md +7 -0
  5. data/Rakefile +45 -0
  6. data/bin/make_job +81 -0
  7. data/lib/cascading/assembly.rb +726 -0
  8. data/lib/cascading/base.rb +63 -0
  9. data/lib/cascading/cascade.rb +63 -0
  10. data/lib/cascading/cascading.rb +134 -0
  11. data/lib/cascading/cascading_exception.rb +30 -0
  12. data/lib/cascading/expr_stub.rb +33 -0
  13. data/lib/cascading/ext/array.rb +15 -0
  14. data/lib/cascading/flow.rb +168 -0
  15. data/lib/cascading/operations.rb +204 -0
  16. data/lib/cascading/scope.rb +160 -0
  17. data/lib/cascading.rb +63 -0
  18. data/samples/branch.rb +31 -0
  19. data/samples/cascading.rb +41 -0
  20. data/samples/copy.rb +18 -0
  21. data/samples/data/data2.txt +88799 -0
  22. data/samples/data/data_join1.txt +3 -0
  23. data/samples/data/data_join2.txt +3 -0
  24. data/samples/data/data_join3.txt +3 -0
  25. data/samples/join.rb +32 -0
  26. data/samples/logwordcount.rb +22 -0
  27. data/samples/project.rb +24 -0
  28. data/samples/rename.rb +21 -0
  29. data/samples/scorenames.rb +20 -0
  30. data/samples/splitter.rb +20 -0
  31. data/samples/union.rb +35 -0
  32. data/spec/cascading_spec.rb +100 -0
  33. data/spec/expr_spec.rb +10 -0
  34. data/spec/primary_key_spec.rb +119 -0
  35. data/spec/resource/join_input.txt +3 -0
  36. data/spec/resource/test_input.txt +4 -0
  37. data/spec/scope_spec.rb +174 -0
  38. data/spec/spec.opts +6 -0
  39. data/spec/spec_helper.rb +5 -0
  40. data/spec/spec_util.rb +188 -0
  41. data/src/cascading/jruby/Main.java +38 -0
  42. data/src/cascading/jruby/runner.rb +6 -0
  43. data/tags +238 -0
  44. data/tasks/ann.rake +80 -0
  45. data/tasks/ant.rake +11 -0
  46. data/tasks/bones.rake +20 -0
  47. data/tasks/gem.rake +206 -0
  48. data/tasks/git.rake +40 -0
  49. data/tasks/notes.rake +27 -0
  50. data/tasks/post_load.rake +34 -0
  51. data/tasks/rdoc.rake +50 -0
  52. data/tasks/rubyforge.rake +55 -0
  53. data/tasks/samples.rake +13 -0
  54. data/tasks/setup.rb +300 -0
  55. data/tasks/spec.rake +59 -0
  56. data/tasks/svn.rake +47 -0
  57. data/tasks/test.rake +42 -0
  58. data/test/data/data1.txt +14 -0
  59. data/test/data/data2.txt +14 -0
  60. data/test/test_assembly.rb +321 -0
  61. data/test/test_cascading.rb +49 -0
  62. data/test/test_flow.rb +15 -0
  63. metadata +137 -0
data/lib/cascading/operations.rb ADDED
@@ -0,0 +1,204 @@
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
+ #
+ # This is free software. Please see the LICENSE and COPYING files for details.
+
+ module Cascading
+   module Operations
+     def identity
+       Java::CascadingOperation::Identity.new
+     end
+
+     def sum_function(*args)
+       options = args.extract_options!
+       raise "Need to specify args" if args.empty?
+       type = options[:type] || java.lang.Double.java_class
+       parameters = [Cascading.fields(args), type].compact.to_java
+
+       Java::CascadingOperationAggregator::Sum.new(*parameters)
+     end
+
+     def aggregator_function(args, aggregator_klass)
+       options = args.extract_options!
+       ignore_values = options[:sql] ? [nil].to_java(java.lang.Object) : nil
+       parameters = [Cascading.fields(args), ignore_values].compact
+       aggregator_klass.new(*parameters)
+     end
+
+     def count_function(*args)
+       aggregator_function(args, Java::CascadingOperationAggregator::Count)
+     end
+
+     def average_function(*args)
+       aggregator_function(args, Java::CascadingOperationAggregator::Average)
+     end
+
+     def first_function(*args)
+       aggregator_function(args, Java::CascadingOperationAggregator::First)
+     end
+
+     def min_function(*args)
+       aggregator_function(args, Java::CascadingOperationAggregator::Min)
+     end
+
+     def max_function(*args)
+       aggregator_function(args, Java::CascadingOperationAggregator::Max)
+     end
+
+     def last_function(*args)
+       aggregator_function(args, Java::CascadingOperationAggregator::Last)
+     end
+
+     def regex_parser(*args)
+       options = args.extract_options!
+
+       pattern = args[0].to_s
+       fields = Cascading.fields(options[:fields])
+       groups = options[:groups].to_java(:int) if options[:groups]
+       parameters = [fields, pattern, groups].compact
+
+       Java::CascadingOperationRegex::RegexParser.new(*parameters)
+     end
+
+     def regex_splitter(*args)
+       options = args.extract_options!
+
+       fields = Cascading.fields(args)
+       pattern = options[:pattern].to_s
+       parameters = [fields, pattern].compact
+       Java::CascadingOperationRegex::RegexSplitter.new(*parameters)
+     end
+
+     def regex_split_generator(*args)
+       options = args.extract_options!
+
+       fields = Cascading.fields(args)
+       pattern = options[:pattern].to_s
+       parameters = [fields, pattern].compact
+       Java::CascadingOperationRegex::RegexSplitGenerator.new(*parameters)
+     end
+
+     def regex_generator(*args)
+       options = args.extract_options!
+
+       fields = Cascading.fields(args)
+       pattern = options[:pattern].to_s
+       parameters = [fields, pattern].compact
+       Java::CascadingOperationRegex::RegexGenerator.new(*parameters)
+     end
+
+     def expression_function(*args)
+       options = args.extract_options!
+
+       fields = Cascading.fields(args)
+       expression = options[:expression].to_s
+       parameters = options[:parameters]
+       parameter_names = []
+       parameter_types = []
+       if parameters.is_a? ::Hash
+         parameters.each do |name, type|
+           parameter_names << name
+           parameter_types << type
+         end
+         parameter_names = parameter_names.to_java(java.lang.String)
+         parameter_types = parameter_types.to_java(java.lang.Class)
+
+         arguments = [fields, expression, parameter_names, parameter_types].compact
+       elsif !parameters.nil?
+         arguments = [fields, expression, parameters.java_class].compact
+       else
+         arguments = [fields, expression, java.lang.String.java_class].compact
+       end
+
+       Java::CascadingOperationExpression::ExpressionFunction.new(*arguments)
+     end
+
+     def insert_function(*args)
+       options = args.extract_options!
+       fields = Cascading.fields(args)
+       values = options[:values]
+
+       parameters = [fields, to_java_comparable_array(values)].compact
+       Java::CascadingOperation::Insert.new(*parameters)
+     end
+
+     def to_java_comparable_array(arr)
+       (arr.map do |v|
+         case v
+         when Fixnum
+           java.lang.Integer.new(v)
+         when Float
+           java.lang.Double.new(v)
+         else
+           java.lang.String.new(v.to_s)
+         end
+       end).to_java(java.lang.Comparable)
+     end
+
+     def expression_filter(*args)
+       options = args.extract_options!
+       expression = (args[0] || options[:expression]).to_s
+       parameters = options[:parameters]
+       parameter_names = []
+       parameter_types = []
+       if parameters.is_a? ::Hash
+         parameters.each do |name, type|
+           parameter_names << name
+           parameter_types << type
+         end
+         parameter_names = parameter_names.to_java(java.lang.String)
+         parameter_types = parameter_types.to_java(java.lang.Class)
+
+         arguments = [expression, parameter_names, parameter_types].compact
+       elsif !parameters.nil?
+         arguments = [expression, parameters.java_class].compact
+       else
+         arguments = [expression, java.lang.String.java_class].compact
+       end
+
+       Java::CascadingOperationExpression::ExpressionFilter.new(*arguments)
+     end
+
+     def date_parser(field, format)
+       fields = fields(field)
+       Java::CascadingOperationText::DateParser.new(fields, format)
+     end
+
+     def date_formatter(fields, format, timezone=nil)
+       fields = fields(fields)
+       timezone = Java::JavaUtil::TimeZone.get_time_zone(timezone) if timezone
+       arguments = [fields, format, timezone].compact
+       Java::CascadingOperationText::DateFormatter.new(*arguments)
+     end
+
+     def regex_filter(*args)
+       options = args.extract_options!
+
+       pattern = args[0]
+       remove_match = options[:remove_match]
+       match_each_element = options[:match_each_element]
+       parameters = [pattern.to_s, remove_match, match_each_element].compact
+       Java::CascadingOperationRegex::RegexFilter.new(*parameters)
+     end
+
+     def regex_replace(*args)
+       options = args.extract_options!
+
+       fields = fields(args[0])
+       pattern = args[1]
+       replacement = args[2]
+       replace_all = options[:replace_all]
+
+       parameters = [fields, pattern.to_s, replacement.to_s, replace_all].compact
+       Java::CascadingOperationRegex::RegexReplace.new(*parameters)
+     end
+
+     def field_joiner(*args)
+       options = args.extract_options!
+       delimiter = options[:delimiter] || ','
+       fields = fields(options[:into])
+
+       parameters = [fields, delimiter].compact
+       Java::CascadingOperationText::FieldJoiner.new(*parameters)
+     end
+   end
+ end
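
For illustration only (not part of the gem's diff): a minimal sketch of calling these operation factories directly, assuming `require 'cascading'` has loaded the gem, the `Array#extract_options!` extension, and the Cascading jars; the field names and pattern here are hypothetical.

    require 'cascading'
    include Cascading::Operations

    # Sum a hypothetical 'score1' field as longs instead of the default doubles.
    sum = sum_function('score1', :type => java.lang.Long.java_class)

    # Parse a hypothetical tab-delimited 'line' into two fields from capture groups 1 and 2.
    parser = regex_parser(/^([^\t]+)\t([^\t]+)/, :fields => ['name', 'score'], :groups => [1, 2])

These helpers only construct `cascading.operation` objects; wiring them into pipes is the job of data/lib/cascading/assembly.rb.
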
data/lib/cascading/scope.rb ADDED
@@ -0,0 +1,160 @@
+ module Cascading
+   class Scope
+     attr_accessor :scope, :grouping_key_fields, :primary_key_fields, :grouping_primary_key_fields
+     @@scheme_keys = {}
+
+     def initialize(scope, params = {})
+       @scope = scope
+       @grouping_key_fields = fields(params[:grouping_key_fields] || [])
+       @primary_key_fields = fields(params[:primary_key_fields])
+       @grouping_primary_key_fields = fields(params[:grouping_primary_key_fields])
+     end
+
+     def copy
+       Scope.new(Java::CascadingFlow::Scope.new(@scope),
+         :grouping_key_fields => @grouping_key_fields,
+         :primary_key_fields => @primary_key_fields,
+         :grouping_primary_key_fields => @grouping_primary_key_fields
+       )
+     end
+
+     def self.register_scheme_key(scheme, primary_key)
+       @@scheme_keys[scheme] = primary_key
+     end
+
+     def self.empty_scope(name)
+       Scope.new(Java::CascadingFlow::Scope.new(name))
+     end
+
+     def self.tap_scope(tap, name)
+       java_scope = outgoing_scope_for(tap, java.util.HashSet.new)
+       # Taps and Pipes don't name their outgoing scopes like other FlowElements
+       java_scope.name = name
+       scope = Scope.new(java_scope,
+         :primary_key_fields => @@scheme_keys[tap.scheme.class],
+         :grouping_primary_key_fields => @@scheme_keys[tap.scheme.class]
+       )
+       vf, gf = scope.values_fields.to_a, scope.grouping_fields.to_a
+       pk, gpk = scope.primary_key_fields.to_a, scope.grouping_primary_key_fields.to_a
+       raise "Primary key must be a subset of available fields (primary key: #{pk.inspect}, values fields: #{vf.inspect})" unless vf & pk == pk
+       raise "Grouping primary key must be a subset of available fields (grouping primary key: #{gpk.inspect}, grouping fields: #{gf.inspect})" unless gf & gpk == gpk
+       scope
+     end
+
+     def self.outgoing_scope(flow_element, incoming_scopes, grouping_key_fields, every_applied)
+       java_scopes = incoming_scopes.compact.map{ |s| s.scope }
+       scope = Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
+         :grouping_key_fields => grouping_key_fields
+       )
+       scope.grouping_primary_key_fields = fields(grouping_primary_key_fields(flow_element, incoming_scopes, scope))
+       scope.primary_key_fields = scope.grouping_primary_key_fields if every_applied
+       scope.primary_key_fields = fields(primary_key_fields(flow_element, incoming_scopes, scope)) unless every_applied
+       scope
+     end
+
+     def values_fields
+       @scope.out_values_fields
+     end
+
+     def grouping_fields
+       keys = @grouping_key_fields.to_a
+       grouping_fields = @scope.out_grouping_fields.to_a
+       # Overwrite key fields only
+       fields(keys + grouping_fields[keys.size..-1])
+     end
+
+     def to_s
+       kind = 'Unknown'
+       kind = 'Tap' if @scope.tap?
+       kind = 'Group' if @scope.group?
+       kind = 'Each' if @scope.each?
+       kind = 'Every' if @scope.every?
+       <<-END
+ Scope name: #{@scope.name}
+   Kind: #{kind}
+   Argument selector: #{@scope.argument_selector}
+   Declared fields: #{@scope.declared_fields}
+   Grouping selectors: #{@scope.grouping_selectors}
+   Sorting selectors: #{@scope.sorting_selectors}
+   Out grouping
+     selector: #{@scope.out_grouping_selector}
+     fields: #{grouping_fields}
+     key fields: #{@grouping_key_fields}
+     primary key fields: #{@grouping_primary_key_fields}
+   Out values
+     selector: #{@scope.out_values_selector}
+     fields: #{values_fields}
+     primary key fields: #{@primary_key_fields}
+       END
+     end
+
+     private
+
+     def self.outgoing_scope_for(flow_element, incoming_scopes)
+       begin
+         flow_element.outgoing_scope_for(incoming_scopes)
+       rescue NativeException => e
+         raise CascadingException.new(e, 'Exception computing outgoing scope')
+       end
+     end
+
+     def self.primary_key_fields(flow_element, incoming_scopes, scope)
+       case flow_element
+       when Java::CascadingPipe::Each
+         # assert incoming_scopes.size == 1
+         project_primary_key(incoming_scopes.first.primary_key_fields,
+           incoming_scopes.first.values_fields.to_a,
+           scope.values_fields.to_a)
+       when Java::CascadingPipe::Every
+         # assert incoming_scopes.size == 1
+         incoming_scopes.first.primary_key_fields
+       when Java::CascadingPipe::GroupBy
+         if incoming_scopes.size == 1
+           incoming_scopes.first.primary_key_fields
+         else
+           # We must clear the primary key when unioning multiple inputs. If
+           # the programmer wants to preserve the primary key, they must use
+           # the primary override.
+           nil
+         end
+       when Java::CascadingPipe::CoGroup
+         # FIXME: assume grouping_key_fields are the same for all
+         # incoming_scopes. Need join to give me names from all incoming
+         # scopes to perform rename on primary key fields.
+         union_fields(*incoming_scopes.map{ |s| s.primary_key_fields })
+       else raise "No primary key rules for FlowElement of type #{flow_element}"
+       end
+     end
+
+     def self.project_primary_key(primary_key, old_fields, new_fields)
+       return nil if primary_key.nil?
+       primary_key = primary_key.to_a
+       primary_key if (primary_key & new_fields) == primary_key
+     end
+
+     def self.grouping_primary_key_fields(flow_element, incoming_scopes, scope)
+       case flow_element
+       when Java::CascadingPipe::Each
+         # assert incoming_scopes.size == 1
+         project_primary_key(incoming_scopes.first.grouping_primary_key_fields,
+           incoming_scopes.first.grouping_fields.to_a,
+           scope.grouping_fields.to_a)
+       when Java::CascadingPipe::Every
+         # assert incoming_scopes.size == 1
+         incoming_scopes.first.grouping_primary_key_fields
+       when Java::CascadingPipe::GroupBy
+         scope.grouping_key_fields
+       when Java::CascadingPipe::CoGroup
+         scope.grouping_key_fields
+       else raise "No primary key rules for FlowElement of type #{flow_element}"
+       end
+     end
+   end
+
+   # Register default primary keys
+   begin
+     Scope.register_scheme_key(Java::CascadingScheme::TextLine, ['offset'])
+   rescue NameError => ne
+     puts 'WARNING: Could not register primary key for TextLine Scheme as it was not on the class path'
+   end
+ end
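
The `register_scheme_key` hook above lets calling code declare a default primary key for schemes beyond the bundled TextLine registration. A minimal sketch, assuming Cascading's SequenceFile scheme is on the classpath and that an 'id' field is a sensible key for your data (both are assumptions, not part of this file):

    # Hypothetical: declare that taps using SequenceFile carry an 'id' primary key.
    Cascading::Scope.register_scheme_key(Java::CascadingScheme::SequenceFile, ['id'])
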
data/lib/cascading.rb ADDED
@@ -0,0 +1,63 @@
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
+ #
+ # This is free software. Please see the LICENSE and COPYING files for details.
+
+ require 'java'
+
+ module Cascading
+   # :stopdoc:
+   VERSION = '0.0.4'
+   LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
+   PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
+   CASCADING_HOME = ENV['CASCADING_HOME']
+   HADOOP_HOME = ENV['HADOOP_HOME']
+
+   # :startdoc:
+
+   # Returns the version string for the library.
+   #
+   def self.version
+     VERSION
+   end
+
+   # Returns the library path for the module. If any arguments are given,
+   # they will be joined to the end of the library path using
+   # <tt>File.join</tt>.
+   #
+   def self.libpath( *args )
+     args.empty? ? LIBPATH : ::File.join(LIBPATH, args.flatten)
+   end
+
+   # Returns the path for the module. If any arguments are given,
+   # they will be joined to the end of the path using
+   # <tt>File.join</tt>.
+   #
+   def self.path( *args )
+     args.empty? ? PATH : ::File.join(PATH, args.flatten)
+   end
+
+   def self.require_all_jars(from = ::File.join(::File.dirname(__FILE__), "..", "jars"))
+     search_me = ::File.expand_path(
+       ::File.join(from, '**', '*.jar'))
+     Dir.glob(search_me).sort.each do |jar|
+       #puts "required: #{jar}"
+       require jar
+     end
+   end
+ end
+
+ Cascading.require_all_jars(Cascading::HADOOP_HOME) if Cascading::HADOOP_HOME
+ Cascading.require_all_jars(Cascading::CASCADING_HOME) if Cascading::CASCADING_HOME
+
+ require 'cascading/assembly'
+ require 'cascading/base'
+ require 'cascading/cascade'
+ require 'cascading/cascading'
+ require 'cascading/cascading_exception'
+ require 'cascading/expr_stub'
+ require 'cascading/flow'
+ require 'cascading/operations'
+ require 'cascading/scope'
+
+ # Include the module so its methods are available at the top level
+ include Cascading
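
As the constants above show, jar discovery is driven by the CASCADING_HOME and HADOOP_HOME environment variables, which are read when this file is loaded. A minimal sketch of loading the library under JRuby (the install paths below are hypothetical examples, not values from the gem):

    # Point these at your local installs before requiring the gem.
    ENV['CASCADING_HOME'] ||= '/opt/cascading'
    ENV['HADOOP_HOME']    ||= '/opt/hadoop'

    require 'cascading'

    puts Cascading.version                          # => "0.0.4"
    puts Cascading.libpath('cascading', 'flow.rb')  # absolute path to the bundled flow.rb
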
data/samples/branch.rb ADDED
@@ -0,0 +1,31 @@
+ #! /usr/bin/env jruby
+
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
+
+ require 'cascading'
+ require 'samples/cascading'
+
+ cascade 'branch' do
+   flow 'branch' do
+     source 'input', tap('samples/data/data2.txt')
+
+     assembly 'input' do
+       split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/
+
+       branch 'branch1' do
+         group_by 'score1' do
+           count
+         end
+       end
+
+       branch 'branch2' do
+         group_by 'score2' do
+           count
+         end
+       end
+     end
+
+     sink 'branch1', tap('output/branch1', :sink_mode => :replace)
+     sink 'branch2', tap('output/branch2', :sink_mode => :replace)
+   end
+ end.complete(sample_properties)
data/samples/cascading.rb ADDED
@@ -0,0 +1,41 @@
+ module Cascading
+   # Constructs properties to be passed to Flow#complete or Cascade#complete
+   # which will locate temporary Hadoop files in build/sample. It is necessary
+   # to pass these properties only because the sample apps are invoked using
+   # JRuby's main method, which confuses the JobConf's attempt to find the
+   # containing jar.
+   def sample_properties
+     build_dir = 'build/sample/build'
+     `mkdir -p #{build_dir}`
+     tmp_dir = "build/sample/tmp"
+     `mkdir -p #{tmp_dir}`
+     log_dir = "build/sample/log"
+     `mkdir -p #{log_dir}`
+
+     # Local cluster settings
+     #java.lang.System.set_property("test.build.data", build_dir)
+     #java.lang.System.set_property("hadoop.tmp.dir", tmp_dir)
+     #java.lang.System.set_property("hadoop.log.dir", log_dir)
+     #conf = Java::OrgApacheHadoopConf::Configuration.new
+     #dfs = Java::OrgApacheHadoopDfs::MiniDFSCluster.new(conf, 4, true, nil);
+     #file_sys = dfs.file_system
+     #mr = Java::OrgApacheHadoopMapred::MiniMRCluster.new(4, file_sys.uri.to_string, 1)
+     #job_conf = mr.create_job_conf
+     #job_conf.set("mapred.child.java.opts", "-Xmx512m")
+     #job_conf.set("mapred.map.tasks.speculative.execution", "false")
+     #job_conf.set("mapred.reduce.tasks.speculative.execution", "false")
+
+     job_conf = Java::OrgApacheHadoopMapred::JobConf.new
+     job_conf.jar = build_dir
+     job_conf.set("test.build.data", build_dir)
+     job_conf.set("hadoop.tmp.dir", tmp_dir)
+     job_conf.set("hadoop.log.dir", log_dir)
+
+     job_conf.num_map_tasks = 4
+     job_conf.num_reduce_tasks = 1
+
+     properties = java.util.HashMap.new({})
+     Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
+     properties
+   end
+ end
data/samples/copy.rb ADDED
@@ -0,0 +1,18 @@
+ #! /usr/bin/env jruby
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
+
+ require 'cascading'
+ require 'samples/cascading'
+
+ cascade 'copy' do
+   flow 'copy' do
+     source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')
+
+     assembly 'input' do
+       rename 'line' => 'value'
+       reject 'value:string.indexOf("R") == -1'
+     end
+
+     sink 'input', tap('output/copy', :sink_mode => :replace)
+   end
+ end.complete(sample_properties)