cascading.jruby 0.0.4
- data/HACKING.md +15 -0
- data/History.txt +0 -0
- data/LICENSE.txt +165 -0
- data/README.md +7 -0
- data/Rakefile +45 -0
- data/bin/make_job +81 -0
- data/lib/cascading/assembly.rb +726 -0
- data/lib/cascading/base.rb +63 -0
- data/lib/cascading/cascade.rb +63 -0
- data/lib/cascading/cascading.rb +134 -0
- data/lib/cascading/cascading_exception.rb +30 -0
- data/lib/cascading/expr_stub.rb +33 -0
- data/lib/cascading/ext/array.rb +15 -0
- data/lib/cascading/flow.rb +168 -0
- data/lib/cascading/operations.rb +204 -0
- data/lib/cascading/scope.rb +160 -0
- data/lib/cascading.rb +63 -0
- data/samples/branch.rb +31 -0
- data/samples/cascading.rb +41 -0
- data/samples/copy.rb +18 -0
- data/samples/data/data2.txt +88799 -0
- data/samples/data/data_join1.txt +3 -0
- data/samples/data/data_join2.txt +3 -0
- data/samples/data/data_join3.txt +3 -0
- data/samples/join.rb +32 -0
- data/samples/logwordcount.rb +22 -0
- data/samples/project.rb +24 -0
- data/samples/rename.rb +21 -0
- data/samples/scorenames.rb +20 -0
- data/samples/splitter.rb +20 -0
- data/samples/union.rb +35 -0
- data/spec/cascading_spec.rb +100 -0
- data/spec/expr_spec.rb +10 -0
- data/spec/primary_key_spec.rb +119 -0
- data/spec/resource/join_input.txt +3 -0
- data/spec/resource/test_input.txt +4 -0
- data/spec/scope_spec.rb +174 -0
- data/spec/spec.opts +6 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/spec_util.rb +188 -0
- data/src/cascading/jruby/Main.java +38 -0
- data/src/cascading/jruby/runner.rb +6 -0
- data/tags +238 -0
- data/tasks/ann.rake +80 -0
- data/tasks/ant.rake +11 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +206 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +50 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/samples.rake +13 -0
- data/tasks/setup.rb +300 -0
- data/tasks/spec.rake +59 -0
- data/tasks/svn.rake +47 -0
- data/tasks/test.rake +42 -0
- data/test/data/data1.txt +14 -0
- data/test/data/data2.txt +14 -0
- data/test/test_assembly.rb +321 -0
- data/test/test_cascading.rb +49 -0
- data/test/test_flow.rb +15 -0
- metadata +137 -0
data/lib/cascading/operations.rb
ADDED
@@ -0,0 +1,204 @@
# Copyright 2009, Grégoire Marabout. All Rights Reserved.
#
# This is free software. Please see the LICENSE and COPYING files for details.

module Cascading
  module Operations
    def identity
      Java::CascadingOperation::Identity.new
    end

    def sum_function(*args)
      options = args.extract_options!
      raise "Need to specify args" if args.empty?
      type = options[:type] || java.lang.Double.java_class
      parameters = [Cascading.fields(args), type].compact.to_java

      Java::CascadingOperationAggregator::Sum.new(*parameters)
    end

    def aggregator_function(args, aggregator_klass)
      options = args.extract_options!
      ignore_values = options[:sql] ? [nil].to_java(java.lang.Object) : nil
      parameters = [Cascading.fields(args), ignore_values].compact
      aggregator_klass.new(*parameters)
    end

    def count_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Count)
    end

    def average_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Average)
    end

    def first_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::First)
    end

    def min_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Min)
    end

    def max_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Max)
    end

    def last_function(*args)
      aggregator_function(args, Java::CascadingOperationAggregator::Last)
    end

    def regex_parser(*args)
      options = args.extract_options!

      pattern = args[0].to_s
      fields = Cascading.fields(options[:fields])
      groups = options[:groups].to_java(:int) if options[:groups]
      parameters = [fields, pattern, groups].compact

      Java::CascadingOperationRegex::RegexParser.new(*parameters)
    end

    def regex_splitter(*args)
      options = args.extract_options!

      fields = Cascading.fields(args)
      pattern = options[:pattern].to_s
      parameters = [fields, pattern].compact
      Java::CascadingOperationRegex::RegexSplitter.new(*parameters)
    end

    def regex_split_generator(*args)
      options = args.extract_options!

      fields = Cascading.fields(args)
      pattern = options[:pattern].to_s
      parameters = [fields, pattern].compact
      Java::CascadingOperationRegex::RegexSplitGenerator.new(*parameters)
    end

    def regex_generator(*args)
      options = args.extract_options!

      fields = Cascading.fields(args)
      pattern = options[:pattern].to_s
      parameters = [fields, pattern].compact
      Java::CascadingOperationRegex::RegexGenerator.new(*parameters)
    end

    def expression_function(*args)
      options = args.extract_options!

      fields = Cascading.fields(args)
      expression = options[:expression].to_s
      parameters = options[:parameters]
      parameter_names = []
      parameter_types = []
      if parameters.is_a? ::Hash
        parameters.each do |name, type|
          parameter_names << name
          parameter_types << type
        end
        parameter_names = parameter_names.to_java(java.lang.String)
        parameter_types = parameter_types.to_java(java.lang.Class)

        arguments = [fields, expression, parameter_names, parameter_types].compact
      elsif !parameters.nil?
        arguments = [fields, expression, parameters.java_class].compact
      else
        arguments = [fields, expression, java.lang.String.java_class].compact
      end

      Java::CascadingOperationExpression::ExpressionFunction.new(*arguments)
    end

    def insert_function(*args)
      options = args.extract_options!
      fields = Cascading.fields(args)
      values = options[:values]

      parameters = [fields, to_java_comparable_array(values)].compact
      Java::CascadingOperation::Insert.new(*parameters)
    end

    def to_java_comparable_array(arr)
      (arr.map do |v|
        case v
        when Fixnum
          java.lang.Integer.new(v)
        when Float
          java.lang.Double.new(v)
        else
          java.lang.String.new(v.to_s)
        end
      end).to_java(java.lang.Comparable)
    end

    def expression_filter(*args)
      options = args.extract_options!
      expression = (args[0] || options[:expression]).to_s
      parameters = options[:parameters]
      parameter_names = []
      parameter_types = []
      if parameters.is_a? ::Hash
        parameters.each do |name, type|
          parameter_names << name
          parameter_types << type
        end
        parameter_names = parameter_names.to_java(java.lang.String)
        parameter_types = parameter_types.to_java(java.lang.Class)

        arguments = [expression, parameter_names, parameter_types].compact
      elsif !parameters.nil?
        arguments = [expression, parameters.java_class].compact
      else
        arguments = [expression, java.lang.String.java_class].compact
      end

      Java::CascadingOperationExpression::ExpressionFilter.new(*arguments)
    end

    def date_parser(field, format)
      fields = fields(field)
      Java::CascadingOperationText::DateParser.new(fields, format)
    end

    def date_formatter(fields, format, timezone=nil)
      fields = fields(fields)
      timezone = Java::JavaUtil::TimeZone.get_time_zone(timezone) if timezone
      arguments = [fields, format, timezone].compact
      Java::CascadingOperationText::DateFormatter.new(*arguments)
    end

    def regex_filter(*args)
      options = args.extract_options!

      pattern = args[0]
      remove_match = options[:remove_match]
      match_each_element = options[:match_each_element]
      parameters = [pattern.to_s, remove_match, match_each_element].compact
      Java::CascadingOperationRegex::RegexFilter.new(*parameters)
    end

    def regex_replace(*args)
      options = args.extract_options!

      fields = fields(args[0])
      pattern = args[1]
      replacement = args[2]
      replace_all = options[:replace_all]

      parameters = [fields, pattern.to_s, replacement.to_s, replace_all].compact
      Java::CascadingOperationRegex::RegexReplace.new(*parameters)
    end

    def field_joiner(*args)
      options = args.extract_options!
      delimiter = options[:delimiter] || ','
      fields = fields(options[:into])

      parameters = [fields, delimiter].compact
      Java::CascadingOperationText::FieldJoiner.new(*parameters)
    end
  end
end
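
These helpers only construct Cascading operation objects; the assembly DSL is what wires them into pipes. As a rough sketch of driving one factory directly from JRuby, assuming the Cascading jars are already on the class path and the module is mixed in at the top level (the pattern and field names below are illustrative, not taken from the gem):

    require 'cascading'
    include Cascading::Operations

    # Build a RegexParser that splits a line into two named fields.
    parser = regex_parser(/(\S+)\s+(\d+)/, :fields => ['name', 'score'], :groups => [1, 2])

    # The result is a plain cascading.operation.regex.RegexParser, ready to be
    # wrapped in an Each pipe by the assembly DSL.
    puts parser.java_class.name
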
data/lib/cascading/scope.rb
ADDED
@@ -0,0 +1,160 @@
module Cascading
  class Scope
    attr_accessor :scope, :grouping_key_fields, :primary_key_fields, :grouping_primary_key_fields
    @@scheme_keys = {}

    def initialize(scope, params = {})
      @scope = scope
      @grouping_key_fields = fields(params[:grouping_key_fields] || [])
      @primary_key_fields = fields(params[:primary_key_fields])
      @grouping_primary_key_fields = fields(params[:grouping_primary_key_fields])
    end

    def copy
      Scope.new(Java::CascadingFlow::Scope.new(@scope),
        :grouping_key_fields => @grouping_key_fields,
        :primary_key_fields => @primary_key_fields,
        :grouping_primary_key_fields => @grouping_primary_key_fields
      )
    end

    def self.register_scheme_key(scheme, primary_key)
      @@scheme_keys[scheme] = primary_key
    end

    def self.empty_scope(name)
      Scope.new(Java::CascadingFlow::Scope.new(name))
    end

    def self.tap_scope(tap, name)
      java_scope = outgoing_scope_for(tap, java.util.HashSet.new)
      # Taps and Pipes don't name their outgoing scopes like other FlowElements
      java_scope.name = name
      scope = Scope.new(java_scope,
        :primary_key_fields => @@scheme_keys[tap.scheme.class],
        :grouping_primary_key_fields => @@scheme_keys[tap.scheme.class]
      )
      vf, gf = scope.values_fields.to_a, scope.grouping_fields.to_a
      pk, gpk = scope.primary_key_fields.to_a, scope.grouping_primary_key_fields.to_a
      raise "Primary key must be a subset of available fields (primary key: #{pk.inspect}, values fields: #{vf.inspect})" unless vf & pk == pk
      raise "Grouping primary key must be a subset of available fields (grouping primary key: #{gpk.inspect}, grouping fields: #{gf.inspect})" unless gf & gpk == gpk
      scope
    end

    def self.outgoing_scope(flow_element, incoming_scopes, grouping_key_fields, every_applied)
      java_scopes = incoming_scopes.compact.map{ |s| s.scope }
      scope = Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)),
        :grouping_key_fields => grouping_key_fields
      )
      scope.grouping_primary_key_fields = fields(grouping_primary_key_fields(flow_element, incoming_scopes, scope))
      scope.primary_key_fields = scope.grouping_primary_key_fields if every_applied
      scope.primary_key_fields = fields(primary_key_fields(flow_element, incoming_scopes, scope)) unless every_applied
      scope
    end

    def values_fields
      @scope.out_values_fields
    end

    def grouping_fields
      keys = @grouping_key_fields.to_a
      grouping_fields = @scope.out_grouping_fields.to_a
      # Overwrite key fields only
      fields(keys + grouping_fields[keys.size..-1])
    end

    def to_s
      kind = 'Unknown'
      kind = 'Tap' if @scope.tap?
      kind = 'Group' if @scope.group?
      kind = 'Each' if @scope.each?
      kind = 'Every' if @scope.every?
      <<-END
Scope name: #{@scope.name}
  Kind: #{kind}
  Argument selector: #{@scope.argument_selector}
  Declared fields: #{@scope.declared_fields}
  Grouping selectors: #{@scope.grouping_selectors}
  Sorting selectors: #{@scope.sorting_selectors}
  Out grouping
    selector: #{@scope.out_grouping_selector}
    fields: #{grouping_fields}
    key fields: #{@grouping_key_fields}
    primary key fields: #{@grouping_primary_key_fields}
  Out values
    selector: #{@scope.out_values_selector}
    fields: #{values_fields}
    primary key fields: #{@primary_key_fields}
      END
    end

    private

    def self.outgoing_scope_for(flow_element, incoming_scopes)
      begin
        flow_element.outgoing_scope_for(incoming_scopes)
      rescue NativeException => e
        raise CascadingException.new(e, 'Exception computing outgoing scope')
      end
    end

    def self.primary_key_fields(flow_element, incoming_scopes, scope)
      case flow_element
      when Java::CascadingPipe::Each
        # assert incoming_scopes.size == 1
        project_primary_key(incoming_scopes.first.primary_key_fields,
          incoming_scopes.first.values_fields.to_a,
          scope.values_fields.to_a)
      when Java::CascadingPipe::Every
        # assert incoming_scopes.size == 1
        incoming_scopes.first.primary_key_fields
      when Java::CascadingPipe::GroupBy
        if incoming_scopes.size == 1
          incoming_scopes.first.primary_key_fields
        else
          # We must clear the primary key when unioning multiple inputs. If
          # the programmer wants to preserve the primary key, they must use
          # the primary override.
          nil
        end
      when Java::CascadingPipe::CoGroup
        # FIXME: assume grouping_key_fields are the same for all
        # incoming_scopes. Need join to give me names from all incoming
        # scopes to perform rename on primary key fields.
        union_fields(*incoming_scopes.map{ |s| s.primary_key_fields })
      else raise "No primary key rules for FlowElement of type #{flow_element}"
      end
    end

    def self.project_primary_key(primary_key, old_fields, new_fields)
      return nil if primary_key.nil?
      primary_key = primary_key.to_a
      primary_key if (primary_key & new_fields) == primary_key
    end

    def self.grouping_primary_key_fields(flow_element, incoming_scopes, scope)
      case flow_element
      when Java::CascadingPipe::Each
        # assert incoming_scopes.size == 1
        project_primary_key(incoming_scopes.first.grouping_primary_key_fields,
          incoming_scopes.first.grouping_fields.to_a,
          scope.grouping_fields.to_a)
      when Java::CascadingPipe::Every
        # assert incoming_scopes.size == 1
        incoming_scopes.first.grouping_primary_key_fields
      when Java::CascadingPipe::GroupBy
        scope.grouping_key_fields
      when Java::CascadingPipe::CoGroup
        scope.grouping_key_fields
      else raise "No primary key rules for FlowElement of type #{flow_element}"
      end
    end
  end

  # Register default primary keys
  begin
    Scope.register_scheme_key(Java::CascadingScheme::TextLine, ['offset'])
  rescue NameError => ne
    puts 'WARNING: Could not register primary key for TextLine Scheme as it was not on the class path'
  end
end
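
The guarded registration at the bottom is also the extension point for other schemes. A hedged sketch of the same pattern applied to a different scheme (the SequenceFile class and the 'id' key below are assumptions for illustration, not something the gem ships):

    # Hypothetical: give taps backed by a SequenceFile scheme a default primary
    # key, guarded exactly like the TextLine registration above.
    begin
      Cascading::Scope.register_scheme_key(Java::CascadingScheme::SequenceFile, ['id'])
    rescue NameError
      puts 'WARNING: Could not register primary key for SequenceFile Scheme as it was not on the class path'
    end
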
data/lib/cascading.rb
ADDED
@@ -0,0 +1,63 @@
# Copyright 2009, Grégoire Marabout. All Rights Reserved.
#
# This is free software. Please see the LICENSE and COPYING files for details.

require 'java'

module Cascading
  # :stopdoc:
  VERSION = '0.0.4'
  LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
  PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
  CASCADING_HOME = ENV['CASCADING_HOME']
  HADOOP_HOME = ENV['HADOOP_HOME']

  # :startdoc:

  # Returns the version string for the library.
  #
  def self.version
    VERSION
  end

  # Returns the library path for the module. If any arguments are given,
  # they will be joined to the end of the library path using
  # <tt>File.join</tt>.
  #
  def self.libpath( *args )
    args.empty? ? LIBPATH : ::File.join(LIBPATH, args.flatten)
  end

  # Returns the path for the module. If any arguments are given,
  # they will be joined to the end of the path using
  # <tt>File.join</tt>.
  #
  def self.path( *args )
    args.empty? ? PATH : ::File.join(PATH, args.flatten)
  end

  def self.require_all_jars(from = ::File.join(::File.dirname(__FILE__), "..", "jars"))
    search_me = ::File.expand_path(
      ::File.join(from, '**', '*.jar'))
    Dir.glob(search_me).sort.each do |jar|
      #puts "required: #{jar}"
      require jar
    end
  end
end

Cascading.require_all_jars(Cascading::HADOOP_HOME) if Cascading::HADOOP_HOME
Cascading.require_all_jars(Cascading::CASCADING_HOME) if Cascading::CASCADING_HOME

require 'cascading/assembly'
require 'cascading/base'
require 'cascading/cascade'
require 'cascading/cascading'
require 'cascading/cascading_exception'
require 'cascading/expr_stub'
require 'cascading/flow'
require 'cascading/operations'
require 'cascading/scope'

# Include the module so its methods are available at the top level
include Cascading
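
Because the jar search is driven by CASCADING_HOME and HADOOP_HOME at load time, both must be set before the require. A minimal bootstrap sketch (the install paths below are placeholders, not part of the gem):

    # Hypothetical bootstrap; the directories are placeholders for a local install.
    ENV['CASCADING_HOME'] ||= '/opt/cascading'
    ENV['HADOOP_HOME']    ||= '/opt/hadoop'

    require 'cascading'      # loads every jar beneath both directories, then the DSL

    puts Cascading.version   # => "0.0.4"
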
data/samples/branch.rb
ADDED
@@ -0,0 +1,31 @@
#! /usr/bin/env jruby

$: << File.join(File.dirname(__FILE__), '..', 'lib')

require 'cascading'
require 'samples/cascading'

cascade 'branch' do
  flow 'branch' do
    source 'input', tap('samples/data/data2.txt')

    assembly 'input' do
      split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/

      branch 'branch1' do
        group_by 'score1' do
          count
        end
      end

      branch 'branch2' do
        group_by 'score2' do
          count
        end
      end
    end

    sink 'branch1', tap('output/branch1', :sink_mode => :replace)
    sink 'branch2', tap('output/branch2', :sink_mode => :replace)
  end
end.complete(sample_properties)
data/samples/cascading.rb
ADDED
@@ -0,0 +1,41 @@
module Cascading
  # Constructs properties to be passed to Flow#complete or Cascade#complete
  # which will locate temporary Hadoop files in build/sample. It is necessary
  # to pass these properties only because the sample apps are invoked using
  # JRuby's main method, which confuses the JobConf's attempt to find the
  # containing jar.
  def sample_properties
    build_dir = 'build/sample/build'
    `mkdir -p #{build_dir}`
    tmp_dir = "build/sample/tmp"
    `mkdir -p #{tmp_dir}`
    log_dir = "build/sample/log"
    `mkdir -p #{log_dir}`

    # Local cluster settings
    #java.lang.System.set_property("test.build.data", build_dir)
    #java.lang.System.set_property("hadoop.tmp.dir", tmp_dir)
    #java.lang.System.set_property("hadoop.log.dir", log_dir)
    #conf = Java::OrgApacheHadoopConf::Configuration.new
    #dfs = Java::OrgApacheHadoopDfs::MiniDFSCluster.new(conf, 4, true, nil);
    #file_sys = dfs.file_system
    #mr = Java::OrgApacheHadoopMapred::MiniMRCluster.new(4, file_sys.uri.to_string, 1)
    #job_conf = mr.create_job_conf
    #job_conf.set("mapred.child.java.opts", "-Xmx512m")
    #job_conf.set("mapred.map.tasks.speculative.execution", "false")
    #job_conf.set("mapred.reduce.tasks.speculative.execution", "false")

    job_conf = Java::OrgApacheHadoopMapred::JobConf.new
    job_conf.jar = build_dir
    job_conf.set("test.build.data", build_dir)
    job_conf.set("hadoop.tmp.dir", tmp_dir)
    job_conf.set("hadoop.log.dir", log_dir)

    job_conf.num_map_tasks = 4
    job_conf.num_reduce_tasks = 1

    properties = java.util.HashMap.new({})
    Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
    properties
  end
end
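
The samples hand this map to their trailing complete call (see branch.rb above and copy.rb below). If a run needed different parallelism, the same JobConf-to-properties pattern could be adjusted before completion; a small sketch reusing only the calls shown above (the reducer count is arbitrary):

    # Hypothetical variant of the helper's tail with more reducers.
    job_conf = Java::OrgApacheHadoopMapred::JobConf.new
    job_conf.num_reduce_tasks = 4

    properties = java.util.HashMap.new({})
    Java::CascadingFlow::MultiMapReducePlanner.set_job_conf(properties, job_conf)
    # pass `properties` to Flow#complete or Cascade#complete as in the samples
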
data/samples/copy.rb
ADDED
@@ -0,0 +1,18 @@
#! /usr/bin/env jruby
$: << File.join(File.dirname(__FILE__), '..', 'lib')

require 'cascading'
require 'samples/cascading'

cascade 'copy' do
  flow 'copy' do
    source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')

    assembly 'input' do
      rename 'line' => 'value'
      reject 'value:string.indexOf("R") == -1'
    end

    sink 'input', tap('output/copy', :sink_mode => :replace)
  end
end.complete(sample_properties)