wukong 1.4.2 → 1.4.5
- data/CHANGELOG.textile +9 -0
- data/bin/hdp-stream2 +2 -2
- data/examples/contrib/jeans/README.markdown +165 -0
- data/examples/contrib/jeans/data/normalized_sizes +3 -0
- data/examples/contrib/jeans/data/orders.tsv +1302 -0
- data/examples/contrib/jeans/data/sizes +3 -0
- data/examples/contrib/jeans/normalize.rb +20 -0
- data/examples/contrib/jeans/sizes.rb +55 -0
- data/examples/foo.rb +9 -0
- data/examples/word_count.rb +1 -1
- data/lib/wukong.rb +1 -1
- data/lib/wukong/extensions/hash.rb +52 -18
- data/lib/wukong/schema.rb +10 -10
- data/lib/wukong/script.rb +77 -97
- data/lib/wukong/script/hadoop_command.rb +21 -19
- data/lib/wukong/script/local_command.rb +9 -1
- data/lib/wukong/streamer/base.rb +1 -1
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/wukong/encoding_spec.rb +36 -0
- data/spec/wukong/script_spec.rb +80 -0
- data/wukong.gemspec +23 -24
- metadata +18 -5
- data/lib/wukong/boot.rb +0 -47
- data/spec/bin/hdp-wc_spec.rb +0 -4
data/examples/contrib/jeans/normalize.rb
ADDED
@@ -0,0 +1,20 @@
+#!/usr/bin/env ruby
+# run like so:
+# $> ruby normalize.rb --run=local data/sizes.tsv data/normalized_sizes.tsv
+require 'rubygems'
+require 'wukong'
+require 'active_support/core_ext/enumerable' # for array#sum
+
+module Normalize
+  class Mapper < Wukong::Streamer::RecordStreamer
+    def process(country, *sizes)
+      sizes.map!(&:to_i)
+      sum        = sizes.sum.to_f
+      normalized = sizes.map{|x| 100 * x/sum }
+      s          = normalized.join(",")
+      yield [country, s]
+    end
+  end
+end
+
+Wukong::Script.new(Normalize::Mapper, nil).run
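The mapper's arithmetic is easy to sanity-check outside of Hadoop. A minimal sketch in plain Ruby, with illustrative values only:

    sizes = %w[2 6 2].map(&:to_i)          # one country's raw size counts
    sum   = sizes.sum.to_f                 # => 10.0
    sizes.map{|x| 100 * x/sum }.join(",")
    # => "20.0,60.0,20.0"

Each record is rescaled so a country's size counts come out as percentages of its total.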
data/examples/contrib/jeans/sizes.rb
ADDED
@@ -0,0 +1,55 @@
+#!/usr/bin/env ruby
+# run like so:
+# $> ruby sizes.rb --run=local data/orders.tsv data/sizes
+require 'rubygems'
+require 'wukong'
+
+module JeanSizes
+  class Mapper < Wukong::Streamer::RecordStreamer
+    def process(code,model,time,country,reg,col, n1,c1, venue,n3,n4, *sizes)
+      yield [country, *sizes]
+    end
+  end
+
+  #
+  # This uses a ListReducer. It's nice and simple, but requires first
+  # accumulating each key's records in memory.
+  #
+  class JeansListReducer < Wukong::Streamer::ListReducer
+    def finalize
+      return if values.empty?
+      sums = []; 13.times{ sums << 0 }
+      values.each do |country, *sizes|
+        sizes.map!(&:to_i)
+        sums = sums.zip(sizes).map{|sum, val| sum + val }
+      end
+      yield [key, *sums]
+    end
+  end
+
+
+  #
+  # This uses an AccumulatingReducer directly.
+  # It has the advantage of a minimal footprint.
+  #
+  class JeansAccumulatingReducer < Wukong::Streamer::AccumulatingReducer
+    attr_accessor :sums
+
+    # start the sum with 0 for each size
+    def start! *_
+      self.sums = []; 13.times{ self.sums << 0 }
+    end
+    # accumulate each size count into the sizes_sum
+    def accumulate country, *sizes
+      sizes.map!(&:to_i)
+      self.sums = self.sums.zip(sizes).map{|sum, val| sum + val }
+    end
+    # emit [country, size_0_sum, size_1_sum, ...]
+    def finalize
+      yield [key, sums].flatten
+    end
+  end
+
+end
+
+Wukong::Script.new(JeanSizes::Mapper, JeanSizes::JeansListReducer).run
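Note that only the ListReducer is wired up in the script's final line; to try the memory-frugal accumulating variant instead, swap the reducer class passed to Wukong::Script:

    Wukong::Script.new(JeanSizes::Mapper, JeanSizes::JeansAccumulatingReducer).run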
data/examples/foo.rb
ADDED
data/examples/word_count.rb
CHANGED
data/lib/wukong.rb
CHANGED
@@ -1,9 +1,9 @@
-require 'wukong/boot'
 require 'wukong/extensions'
 require 'wukong/datatypes'
 require 'wukong/logger'
 require 'wukong/bad_record'
 autoload :TypedStruct, 'wukong/typed_struct'
+require 'configliere'; Configliere.use :define
 module Wukong
   autoload :Dfs,    'wukong/dfs'
   autoload :Script, 'wukong/script'
data/lib/wukong/extensions/hash.rb
CHANGED
@@ -40,9 +40,9 @@ class Hash
   end

   # lambda for recursive merges
-  Hash::DEEP_MERGER = proc do |key,v1,v2|
+  ::Hash::DEEP_MERGER = proc do |key,v1,v2|
     (v1.respond_to?(:merge) && v2.respond_to?(:merge)) ? v1.merge(v2.compact, &Hash::DEEP_MERGER) : (v2.nil? ? v1 : v2)
-  end
+  end unless defined?(::Hash::DEEP_MERGER)

   #
   # Merge hashes recursively.
@@ -72,36 +72,60 @@ class Hash
     merge! hsh2, &Hash::DEEP_MERGER
   end

-
   #
   # Treat hash as tree of hashes:
   #
   #    x = { 1 => :val, :subhash => { 1 => :val1 } }
-  #    x.deep_set(:subhash,
-  #    # => { 1 => :val, :subhash => { 1 => :val1,
+  #    x.deep_set(:subhash, :cat, :hat)
+  #    # => { 1 => :val, :subhash => { 1 => :val1, :cat => :hat } }
   #    x.deep_set(:subhash, 1, :newval)
-  #    # => { 1 => :val, :subhash => { 1 => :newval,
+  #    # => { 1 => :val, :subhash => { 1 => :newval, :cat => :hat } }
   #
   #
   def deep_set *args
-
-
-
-
-    # grab last subtree (building out if necessary)
-    head_keys.each{|key| hsh = (hsh[key] ||= {}) }
+    val      = args.pop
+    last_key = args.pop
+    # dig down to last subtree (building out if necessary)
+    hsh = args.empty? ? self : args.inject(self){|hsh, key| hsh[key] ||= {} }
     # set leaf value
     hsh[last_key] = val
   end

-  #
-
-
+  #
+  # Treat hash as tree of hashes:
+  #
+  #    x = { 1 => :val, :subhash => { 1 => :val1 } }
+  #    x.deep_get(:subhash, 1)
+  #    # => :val
+  #    x.deep_get(:subhash, 2)
+  #    # => nil
+  #    x.deep_get(:subhash, 2, 3)
+  #    # => nil
+  #    x.deep_get(:subhash, 2)
+  #    # => nil
+  #
+  def deep_get *args
+    last_key = args.pop
+    # dig down to last subtree (building out if necessary)
+    hsh = args.inject(self){|hsh, key| hsh[key] || {} }
+    # get leaf value
+    hsh[last_key]
   end

-
-
-
+
+  #
+  # Treat hash as tree of hashes:
+  #
+  #    x = { 1 => :val, :subhash => { 1 => :val1, 2 => :val2 } }
+  #    x.deep_delete(:subhash, 1)
+  #    #=> :val
+  #    x
+  #    #=> { 1 => :val, :subhash => { 2 => :val2 } }
+  #
+  def deep_delete *args
+    last_key = args.pop
+    last_hsh = args.empty? ? self : (deep_get(*args)||{})
+    last_hsh.delete(last_key)
   end

   #
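Taken together, the three tree helpers behave as in this irb-style sketch (assembled from the doc comments above; values illustrative):

    x = { 1 => :val, :subhash => { 1 => :val1 } }
    x.deep_set(:subhash, :cat, :hat)     # x[:subhash] is now { 1 => :val1, :cat => :hat }
    x.deep_get(:subhash, :cat)           # => :hat
    x.deep_get(:subhash, :dog, :bone)    # => nil; missing branches don't raise
    x.deep_delete(:subhash, :cat)        # => :hat, and the leaf is removed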
@@ -117,4 +141,14 @@ class Hash
     replace(compact)
   end

+  # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
+  def reverse_merge(other_hash)
+    other_hash.merge(self)
+  end
+
+  # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
+  def reverse_merge!(other_hash)
+    replace(reverse_merge(other_hash))
+  end
+
 end
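As the name suggests, reverse_merge fills in defaults without clobbering keys that are already set. For example:

    { :run => 'local' }.reverse_merge(:run => 'hadoop', :dry_run => false)
    # => { :run => 'local', :dry_run => false }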
data/lib/wukong/schema.rb
CHANGED
@@ -9,16 +9,16 @@ class << Integer ; def to_sql() 'INT' end ; end
 class << Bignum     ; def to_sql() 'BIGINT' end ; end
 class << String     ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
 class << Symbol     ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
-class << BigDecimal ; def
-class << EpochTime  ; def
-class << FilePath   ; def
-class << Flag       ; def
-class << IPAddress  ; def
-class << URI        ; def
-class << Csv        ; def
-class << Yaml       ; def
-class << Json       ; def
-class << Regex      ; def
+class << BigDecimal ; def to_sql() 'DECIMAL' end ; end if defined?(BigDecimal)
+class << EpochTime  ; def to_sql() 'INT' end ; end if defined?(EpochTime)
+class << FilePath   ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
+class << Flag       ; def to_sql() 'CHAR(1) CHARACTER SET ASCII' end ; end if defined?(Flag)
+class << IPAddress  ; def to_sql() 'CHAR(15) CHARACTER SET ASCII' end ; end if defined?(IPAddress)
+class << URI        ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
+class << Csv        ; def to_sql() 'TEXT' end ; end if defined?(Csv)
+class << Yaml       ; def to_sql() 'TEXT' end ; end if defined?(Yaml)
+class << Json       ; def to_sql() 'TEXT' end ; end if defined?(Json)
+class << Regex      ; def to_sql() 'TEXT' end ; end if defined?(Regex)
 class String        ; def to_sql() self ; end ; end
 class Symbol        ; def to_sql() self.to_s.upcase ; end ; end

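With these singleton methods in place, a field's Ruby type maps straight to a SQL column type. A quick sketch of the effect:

    Integer.to_sql    # => 'INT'
    Symbol.to_sql     # => 'VARCHAR(255) CHARACTER SET ASCII'
    :varchar.to_sql   # => 'VARCHAR'  (the instance form upcases the symbol)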
data/lib/wukong/script.rb
CHANGED
@@ -1,7 +1,8 @@
 require 'pathname'
 require 'wukong/script/hadoop_command'
 require 'wukong/script/local_command'
-require '
+require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
+require 'rbconfig' # for uncovering ruby_interpreter_path
 module Wukong

 # == How to run a Wukong script
@@ -58,6 +59,38 @@ module Wukong
     include Wukong::LocalCommand
     attr_accessor :mapper_klass, :reducer_klass, :options

+    # ---------------------------------------------------------------------------
+    #
+    # Default options for Wukong
+    #   http://github.com/infochimps/wukong
+    #
+    # If you set an environment variable WUKONG_CONFIG, *or* if the file
+    # $HOME/.wukong.rb exists, that file will be +require+'d as well.
+    #
+    # Important values to set:
+    #
+    # * hadoop_home -- Path to root of hadoop install. If your hadoop runner is
+    #     /usr/local/share/hadoop/bin/hadoop
+    #   then your hadoop_home is
+    #     /usr/local/share/hadoop.
+    #   You can also set a :hadoop_runner that gives the full path to the hadoop script
+    #
+    # * default_run_mode -- Whether to run using hadoop (and
+    #   thus, requiring a working hadoop install), or to run in local mode
+    #   (script --map | sort | script --reduce)
+    #
+    Settings.define :default_run_mode, :default => 'hadoop',   :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
+    Settings.define :default_mapper,   :default => '/bin/cat', :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
+    Settings.define :default_reducer,  :default => '/bin/cat', :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
+    Settings.define :hadoop_home,      :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
+    Settings.define :hadoop_runner,    :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
+    Settings.define :map,              :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
+    Settings.define :reduce,           :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
+    Settings.define :run,              :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
+    Settings.define :local,            :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
+    Settings.define :hadoop,           :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
+    Settings.define :dry_run,          :description => "echo the command that will be run, but don't run it", :wukong => true
+
     #
     # Instantiate the Script with the Mapper and the Reducer class (each a
     # Wukong::Streamer) it should call back.
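Because the defaults now live in Configliere's Settings, a script sees the merged result of these defaults, any WUKONG_CONFIG / ~/.wukong.rb file, environment variables, and command-line flags. A rough sketch of reading them, assuming Configliere's usual API:

    Settings.resolve!
    Settings[:default_run_mode]   # => 'hadoop' unless overridden on the command line
    Settings[:hadoop_home]        # => ENV['HADOOP_HOME'] || '/usr/lib/hadoop'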
@@ -86,8 +119,9 @@ module Wukong
     #   MyScript.new(MyMapper, nil).run
     #
     def initialize mapper_klass, reducer_klass, extra_options={}
-      self.options =
-
+      self.options = Settings.dup
+      options.resolve!
+      options.merge! extra_options
       self.mapper_klass  = mapper_klass
       self.reducer_klass = reducer_klass
       # If no reducer_klass and no reduce_command, then skip the reduce phase
@@ -97,65 +131,12 @@ module Wukong
     #
     # Gives default options. Command line parameters take precedence
     #
-    # MAKE SURE YOU CALL SUPER: write your script according to the
+    # MAKE SURE YOU CALL SUPER: write your script according to the pattern
     #
     #   super.merge :my_option => :val
     #
     def default_options
-
-    end
-
-    # Options that don't need to go in the :all_args hash
-    def std_options
-      @std_options ||= [:run, :map, :reduce, ] + HADOOP_OPTIONS_MAP.keys
-    end
-
-    #
-    # Parse the command-line args into the options hash.
-    #
-    # I should not reinvent the wheel.
-    # Yet: here we are.
-    #
-    # '--foo=foo_val' produces :foo => 'foo_val' in the options hash.
-    # '--' After seeing a non-'--' flag, or a '--' on its own, no further flags are parsed
-    #
-    # options[:all_args] contains all arguments that are not in std_options
-    # options[:rest] contains all arguments following the first non-flag (or the '--')
-    #
-    def process_argv!
-      options[:all_args] = []
-      options[:rest]     = []
-      args = ARGV.dup
-      while (! args.blank?) do
-        arg = args.shift
-        case
-        when arg == '--'
-          options[:rest] += args
-        when arg =~ /\A--(\w+)(?:=(.+))?\z/
-          opt, val = [$1, $2]
-          opt = opt.to_sym
-          val ||= true
-          self.options[opt] = val
-          options[:all_args] << arg unless std_options.include?(opt)
-        else
-          options[:all_args] << arg
-          options[:rest]     << arg
-        end
-        # p [options, arg, args]
-      end
-      options[:all_args] = options[:all_args].join(" ")
-    end
-
-    def this_script_filename
-      Pathname.new($0).realpath
-    end
-
-    def ruby_interpreter_path
-      Pathname.new(
-        File.join(Config::CONFIG["bindir"],
-          Config::CONFIG["RUBY_INSTALL_NAME"]+
-          Config::CONFIG["EXEEXT"])
-      ).realpath
+      {}
     end

     #
@@ -164,8 +145,8 @@ module Wukong
     def map_command
       case
      when mapper_klass
-        "#{ruby_interpreter_path} #{this_script_filename} --map " +
-      else options[:map_command] ||
+        "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+      else options[:map_command] || options[:default_mapper] end
     end

     #
@@ -175,8 +156,8 @@ module Wukong
     def reduce_command
       case
      when reducer_klass
-        "#{ruby_interpreter_path} #{this_script_filename} --reduce " +
-      else options[:reduce_command] ||
+        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+      else options[:reduce_command] || options[:default_reducer] end
     end

     #
@@ -197,14 +178,16 @@ module Wukong
     end

     def run_mode
+      return 'local'  if options[:local]
+      return 'hadoop' if options[:hadoop]
       # if only --run is given, assume default run mode
-      options[:run] =
+      options[:run] = options[:default_run_mode] if (options[:run] == true)
       options[:run].to_s
     end

     def input_output_paths
       # input / output paths
-      input_path, output_path = options
+      input_path, output_path = options.rest[0..1]
       raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_path.blank? || output_path.blank?)
       [input_path, output_path]
     end
@@ -216,6 +199,29 @@ module Wukong
       end
     end

+    # Reassemble all the non-internal-to-wukong options into a command line for
+    # the map/reducer phase scripts
+    def non_wukong_params
+      options.
+        reject{|param, val| options.param_definitions[param][:wukong] }.
+        map{|param,val| "--#{param}=#{val}" }.
+        join(" ")
+    end
+
+    # the full, real path to the script file
+    def this_script_filename
+      Pathname.new($0).realpath
+    end
+
+    # use the full ruby interpreter path to run slave processes
+    def ruby_interpreter_path
+      Pathname.new(
+        File.join(Config::CONFIG["bindir"],
+          Config::CONFIG["RUBY_INSTALL_NAME"]+
+          Config::CONFIG["EXEEXT"])
+      ).realpath
+    end
+
     #
     # Execute the runner phase
     #
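The net effect is that script-specific flags ride along to the child map/reduce processes while wukong's own flags (those tagged :wukong => true above) are stripped. A hypothetical illustration, with an invented --min_count flag and paths:

    # Given: your_script.rb --run --min_count=3 input.tsv output.tsv
    # non_wukong_params  # => "--min_count=3"
    # map_command        # => "/usr/bin/ruby /path/to/your_script.rb --map --min_count=3"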
@@ -243,41 +249,15 @@ module Wukong
       when options[:run]
         exec_hadoop_streaming
       else
-
+        options.dump_help %Q{Please specify a run mode: you probably want to start with
+          #{$0} --run --local input.tsv output.tsv
+        although
+          cat input.tsv | #{$0} --map > mapped.tsv
+        or
+          cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
+        can be useful for initial testing.}
       end
     end
-
-    #
-    # Command line usage
-    #
-    def help
-      $stderr.puts "#{self.class} script"
-      $stderr.puts %Q{
-      #{$0} --run=hadoop input_hdfs_path output_hdfs_dir   # run the script with hadoop streaming
-      #{$0} --run=local  input_hdfs_path output_hdfs_dir   # run the script on local filesystem using unix pipes
-      #{$0} --run        input_hdfs_path output_hdfs_dir   # run the script with the mode given in config/wukong*.yaml
-      #{$0} --map
-      #{$0} --reduce                                       # dispatch to the mapper or reducer
-
-      All flags must precede the input and output paths.
-      Additional flags:
-        --dry_run
-      Hadoop Options (see hadoop documentation)
-        --max_node_map_tasks     => 'mapred.tasktracker.map.tasks.maximum',
-        --max_node_reduce_tasks  => 'mapred.tasktracker.reduce.tasks.maximum',
-        --map_tasks              => 'mapred.map.tasks',
-        --reduce_tasks           => 'mapred.reduce.tasks',
-        --sort_fields            => 'stream.num.map.output.key.fields',
-        --key_field_separator    => 'map.output.key.field.separator',
-        --partition_fields       => 'num.key.fields.for.partition',
-        --output_field_separator => 'stream.map.output.field.separator',
-        --map_speculative        => 'mapred.map.tasks.speculative.execution',
-        --timeout                => 'mapred.task.timeout',
-        --reuse_jvms             => 'mapred.job.reuse.jvm.num.tasks',
-        --ignore_exit_status     => 'stream.non.zero.exit.status.is.failure',
-      You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
-      }
-    end
   end

 end