wukong-hadoop 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +5 -0
- data/Gemfile +2 -0
- data/LICENSE.md +95 -0
- data/Rakefile +0 -3
- data/bin/wu-hadoop +1 -11
- data/examples/counter.rb +1 -1
- data/lib/wukong-hadoop.rb +73 -6
- data/lib/wukong-hadoop/runner.rb +210 -0
- data/lib/wukong-hadoop/{driver → runner}/hadoop_invocation.rb +2 -7
- data/lib/wukong-hadoop/{driver → runner}/local_invocation.rb +0 -0
- data/lib/wukong-hadoop/{driver → runner}/map_logic.rb +4 -2
- data/lib/wukong-hadoop/runner/overwritables.rb +35 -0
- data/lib/wukong-hadoop/{driver → runner}/reduce_logic.rb +3 -2
- data/lib/wukong-hadoop/version.rb +1 -1
- data/spec/spec_helper.rb +11 -4
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +11 -19
- data/spec/wukong-hadoop/local_mode_spec.rb +6 -6
- data/spec/wukong-hadoop/{driver_spec.rb → runner_spec.rb} +35 -33
- data/spec/wukong-hadoop/{wu_hadoop_spec.rb → wu-hadoop_spec.rb} +4 -4
- data/spec/wukong-hadoop_spec.rb +5 -0
- data/wukong-hadoop.gemspec +1 -1
- metadata +18 -19
- data/lib/wukong-hadoop/configuration.rb +0 -137
- data/lib/wukong-hadoop/driver.rb +0 -191
- data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +0 -27
- data/spec/support/driver_helper.rb +0 -15
- data/spec/support/integration_helper.rb +0 -39
@@ -1,137 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Hadoop
|
3
|
-
|
4
|
-
# Configure the given settings object for use with Wukong::Hadoop.
|
5
|
-
#
|
6
|
-
# @param [Configliere::Param] settings the settings to configure
|
7
|
-
# @return [Configliere::Param] the configured settings
|
8
|
-
def self.configure settings
|
9
|
-
# Hadoop Options
|
10
|
-
settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
|
11
|
-
settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
|
12
|
-
|
13
|
-
# Translate simplified args to their hairy hadoop equivalents
|
14
|
-
settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
|
15
|
-
settings.define :io_sort_record_percent, wukong_hadoop: true, jobconf: true, description: 'io.sort.record.percent'
|
16
|
-
settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
|
17
|
-
settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
|
18
|
-
settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
|
19
|
-
settings.define :reduce_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
|
20
|
-
settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
|
21
|
-
settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
|
22
|
-
settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
|
23
|
-
settings.define :max_node_map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.map.tasks.maximum'
|
24
|
-
settings.define :max_node_reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.reduce.tasks.maximum'
|
25
|
-
settings.define :max_record_length, wukong_hadoop: true, jobconf: true, description: 'mapred.linerecordreader.maxlength'
|
26
|
-
settings.define :max_reduces_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.cluster'
|
27
|
-
settings.define :max_reduces_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.node'
|
28
|
-
settings.define :max_tracker_failures, wukong_hadoop: true, jobconf: true, description: 'mapred.max.tracker.failures'
|
29
|
-
settings.define :max_map_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.map.max.attempts'
|
30
|
-
settings.define :max_reduce_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.max.attempts'
|
31
|
-
settings.define :min_split_size, wukong_hadoop: true, jobconf: true, description: 'mapred.min.split.size'
|
32
|
-
settings.define :output_field_separator, wukong_hadoop: true, jobconf: true, description: 'stream.map.output.field.separator'
|
33
|
-
settings.define :partition_fields, wukong_hadoop: true, jobconf: true, description: 'num.key.fields.for.partition'
|
34
|
-
settings.define :reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks'
|
35
|
-
settings.define :respect_exit_status, wukong_hadoop: true, jobconf: true, description: 'stream.non.zero.exit.is.failure'
|
36
|
-
settings.define :reuse_jvms, wukong_hadoop: true, jobconf: true, description: 'mapred.job.reuse.jvm.num.tasks'
|
37
|
-
settings.define :sort_fields, wukong_hadoop: true, jobconf: true, description: 'stream.num.map.output.key.fields'
|
38
|
-
settings.define :timeout, wukong_hadoop: true, jobconf: true, description: 'mapred.task.timeout'
|
39
|
-
settings.define :noempty, wukong_hadoop: true, description: "Don't create zero-byte reduce files"
|
40
|
-
settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
|
41
|
-
settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
|
42
|
-
settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
|
43
|
-
settings.define :java_opts, wukong_hadoop: true, description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
|
44
|
-
settings.define :files, wukong_hadoop: true, description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
|
45
|
-
settings.define :jars, wukong_hadoop: true, description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
|
46
|
-
settings.define :archives, wukong_hadoop: true, description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []
|
47
|
-
|
48
|
-
# Options given on the command-line
|
49
|
-
settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
|
50
|
-
settings.define :map_command, description: "Shell command to run as mapper, in place of a constructed wu-local command", wukong_hadoop: true
|
51
|
-
settings.define :reduce_command, description: "Shell command to run as reducer, in place of a constructed wu-local command", wukong_hadoop: true
|
52
|
-
settings.define :sort_command, description: "Shell command to run as sorter (only in `local' mode)", wukong_hadoop: true, :default => 'sort'
|
53
|
-
settings.define :command_prefix, description: "Prefex to insert before all Wukong commands", wukong_hadoop: true
|
54
|
-
settings.define :mapper, description: "Name of processor to use as a mapper", wukong_hadoop: true
|
55
|
-
settings.define :reducer, description: "Name of processor to use as a reducer", wukong_hadoop: true
|
56
|
-
settings.define :gemfile, description: "Specify an alternative Gemfile to execute this wukong script with", wukong_hadoop: true
|
57
|
-
settings.define :dry_run, description: "Echo the command that will be run, but don't run it", wukong_hadoop: true, :type => :boolean, :default => false
|
58
|
-
settings.define :rm, description: "Recursively remove the destination directory.", wukong_hadoop: true, :type => :boolean, :default => false
|
59
|
-
settings.define :input, description: "Comma-separated list of input paths", wukong_hadoop: true
|
60
|
-
settings.define :output, description: "Output path.", wukong_hadoop: true
|
61
|
-
|
62
|
-
settings.use(:commandline)
|
63
|
-
|
64
|
-
def settings.usage()
|
65
|
-
"usage: #{File.basename($0)} PROCESSOR|FLOW [PROCESSOR|FLOW] [ --param=value | -p value | --param | -p]"
|
66
|
-
end
|
67
|
-
|
68
|
-
settings.description = <<EOF
|
69
|
-
wu-hadoop is a tool to model and launch Wukong processors as
|
70
|
-
map/reduce workflows within the Hadoop framework.
|
71
|
-
|
72
|
-
Use wu-hadoop with existing processors in `local' mode to test the
|
73
|
-
logic of your job, reading from the specified --input and printing to
|
74
|
-
STDOUT:
|
75
|
-
|
76
|
-
$ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt
|
77
|
-
a 2
|
78
|
-
all 1
|
79
|
-
and 2
|
80
|
-
...
|
81
|
-
|
82
|
-
where it is assumed that your mapper is called 'mapper' and your
|
83
|
-
reducer 'reducer'. You can also cat in data:
|
84
|
-
|
85
|
-
$ cat examples/sonnet_18.txt | wu-hadoop examples/word_count.rb --mode=local
|
86
|
-
|
87
|
-
Or pass options directly:
|
88
|
-
|
89
|
-
$ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt --fold_case --min_length=3
|
90
|
-
all 1
|
91
|
-
and 5
|
92
|
-
art 1
|
93
|
-
brag 1
|
94
|
-
...
|
95
|
-
|
96
|
-
Or define both processors in separate files:
|
97
|
-
|
98
|
-
$ wu-hadoop examples/tokenizer.rb examples/counter.rb --mode=local --input=examples/sonnet_18.txt
|
99
|
-
|
100
|
-
Or by name:
|
101
|
-
|
102
|
-
$ wu-hadoop examples/processors.rb --mode=local --input=examples/sonnet_18.txt --mapper=tokenizer --reducer=counter
|
103
|
-
|
104
|
-
Or just by command:
|
105
|
-
|
106
|
-
$ wu-hadoop processors.rb --mapper=tokenizer --reduce_command='uniq -c' ...
|
107
|
-
$ wu-hadoop processors.rb --map_command='cut -f3' --reducer=counter ...
|
108
|
-
$ wu-hadoop --map_command='cut -f3' --reduce_command='uniq -c' ...
|
109
|
-
|
110
|
-
If you don't specify a --reducer explicitly, and you didn't give two
|
111
|
-
separate arguments, and no processor named :reducer exists in the
|
112
|
-
environment, then we assume you are launching a map-only job and
|
113
|
-
'mapred.tasktracker.reduce.tasks.maximum' will correspondingly be set
|
114
|
-
to 0:
|
115
|
-
|
116
|
-
$ wu-hadoop examples/tokenizer.rb --mode=local --input=examples/sonnet_18.txt
|
117
|
-
Shall
|
118
|
-
I
|
119
|
-
compare
|
120
|
-
thee
|
121
|
-
...
|
122
|
-
|
123
|
-
You can achieve this directly with the --reduce_tasks=0 option.
|
124
|
-
|
125
|
-
Many other Hadoop options have been wrapped with similarly friendly
|
126
|
-
names below. These are ignored when running in `local' mode.
|
127
|
-
|
128
|
-
Some options (like `--sort_command') only make sense in `local' mode.
|
129
|
-
These are ignored in `hadoop' mode.
|
130
|
-
EOF
|
131
|
-
settings
|
132
|
-
end
|
133
|
-
|
134
|
-
# All Hadoop configuration for Wukong lives within this object.
|
135
|
-
Configuration = configure(Configliere::Param.new) unless defined? Configuration
|
136
|
-
end
|
137
|
-
end
|
data/lib/wukong-hadoop/driver.rb
DELETED
@@ -1,191 +0,0 @@
|
|
1
|
-
require 'shellwords'
|
2
|
-
require_relative("driver/inputs_and_outputs")
|
3
|
-
require_relative("driver/map_logic")
|
4
|
-
require_relative("driver/reduce_logic")
|
5
|
-
require_relative("driver/local_invocation")
|
6
|
-
require_relative("driver/hadoop_invocation")
|
7
|
-
|
8
|
-
module Wukong
|
9
|
-
module Hadoop
|
10
|
-
|
11
|
-
# The <tt>Hadoop::Driver</tt> class contains the logic to examine
|
12
|
-
# arguments and construct command lines which it will execute to
|
13
|
-
# create the desired behavior.
|
14
|
-
#
|
15
|
-
# The Hadoop::Driver will introspect on its arguments to guess (if
|
16
|
-
# not given) the processors to use as mapper and reducer in a
|
17
|
-
# map/reduce job. It will also decide whether to run that job in
|
18
|
-
# local or Hadoop mode. These decisions result in a command which
|
19
|
-
# it will ultimately execute.
|
20
|
-
class Driver < Wukong::Driver
|
21
|
-
|
22
|
-
include InputsAndOutputs
|
23
|
-
include MapLogic
|
24
|
-
include ReduceLogic
|
25
|
-
include HadoopInvocation
|
26
|
-
include LocalInvocation
|
27
|
-
include Logging
|
28
|
-
|
29
|
-
# The settings used by this driver.
|
30
|
-
#
|
31
|
-
# @param [Configliere::Param]
|
32
|
-
attr_accessor :settings
|
33
|
-
|
34
|
-
# The (processed) arguments for this driver.
|
35
|
-
#
|
36
|
-
# @param [Array<String, Pathname>]
|
37
|
-
attr_reader :args
|
38
|
-
|
39
|
-
# Initialize and run a new Wukong::Hadoop::Driver for the given
|
40
|
-
# +settings+.
|
41
|
-
#
|
42
|
-
# Will rescue all Wukong::Error exceptions by printing a nice
|
43
|
-
# message to STDERR and exiting.
|
44
|
-
#
|
45
|
-
# @param [Configliere::Param] settings
|
46
|
-
# @param [Array<String>] extra_args
|
47
|
-
def self.run(settings, *extra_args)
|
48
|
-
begin
|
49
|
-
new(settings, *extra_args).run!
|
50
|
-
rescue Wukong::Error => e
|
51
|
-
$stderr.puts e.message
|
52
|
-
exit(127)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
# Run this driver.
|
57
|
-
def run!
|
58
|
-
if mode == :local
|
59
|
-
# log.info "Launching local!"
|
60
|
-
execute_command!(local_commandline)
|
61
|
-
else
|
62
|
-
ensure_input_and_output!
|
63
|
-
remove_output_path! if settings[:rm] || settings[:overwrite]
|
64
|
-
log.info "Launching Hadoop!"
|
65
|
-
execute_command!(hadoop_commandline)
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
|
-
# Initialize a new driver with the given +settings+ and +args+.
|
70
|
-
#
|
71
|
-
# @param [Configliere::Param] settings
|
72
|
-
# @param [Array<String>] args
|
73
|
-
def initialize(settings, *args)
|
74
|
-
@settings = settings
|
75
|
-
self.args = args
|
76
|
-
end
|
77
|
-
|
78
|
-
# Set the +args+ for this driver.
|
79
|
-
#
|
80
|
-
# Arguments can be either (registered) processor names or files.
|
81
|
-
#
|
82
|
-
# An error will be raised on missing files or those which
|
83
|
-
# couldn't be loaded.
|
84
|
-
#
|
85
|
-
# An error will be raised if more than two arguments (mapper and
|
86
|
-
# reducer) are passed.
|
87
|
-
#
|
88
|
-
# @param [Array<String>] args
|
89
|
-
def args= args
|
90
|
-
raise Error.new("Cannot provide more than two arguments") if args.length > 2
|
91
|
-
@args = args.map do |arg|
|
92
|
-
if processor_registered?(arg)
|
93
|
-
arg
|
94
|
-
else
|
95
|
-
begin
|
96
|
-
rp = Pathname.new(arg).realpath
|
97
|
-
load rp
|
98
|
-
rp
|
99
|
-
rescue => e
|
100
|
-
raise Error.new("No such processor or file: #{arg}")
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
# What mode is this driver in?
|
107
|
-
#
|
108
|
-
# @return [:hadoop, :local]
|
109
|
-
def mode
|
110
|
-
settings[:mode].to_s == 'local' ? :local : :hadoop
|
111
|
-
end
|
112
|
-
|
113
|
-
# Were mapper and/or reducer named by a single argument?
|
114
|
-
#
|
115
|
-
# @return [true, false]
|
116
|
-
def single_job_arg?
|
117
|
-
args.size == 1
|
118
|
-
end
|
119
|
-
|
120
|
-
# Were mapper and/or reducer named by separate arguments?
|
121
|
-
#
|
122
|
-
# @return [true, false]
|
123
|
-
def separate_map_and_reduce_args?
|
124
|
-
args.size == 2
|
125
|
-
end
|
126
|
-
|
127
|
-
# Is there a processor registered with the given +name+?
|
128
|
-
#
|
129
|
-
# @param [#to_s] name
|
130
|
-
# @return [true, false]
|
131
|
-
def processor_registered? name
|
132
|
-
Wukong.registry.registered?(name.to_s.to_sym)
|
133
|
-
end
|
134
|
-
|
135
|
-
# Return the guessed name of a processor at the given +path+.
|
136
|
-
#
|
137
|
-
# @param [String] path
|
138
|
-
# @return [String]
|
139
|
-
def processor_name_from_file(path)
|
140
|
-
File.basename(path, '.rb')
|
141
|
-
end
|
142
|
-
|
143
|
-
# Does the given +path+ contain a processor named after itself?
|
144
|
-
#
|
145
|
-
# @param [String] path
|
146
|
-
# @return [true, false]
|
147
|
-
def file_is_processor?(path)
|
148
|
-
processor_registered?(processor_name_from_file(path))
|
149
|
-
end
|
150
|
-
|
151
|
-
# The prefix to insert before all invocations of the
|
152
|
-
# <tt>wu-local</tt> runner.
|
153
|
-
#
|
154
|
-
# @return [String]
|
155
|
-
def command_prefix
|
156
|
-
settings[:command_prefix]
|
157
|
-
end
|
158
|
-
|
159
|
-
# Returns parameters to pass to an invocation of
|
160
|
-
# <tt>wu-local</tt>.
|
161
|
-
#
|
162
|
-
# Parameters like <tt>--reduce_tasks</tt> which are relevant to
|
163
|
-
# Wukong-Hadoop will be interpreted and *not* passed. Others
|
164
|
-
# will be passed unmodified.
|
165
|
-
#
|
166
|
-
# @return [String]
|
167
|
-
def params_to_pass
|
168
|
-
s = (Wukong.loaded_deploy_pack? ? Deploy.pre_deploy_settings : settings)
|
169
|
-
s.reject{ |param, val| s.definition_of(param, :wukong_hadoop) }.map{ |param,val| "--#{param}=#{Shellwords.escape(val.to_s)}" }.join(" ")
|
170
|
-
end
|
171
|
-
|
172
|
-
# Execute a command composed of the given parts.
|
173
|
-
#
|
174
|
-
# Will print the command instead if the <tt>--dry_run</tt>
|
175
|
-
# option was given.
|
176
|
-
#
|
177
|
-
# @param [Array<String>] args
|
178
|
-
def execute_command!(*args)
|
179
|
-
command = args.flatten.reject(&:blank?).join(" \\\n ")
|
180
|
-
if settings[:dry_run]
|
181
|
-
log.info("Dry run:")
|
182
|
-
puts command
|
183
|
-
else
|
184
|
-
puts `#{command}`
|
185
|
-
raise "Streaming command failed!" unless $?.success?
|
186
|
-
end
|
187
|
-
end
|
188
|
-
|
189
|
-
end
|
190
|
-
end
|
191
|
-
end
|
@@ -1,27 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Hadoop
|
3
|
-
|
4
|
-
# Provides methods for determining input and output paths.
|
5
|
-
# Written as a separate module to allow easy overriding from other
|
6
|
-
# plugins.
|
7
|
-
module InputsAndOutputs
|
8
|
-
|
9
|
-
# The input paths to read from.
|
10
|
-
#
|
11
|
-
# @return [String, Array] the +input+ setting, or an empty Array if it is unset
|
12
|
-
def input_paths
|
13
|
-
(settings[:input] || [])
|
14
|
-
end
|
15
|
-
|
16
|
-
# The output path to write to.
|
17
|
-
#
|
18
|
-
# @return [String]
|
19
|
-
def output_path
|
20
|
-
settings[:output]
|
21
|
-
end
|
22
|
-
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
|
@@ -1,15 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Hadoop
|
3
|
-
module DriverHelper
|
4
|
-
|
5
|
-
def driver *args
|
6
|
-
params = ::Wukong::Hadoop.configure(Configliere::Param.new)
|
7
|
-
params.resolve!
|
8
|
-
params.merge!(args.pop) if args.last.is_a?(Hash)
|
9
|
-
Wukong::Hadoop::Driver.new(params, *args)
|
10
|
-
end
|
11
|
-
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
@@ -1,39 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Hadoop
|
3
|
-
module IntegrationHelper
|
4
|
-
|
5
|
-
def root
|
6
|
-
@root ||= Pathname.new(File.expand_path('../../..', __FILE__))
|
7
|
-
end
|
8
|
-
|
9
|
-
def lib_dir *args
|
10
|
-
root.join('lib', *args)
|
11
|
-
end
|
12
|
-
|
13
|
-
def bin_dir *args
|
14
|
-
root.join('bin', *args)
|
15
|
-
end
|
16
|
-
|
17
|
-
def examples_dir *args
|
18
|
-
root.join('examples', *args)
|
19
|
-
end
|
20
|
-
|
21
|
-
def integration_env
|
22
|
-
{
|
23
|
-
"PATH" => [bin_dir.to_s, ENV["PATH"]].compact.join(':'),
|
24
|
-
"RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
|
25
|
-
}
|
26
|
-
end
|
27
|
-
|
28
|
-
def integration_cwd
|
29
|
-
root.to_s
|
30
|
-
end
|
31
|
-
|
32
|
-
def example_script *args
|
33
|
-
examples_dir(*args)
|
34
|
-
end
|
35
|
-
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|