wukong-hadoop 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +59 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/README.md +339 -0
- data/Rakefile +13 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-bzip +23 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-cp +3 -0
- data/bin/hdp-du +86 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-ls +11 -0
- data/bin/hdp-mkdir +2 -0
- data/bin/hdp-mkdirp +12 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +32 -0
- data/bin/hdp-sort +40 -0
- data/bin/hdp-stream +40 -0
- data/bin/hdp-stream-flat +22 -0
- data/bin/hdp-stream2 +39 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/wu-hadoop +14 -0
- data/examples/counter.rb +17 -0
- data/examples/map_only.rb +28 -0
- data/examples/processors.rb +4 -0
- data/examples/sonnet_18.txt +14 -0
- data/examples/tokenizer.rb +28 -0
- data/examples/word_count.rb +44 -0
- data/features/step_definitions/wu_hadoop_steps.rb +4 -0
- data/features/support/env.rb +1 -0
- data/features/wu_hadoop.feature +113 -0
- data/lib/wukong-hadoop.rb +21 -0
- data/lib/wukong-hadoop/configuration.rb +133 -0
- data/lib/wukong-hadoop/driver.rb +190 -0
- data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
- data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
- data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
- data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
- data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
- data/lib/wukong-hadoop/extensions.rb +2 -0
- data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
- data/lib/wukong-hadoop/version.rb +6 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +39 -0
- data/spec/wukong-hadoop/driver_spec.rb +117 -0
- data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
- data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
- data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
- data/wukong-hadoop.gemspec +33 -0
- metadata +168 -0
@@ -0,0 +1,190 @@
|
|
1
|
+
require 'shellwords'
require_relative("driver/inputs_and_outputs")
require_relative("driver/map_logic")
require_relative("driver/reduce_logic")
require_relative("driver/local_invocation")
require_relative("driver/hadoop_invocation")

module Wukong
  module Hadoop

    # The <tt>Hadoop::Driver</tt> class contains the logic to examine
    # arguments and construct command lines which it will execute to
    # create the desired behavior.
    #
    # The Hadoop::Driver will introspect on its arguments to guess (if
    # not given) the processors to use as mapper and reducer in a
    # map/reduce job.  It will also decide whether to run that job in
    # local or Hadoop mode.  These decisions result in a command which
    # it will ultimately execute.
    class Driver < Wukong::Driver

      include InputsAndOutputs
      include MapLogic
      include ReduceLogic
      include HadoopInvocation
      include LocalInvocation

      # The settings used by this driver.
      #
      # @return [Configliere::Param]
      attr_accessor :settings

      # The (processed) arguments for this driver.
      #
      # @return [Array<String, Pathname>]
      attr_reader :args

      # Initialize and run a new Wukong::Hadoop::Driver for the given
      # +settings+.
      #
      # Will rescue all Wukong::Error exceptions by printing a nice
      # message to STDERR and exiting.
      #
      # @param [Configliere::Param] settings
      # @param [Array<String>] extra_args
      def self.run(settings, *extra_args)
        begin
          new(settings, *extra_args).run!
        rescue Wukong::Error => e
          $stderr.puts e.message
          exit(127)
        end
      end

      # Run this driver.
      #
      # Consults +mode+ to decide whether to simulate the job on the
      # local command line or launch it on a Hadoop cluster.
      def run!
        if mode == :local
          # Log.info "Launching local!"
          execute_command!(local_commandline)
        else
          ensure_input_and_output!
          remove_output_path! if settings[:rm] || settings[:overwrite]
          Log.info "Launching Hadoop!"
          execute_command!(hadoop_commandline)
        end
      end

      # Initialize a new driver with the given +settings+ and +args+.
      #
      # @param [Configliere::Param] settings
      # @param [Array<String>] args
      def initialize(settings, *args)
        @settings = settings
        self.args = args
      end

      # Set the +args+ for this driver.
      #
      # Arguments can be either (registered) processor names or files.
      #
      # An error will be raised on missing files or those which
      # couldn't be loaded.
      #
      # An error will be raised if more than two arguments (mapper and
      # reducer) are passed.
      #
      # @param [Array<String>] args
      def args= args
        raise Error.new("Cannot provide more than two arguments") if args.length > 2
        @args = args.map do |arg|
          if processor_registered?(arg)
            arg
          else
            begin
              rp = Pathname.new(arg).realpath
              load rp
              rp
            rescue => e
              # Include the underlying cause: the file may exist but
              # still fail to load (e.g. a syntax error inside it).
              raise Error.new("No such processor or file: #{arg} (#{e.message})")
            end
          end
        end
      end

      # What mode is this driver in?
      #
      # @return [:hadoop, :local]
      def mode
        settings[:mode].to_s == 'local' ? :local : :hadoop
      end

      # Were mapper and/or reducer named by a single argument?
      #
      # @return [true, false]
      def single_job_arg?
        args.size == 1
      end

      # Were mapper and/or reducer named by separate arguments?
      #
      # @return [true, false]
      def separate_map_and_reduce_args?
        args.size == 2
      end

      # Is there a processor registered with the given +name+?
      #
      # @param [#to_s] name
      # @return [true, false]
      def processor_registered? name
        Wukong.registry.registered?(name.to_s.to_sym)
      end

      # Return the guessed name of a processor at the given +path+.
      #
      # @param [String] path
      # @return [String]
      def processor_name_from_file(path)
        File.basename(path, '.rb')
      end

      # Does the given +path+ contain a processor named after itself?
      #
      # @param [String] path
      # @return [true, false]
      def file_is_processor?(path)
        processor_registered?(processor_name_from_file(path))
      end

      # The prefix to insert before all invocations of the
      # <tt>wu-local</tt> runner.
      #
      # @return [String]
      def command_prefix
        settings[:command_prefix]
      end

      # Returns parameters to pass to an invocation of
      # <tt>wu-local</tt>.
      #
      # Parameters like <tt>--reduce_tasks</tt> which are relevant to
      # Wukong-Hadoop will be interpreted and *not* passed.  Others
      # will be passed unmodified.
      #
      # @return [String]
      def params_to_pass
        s = (Wukong.loaded_deploy_pack? ? Deploy.pre_deploy_settings : settings)
        s.reject{ |param, val| s.definition_of(param, :wukong_hadoop) }.map{ |param,val| "--#{param}=#{Shellwords.escape(val.to_s)}" }.join(" ")
      end

      # Execute a command composed of the given parts.
      #
      # Will print the command instead if the <tt>--dry_run</tt>
      # option was given.
      #
      # @param [Array<String>] args
      def execute_command!(*args)
        command = args.flatten.reject(&:blank?).join(" \\\n ")
        if settings[:dry_run]
          Log.info("Dry run:")
          puts command
        else
          puts `#{command}`
          raise "Streaming command failed!" unless $?.success?
        end
      end

    end
  end
end
|
@@ -0,0 +1,184 @@
|
|
1
|
+
module Wukong
  module Hadoop

    # Provides methods for executing a map/reduce job on a Hadoop
    # cluster via {Hadoop
    # streaming}[http://hadoop.apache.org/docs/r0.15.2/streaming.html].
    module HadoopInvocation

      # Raise an error unless we have input and output.
      def ensure_input_and_output!
        raise Error.new("Explicit --input and --output paths are required to run a job in Hadoop mode.") if input_paths.nil? || input_paths.empty? || output_path.nil? || output_path.empty?
      end

      # Remove the output path.
      #
      # Will not actually do anything if the <tt>--dry_run</tt> option
      # is also given.
      def remove_output_path!
        # Use the hadoop_runner method (not the raw setting) so the
        # --hadoop_home fallback applies here too.
        cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
        Log.info "Removing output file #{output_path}: #{cmd}"
        puts `#{cmd}` unless settings[:dry_run]
      end

      # Return the Hadoop command used to launch this job in a Hadoop
      # cluster.
      #
      # You should be able to copy, paste, and run this command
      # unmodified when debugging.
      #
      # @return [String]
      def hadoop_commandline
        [
          hadoop_runner,
          "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
          hadoop_jobconf_options,
          "-D mapred.job.name='#{job_name}'",
          hadoop_other_args,
          "-mapper  '#{mapper_commandline}'",
          "-reducer '#{reducer_commandline}'",
          "-input   '#{input_paths}'",
          "-output  '#{output_path}'",
          hadoop_files,
          io_formats,
          hadoop_recycle_env,
        ].flatten.compact.join(" \t\\\n ")
      end

      # The job name that will be passed to Hadoop.
      #
      # Respects the <tt>--job_name</tt> option if given, otherwise
      # constructs one from the given processors, input, and output
      # paths.
      #
      # @return [String]
      def job_name
        return settings[:job_name] if settings[:job_name]
        relevant_filename = args.compact.uniq.map { |path| File.basename(path, '.rb') }.join('-')
        "#{relevant_filename}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
      end

      # The input format to use.
      #
      # Respects the value of <tt>--input_format</tt>.
      #
      # @return [String]
      def input_format
        settings[:input_format]
      end

      # The output format to use.
      #
      # Respects the value of <tt>--output_format</tt>.
      #
      # @return [String]
      def output_format
        settings[:output_format]
      end

      # :nodoc:
      def io_formats
        input  = "-inputformat  '#{input_format}'"  if input_format
        output = "-outputformat '#{output_format}'" if output_format
        [input, output]
      end

      # The name of the Hadoop binary to use.
      #
      # Respects the value of <tt>--hadoop_runner</tt> if given.
      #
      # @return [String]
      def hadoop_runner
        settings[:hadoop_runner] || File.join(settings[:hadoop_home], 'bin/hadoop')
      end

      # Return an array of jobconf (-D) options that will be passed to Hadoop.
      #
      # Translates the "friendly" <tt>wu-hadoop</tt> names into the
      # less-friendly Hadoop names.
      #
      # @return [Array<String>]
      def hadoop_jobconf_options
        jobconf_options = []
        settings[:reuse_jvms]          = '-1'    if (settings[:reuse_jvms] == true)
        settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
        # If no reducer and no reduce_command, then skip the reduce phase
        settings[:reduce_tasks]        = 0 unless (reduce? || settings[:reduce_tasks].nil?)
        # Fields hadoop should use to distribute records to reducers
        unless settings[:partition_fields].blank?
          jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
        end
        jobconf_options += [
          :io_sort_mb,               :io_sort_record_percent,
          :map_speculative,          :map_tasks,
          :max_maps_per_cluster,     :max_maps_per_node,
          :max_node_map_tasks,       :max_node_reduce_tasks,
          :max_reduces_per_cluster,  :max_reduces_per_node,
          :max_record_length,        :min_split_size,
          :output_field_separator,   :key_field_separator,
          :partition_fields,         :sort_fields,
          :reduce_tasks,             :respect_exit_status,
          :reuse_jvms,               :timeout,
          :max_tracker_failures,     :max_map_attempts,
          :max_reduce_attempts
        ].map do |opt|
          # The :description field of each option's definition holds
          # the raw Hadoop property name (e.g. mapred.map.tasks).
          defn = settings.definition_of(opt, :description)
          val  = settings[opt]
          java_opt(defn, val)
        end
        jobconf_options.flatten.compact
      end

      # Returns other arguments used by Hadoop streaming.
      #
      # @return [String]
      def hadoop_other_args
        extra_str_args = parsed_java_opts
        if settings[:split_on_xml_tag]
          # Was `options.split_on_xml_tag` -- no `options` method exists
          # in this module; every other accessor here reads `settings`.
          extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{settings[:split_on_xml_tag]}>,end=</#{settings[:split_on_xml_tag]}>'}
        end
        extra_str_args << ' -lazyOutput' if settings[:noempty]  # don't create reduce file if no records
        extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless settings[:partition_fields].blank?
        extra_str_args
      end

      # :nodoc:
      #
      # http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
      def hadoop_files
        args.find_all { |arg| arg.to_s =~ /\.rb$/ }.map do |arg|
          "-file '#{arg}'"
        end
      end

      # :nodoc:
      def ruby_interpreter_path
        # RbConfig replaces the deprecated ::Config (removed in Ruby 2.2).
        Pathname.new(File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['RUBY_INSTALL_NAME'] + RbConfig::CONFIG['EXEEXT'])).realpath
      end

      # :nodoc:
      def use_alternative_gemfile
        ENV['BUNDLE_GEMFILE'] = settings[:gemfile]
      end

      # :nodoc:
      def hadoop_recycle_env
        use_alternative_gemfile if settings[:gemfile]
        %w[BUNDLE_GEMFILE].map{ |var| %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var] }.compact
      end

      # :nodoc:
      def parsed_java_opts
        settings[:java_opts].map do |java_opt|
          java_opt.split('-D').reject{ |opt| opt.blank? }.map{ |opt| '-D ' + opt.strip }
        end.flatten
      end

      # :nodoc:
      def java_opt option, value
        "-D %s=%s" % [option, Shellwords.escape(value.to_s)] if value
      end

    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Wukong
  module Hadoop

    # Provides methods for determining input and output paths.
    # Written as a separate module to allow easy overriding from other
    # plugins.
    module InputsAndOutputs

      # The input paths to read from (the --input setting).
      #
      # Falls back to an empty collection when no input was given.
      #
      # @return [String, Array]
      def input_paths
        settings[:input] || []
      end

      # The output path to write to (the --output setting).
      #
      # @return [String, nil]
      def output_path
        settings[:output]
      end

    end
  end
end
|
26
|
+
|
27
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'shellwords'
module Wukong
  module Hadoop

    # Provides methods for executing a map/reduce job locally on the
    # command-line.
    module LocalInvocation

      # Returns the full local command used by Wukong-Hadoop when
      # simulating a map/reduce job on the command-line.
      #
      # You should be able to run this command directly to simulate
      # the job yourself.
      #
      # @return [String]
      def local_commandline
        [
          [cat_input, mapper_commandline].tap do |pipeline|
            pipeline.concat([sort_commandline, reducer_commandline]) if reduce?
          end.flatten.compact.join(' | '),
          cat_output
        ].flatten.compact.join(' ')
      end

      # Returns the sort command used by Wukong-Hadoop when simulating
      # a map/reduce job on the command-line.
      #
      # @return [String]
      def sort_commandline
        settings[:sort_command]
      end

      # :nodoc:
      def cat_input
        return unless input_paths && (!input_paths.empty?)
        paths = Shellwords.join(input_paths.split(','))
        "cat #{paths}"
      end

      # :nodoc:
      def cat_output
        return unless output_path
        # Escape the path for the shell, matching the treatment the
        # input paths receive above; an unescaped path with spaces or
        # metacharacters would corrupt the generated command.
        "> #{Shellwords.escape(output_path)}"
      end

    end
  end
end
|