wukong-hadoop 0.0.1
- data/.gitignore +59 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/README.md +339 -0
- data/Rakefile +13 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-bzip +23 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-cp +3 -0
- data/bin/hdp-du +86 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-ls +11 -0
- data/bin/hdp-mkdir +2 -0
- data/bin/hdp-mkdirp +12 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +32 -0
- data/bin/hdp-sort +40 -0
- data/bin/hdp-stream +40 -0
- data/bin/hdp-stream-flat +22 -0
- data/bin/hdp-stream2 +39 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/wu-hadoop +14 -0
- data/examples/counter.rb +17 -0
- data/examples/map_only.rb +28 -0
- data/examples/processors.rb +4 -0
- data/examples/sonnet_18.txt +14 -0
- data/examples/tokenizer.rb +28 -0
- data/examples/word_count.rb +44 -0
- data/features/step_definitions/wu_hadoop_steps.rb +4 -0
- data/features/support/env.rb +1 -0
- data/features/wu_hadoop.feature +113 -0
- data/lib/wukong-hadoop.rb +21 -0
- data/lib/wukong-hadoop/configuration.rb +133 -0
- data/lib/wukong-hadoop/driver.rb +190 -0
- data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
- data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
- data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
- data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
- data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
- data/lib/wukong-hadoop/extensions.rb +2 -0
- data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
- data/lib/wukong-hadoop/version.rb +6 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +39 -0
- data/spec/wukong-hadoop/driver_spec.rb +117 -0
- data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
- data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
- data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
- data/wukong-hadoop.gemspec +33 -0
- metadata +168 -0
--- /dev/null
+++ data/lib/wukong-hadoop/driver.rb
@@ -0,0 +1,190 @@
+require 'shellwords'
+require_relative("driver/inputs_and_outputs")
+require_relative("driver/map_logic")
+require_relative("driver/reduce_logic")
+require_relative("driver/local_invocation")
+require_relative("driver/hadoop_invocation")
+
+module Wukong
+  module Hadoop
+
+    # The <tt>Hadoop::Driver</tt> class contains the logic to examine
+    # arguments and construct command lines which it will execute to
+    # create the desired behavior.
+    #
+    # The Hadoop::Driver will introspect on its arguments to guess (if
+    # not given) the processors to use as mapper and reducer in a
+    # map/reduce job.  It will also decide whether to run that job in
+    # local or Hadoop mode.  These decisions result in a command which
+    # it will ultimately execute.
+    class Driver < Wukong::Driver
+
+      include InputsAndOutputs
+      include MapLogic
+      include ReduceLogic
+      include HadoopInvocation
+      include LocalInvocation
+
+      # The settings used by this driver.
+      #
+      # @return [Configliere::Param]
+      attr_accessor :settings
+
+      # The (processed) arguments for this driver.
+      #
+      # @return [Array<String, Pathname>]
+      attr_reader :args
+
+      # Initialize and run a new Wukong::Hadoop::Driver for the given
+      # +settings+.
+      #
+      # Will rescue all Wukong::Error exceptions by printing a nice
+      # message to STDERR and exiting.
+      #
+      # @param [Configliere::Param] settings
+      # @param [Array<String>] extra_args
+      def self.run(settings, *extra_args)
+        begin
+          new(settings, *extra_args).run!
+        rescue Wukong::Error => e
+          $stderr.puts e.message
+          exit(127)
+        end
+      end
+
+      # Run this driver.
+      def run!
+        if mode == :local
+          # Log.info "Launching local!"
+          execute_command!(local_commandline)
+        else
+          ensure_input_and_output!
+          remove_output_path! if settings[:rm] || settings[:overwrite]
+          Log.info "Launching Hadoop!"
+          execute_command!(hadoop_commandline)
+        end
+      end
+
+      # Initialize a new driver with the given +settings+ and +args+.
+      #
+      # @param [Configliere::Param] settings
+      # @param [Array<String>] args
+      def initialize(settings, *args)
+        @settings = settings
+        self.args = args
+      end
+
+      # Set the +args+ for this driver.
+      #
+      # Arguments can be either (registered) processor names or files.
+      #
+      # An error will be raised on missing files or those which
+      # couldn't be loaded.
+      #
+      # An error will be raised if more than two arguments (mapper and
+      # reducer) are passed.
+      #
+      # @param [Array<String>] args
+      def args= args
+        raise Error.new("Cannot provide more than two arguments") if args.length > 2
+        @args = args.map do |arg|
+          if processor_registered?(arg)
+            arg
+          else
+            begin
+              rp = Pathname.new(arg).realpath
+              load rp
+              rp
+            rescue => e
+              raise Error.new("No such processor or file: #{arg}")
+            end
+          end
+        end
+      end
+
+      # What mode is this driver in?
+      #
+      # @return [:hadoop, :local]
+      def mode
+        settings[:mode].to_s == 'local' ? :local : :hadoop
+      end
+
+      # Were mapper and/or reducer named by a single argument?
+      #
+      # @return [true, false]
+      def single_job_arg?
+        args.size == 1
+      end
+
+      # Were mapper and/or reducer named by separate arguments?
+      #
+      # @return [true, false]
+      def separate_map_and_reduce_args?
+        args.size == 2
+      end
+
+      # Is there a processor registered with the given +name+?
+      #
+      # @param [#to_s] name
+      # @return [true, false]
+      def processor_registered? name
+        Wukong.registry.registered?(name.to_s.to_sym)
+      end
+
+      # Return the guessed name of a processor at the given +path+.
+      #
+      # @param [String] path
+      # @return [String]
+      def processor_name_from_file(path)
+        File.basename(path, '.rb')
+      end
+
+      # Does the given +path+ contain a processor named after itself?
+      #
+      # @param [String] path
+      # @return [true, false]
+      def file_is_processor?(path)
+        processor_registered?(processor_name_from_file(path))
+      end
+
+      # The prefix to insert before all invocations of the
+      # <tt>wu-local</tt> runner.
+      #
+      # @return [String]
+      def command_prefix
+        settings[:command_prefix]
+      end
+
+      # Returns parameters to pass to an invocation of
+      # <tt>wu-local</tt>.
+      #
+      # Parameters like <tt>--reduce_tasks</tt> which are relevant to
+      # Wukong-Hadoop will be interpreted and *not* passed.  Others
+      # will be passed unmodified.
+      #
+      # @return [String]
+      def params_to_pass
+        s = (Wukong.loaded_deploy_pack? ? Deploy.pre_deploy_settings : settings)
+        s.reject{ |param, val| s.definition_of(param, :wukong_hadoop) }.map{ |param, val| "--#{param}=#{Shellwords.escape(val.to_s)}" }.join(" ")
+      end
+
+      # Execute a command composed of the given parts.
+      #
+      # Will print the command instead of executing it if the
+      # <tt>--dry_run</tt> option was given.
+      #
+      # @param [Array<String>] args
+      def execute_command!(*args)
+        command = args.flatten.reject(&:blank?).join(" \\\n    ")
+        if settings[:dry_run]
+          Log.info("Dry run:")
+          puts command
+        else
+          puts `#{command}`
+          raise "Streaming command failed!" unless $?.success?
+        end
+      end
+
+    end
+  end
+end
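
The driver's public entry point is `Driver.run(settings, *args)`: each argument is resolved to a registered processor or loaded from a file, then `run!` dispatches on `mode`. A minimal sketch of driving it programmatically, assuming a `Configliere::Param` settings object (from the configliere gem wukong depends on); the input, output, and processor file below are hypothetical stand-ins:

    require 'wukong-hadoop'

    settings = Configliere::Param.new
    settings[:mode]   = 'local'                   # anything other than 'local' selects :hadoop
    settings[:input]  = 'examples/sonnet_18.txt'  # hypothetical input path
    settings[:output] = '/tmp/word_counts.tsv'    # hypothetical output path

    # A single argument names either a registered processor or a file
    # defining one; the driver guesses the mapper and reducer from it.
    Wukong::Hadoop::Driver.run(settings, 'examples/word_count.rb')

Note the default direction of `mode`: unless `--mode=local` is given, the driver assumes Hadoop mode, which is why `run!` only validates input/output paths on that branch.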
--- /dev/null
+++ data/lib/wukong-hadoop/driver/hadoop_invocation.rb
@@ -0,0 +1,184 @@
+module Wukong
+  module Hadoop
+
+    # Provides methods for executing a map/reduce job on a Hadoop
+    # cluster via {Hadoop
+    # streaming}[http://hadoop.apache.org/docs/r0.15.2/streaming.html].
+    module HadoopInvocation
+
+      # Raise an error unless we have input and output.
+      def ensure_input_and_output!
+        raise Error.new("Explicit --input and --output paths are required to run a job in Hadoop mode.") if input_paths.nil? || input_paths.empty? || output_path.nil? || output_path.empty?
+      end
+
+      # Remove the output path.
+      #
+      # Will not actually do anything if the <tt>--dry_run</tt> option
+      # is also given.
+      def remove_output_path!
+        cmd = %Q{#{settings[:hadoop_runner]} fs -rmr '#{output_path}'}
+        Log.info "Removing output file #{output_path}: #{cmd}"
+        puts `#{cmd}` unless settings[:dry_run]
+      end
+
+      # Return the Hadoop command used to launch this job in a Hadoop
+      # cluster.
+      #
+      # You should be able to copy, paste, and run this command
+      # unmodified when debugging.
+      #
+      # @return [String]
+      def hadoop_commandline
+        [
+          hadoop_runner,
+          "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+          hadoop_jobconf_options,
+          "-D mapred.job.name='#{job_name}'",
+          hadoop_other_args,
+          "-mapper '#{mapper_commandline}'",
+          "-reducer '#{reducer_commandline}'",
+          "-input '#{input_paths}'",
+          "-output '#{output_path}'",
+          hadoop_files,
+          io_formats,
+          hadoop_recycle_env,
+        ].flatten.compact.join(" \t\\\n  ")
+      end
+
+      # The job name that will be passed to Hadoop.
+      #
+      # Respects the <tt>--job_name</tt> option if given, otherwise
+      # constructs one from the given processors, input, and output
+      # paths.
+      #
+      # @return [String]
+      def job_name
+        return settings[:job_name] if settings[:job_name]
+        relevant_filename = args.compact.uniq.map { |path| File.basename(path, '.rb') }.join('-')
+        "#{relevant_filename}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
+      end
+
+      # The input format to use.
+      #
+      # Respects the value of <tt>--input_format</tt>.
+      #
+      # @return [String]
+      def input_format
+        settings[:input_format]
+      end
+
+      # The output format to use.
+      #
+      # Respects the value of <tt>--output_format</tt>.
+      #
+      # @return [String]
+      def output_format
+        settings[:output_format]
+      end
+
+      # :nodoc:
+      def io_formats
+        input  = "-inputformat '#{input_format}'"   if input_format
+        output = "-outputformat '#{output_format}'" if output_format
+        [input, output]
+      end
+
+      # The name of the Hadoop binary to use.
+      #
+      # Respects the value of <tt>--hadoop_runner</tt> if given.
+      #
+      # @return [String]
+      def hadoop_runner
+        settings[:hadoop_runner] || File.join(settings[:hadoop_home], 'bin/hadoop')
+      end
+
+      # Return an array of jobconf (-D) options that will be passed to Hadoop.
+      #
+      # Translates the "friendly" <tt>wu-hadoop</tt> names into the
+      # less-friendly Hadoop names.
+      #
+      # @return [Array<String>]
+      def hadoop_jobconf_options
+        jobconf_options = []
+        settings[:reuse_jvms]          = '-1'    if (settings[:reuse_jvms] == true)
+        settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
+        # If no reducer and no reduce_command, then skip the reduce phase
+        settings[:reduce_tasks] = 0 unless (reduce? || settings[:reduce_tasks].nil?)
+        # Fields hadoop should use to distribute records to reducers
+        unless settings[:partition_fields].blank?
+          jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
+        end
+        jobconf_options += [
+          :io_sort_mb, :io_sort_record_percent,
+          :map_speculative, :map_tasks,
+          :max_maps_per_cluster, :max_maps_per_node,
+          :max_node_map_tasks, :max_node_reduce_tasks,
+          :max_reduces_per_cluster, :max_reduces_per_node,
+          :max_record_length, :min_split_size,
+          :output_field_separator, :key_field_separator,
+          :partition_fields, :sort_fields,
+          :reduce_tasks, :respect_exit_status,
+          :reuse_jvms, :timeout,
+          :max_tracker_failures, :max_map_attempts,
+          :max_reduce_attempts
+        ].map do |opt|
+          defn = settings.definition_of(opt, :description)
+          val  = settings[opt]
+          java_opt(defn, val)
+        end
+        jobconf_options.flatten.compact
+      end
+
+      # Returns other arguments used by Hadoop streaming.
+      #
+      # @return [Array<String>]
+      def hadoop_other_args
+        extra_str_args = parsed_java_opts
+        if settings[:split_on_xml_tag]
+          extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{settings[:split_on_xml_tag]}>,end=</#{settings[:split_on_xml_tag]}>'}
+        end
+        extra_str_args << ' -lazyOutput' if settings[:noempty] # don't create reduce file if no records
+        extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless settings[:partition_fields].blank?
+        extra_str_args
+      end
+
+      # :nodoc:
+      #
+      # http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
+      def hadoop_files
+        args.find_all { |arg| arg.to_s =~ /\.rb$/ }.map do |arg|
+          "-file '#{arg}'"
+        end
+      end
+
+      # :nodoc:
+      def ruby_interpreter_path
+        Pathname.new(File.join(Config::CONFIG['bindir'], Config::CONFIG['RUBY_INSTALL_NAME'] + Config::CONFIG['EXEEXT'])).realpath
+      end
+
+      # :nodoc:
+      def use_alternative_gemfile
+        ENV['BUNDLE_GEMFILE'] = settings[:gemfile]
+      end
+
+      # :nodoc:
+      def hadoop_recycle_env
+        use_alternative_gemfile if settings[:gemfile]
+        %w[BUNDLE_GEMFILE].map{ |var| %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var] }.compact
+      end
+
+      # :nodoc:
+      def parsed_java_opts
+        settings[:java_opts].map do |java_opt|
+          java_opt.split('-D').reject{ |opt| opt.blank? }.map{ |opt| '-D ' + opt.strip }
+        end.flatten
+      end
+
+      # :nodoc:
+      def java_opt option, value
+        "-D %s=%s" % [option, Shellwords.escape(value.to_s)] if value
+      end
+
+    end
+  end
+end
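
`hadoop_commandline` is assembled to be copy-paste runnable, which is easiest to inspect with `--dry_run`, since the driver then prints the command instead of executing it. A hedged illustration of the shape (the exact mapper and reducer strings come from `MapLogic`/`ReduceLogic`, which are not part of this hunk, so everything after `puts` is an assumption, not captured output):

    driver = Wukong::Hadoop::Driver.new(settings, 'examples/word_count.rb')
    puts driver.hadoop_commandline
    # Shaped roughly like (illustrative only; flags depend on your settings
    # and Hadoop install):
    #
    #   /usr/lib/hadoop/bin/hadoop \
    #     jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
    #     -D mapred.job.name='word_count---in.tsv---out' \
    #     -mapper '<mapper command>' \
    #     -reducer '<reducer command>' \
    #     -input 'in.tsv' \
    #     -output 'out' \
    #     -file '/absolute/path/to/word_count.rb'

The `-file` options are what make the pasted command portable: each `.rb` argument is shipped with the job submission per the streaming docs linked above `hadoop_files`.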
--- /dev/null
+++ data/lib/wukong-hadoop/driver/inputs_and_outputs.rb
@@ -0,0 +1,27 @@
+module Wukong
+  module Hadoop
+
+    # Provides methods for determining input and output paths.
+    # Written as a separate module to allow easy overriding from other
+    # plugins.
+    module InputsAndOutputs
+
+      # The input paths to read from.
+      #
+      # @return [String]
+      def input_paths
+        (settings[:input] || [])
+      end
+
+      # The output path to write to.
+      #
+      # @return [String]
+      def output_path
+        settings[:output]
+      end
+
+    end
+  end
+end
+
+
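
Because path handling is isolated in this small module, a plugin can change it without touching the rest of the driver. A hypothetical sketch of that override point (the s3://-to-s3n:// rewrite is invented purely for illustration):

    module Wukong
      module Hadoop
        class Driver
          # Hypothetical override: defined directly on the class, so method
          # lookup finds it ahead of the included InputsAndOutputs module,
          # and `super` still reaches the module's version.
          def input_paths
            paths = super
            return paths if paths.empty?   # fallback [] and "" both pass through
            paths.split(',').map { |p| p.sub(%r{\As3://}, 's3n://') }.join(',')
          end
        end
      end
    end

Note the asymmetry in the original: when `--input` is given it arrives as a comma-joined string (see `cat_input` below, which splits on commas), while the fallback is an empty Array, so callers only rely on `#empty?` behaving the same on both.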
--- /dev/null
+++ data/lib/wukong-hadoop/driver/local_invocation.rb
@@ -0,0 +1,48 @@
+require 'shellwords'
+module Wukong
+  module Hadoop
+
+    # Provides methods for executing a map/reduce job locally on the
+    # command-line.
+    module LocalInvocation
+
+      # Returns the full local command used by Wukong-Hadoop when
+      # simulating a map/reduce job on the command-line.
+      #
+      # You should be able to run this command directly to simulate
+      # the job yourself.
+      #
+      # @return [String]
+      def local_commandline
+        [
+          [cat_input, mapper_commandline].tap do |pipeline|
+            pipeline.concat([sort_commandline, reducer_commandline]) if reduce?
+          end.flatten.compact.join(' | '),
+          cat_output
+        ].flatten.compact.join(' ')
+      end
+
+      # Returns the sort command used by Wukong-Hadoop when simulating
+      # a map/reduce job on the command-line.
+      #
+      # @return [String]
+      def sort_commandline
+        settings[:sort_command]
+      end
+
+      # :nodoc:
+      def cat_input
+        return unless input_paths && (!input_paths.empty?)
+        paths = Shellwords.join(input_paths.split(','))
+        "cat #{paths}"
+      end
+
+      # :nodoc:
+      def cat_output
+        return unless output_path
+        "> #{output_path}"
+      end
+
+    end
+  end
+end
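
In local mode the job degenerates to a Unix pipeline: `cat` over the inputs, the mapper, then (only when `reduce?` is true) the sort command and the reducer, with an optional `>` redirect at the end. A hedged illustration (as above, the mapper/reducer strings come from `MapLogic`/`ReduceLogic` outside this hunk, so the commented shape is an assumption):

    driver = Wukong::Hadoop::Driver.new(settings, 'examples/word_count.rb')
    puts driver.local_commandline
    # Shaped roughly like (illustrative only):
    #
    #   cat examples/sonnet_18.txt | <mapper command> | sort | <reducer command> > /tmp/word_counts.tsv

Since `sort_commandline` is just `settings[:sort_command]`, the simulation's grouping semantics are whatever your local `sort` provides, not Hadoop's key-field partitioning, which is worth remembering when local and Hadoop runs disagree.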