wukong 1.4.12 → 1.5.0
- data/CHANGELOG.textile +20 -1
- data/bin/bootstrap.sh +32 -0
- data/bin/hdp-sort +3 -0
- data/bin/hdp-stream +3 -0
- data/docpages/README-elastic_map_reduce.textile +377 -0
- data/docpages/avro/avro_notes.textile +56 -0
- data/docpages/avro/tethering.textile +19 -0
- data/docpages/pig/commandline_params.txt +26 -0
- data/examples/emr/elastic_mapreduce_example.rb +35 -0
- data/lib/wukong/logger.rb +8 -1
- data/lib/wukong/script/avro_command.rb +5 -0
- data/lib/wukong/script/emr_command.rb +119 -0
- data/lib/wukong/script/hadoop_command.rb +72 -90
- data/lib/wukong/script/local_command.rb +18 -8
- data/lib/wukong/script.rb +87 -92
- data/wukong.gemspec +27 -18
- metadata +30 -21
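
The headline change in 1.5.0 is a third run mode, Elastic MapReduce, alongside the existing local and hadoop modes. For orientation, here is a minimal sketch of a script that exercises all three; it is loosely modeled on the gem's word_count example, the file and class names are hypothetical, and the streamer base classes are assumed to behave as in 1.4.x:

    #!/usr/bin/env ruby
    require 'rubygems'
    require 'wukong'

    module WordCount
      # Mapper: reads raw lines, emits [word, 1] pairs
      class Mapper < Wukong::Streamer::LineStreamer
        def process line
          line.split(/\W+/).reject(&:empty?).each{|word| yield [word, 1] }
        end
      end
      # Reducer: receives all pairs for a word, emits [word, count]
      class Reducer < Wukong::Streamer::ListReducer
        def finalize
          yield [ key, values.size ]
        end
      end
    end

    # Pick a run mode on the command line, e.g.
    #   ./word_count.rb --run=local input.tsv      output.tsv
    #   ./word_count.rb --run       hdfs_in        hdfs_out         # hadoop streaming, the default
    #   ./word_count.rb --run=emr   s3://bkt/in    s3://bkt/out     # the new EMR mode
    Wukong::Script.new(WordCount::Mapper, WordCount::Reducer).run
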
data/lib/wukong/script/emr_command.rb
ADDED
@@ -0,0 +1,119 @@
+require 'right_aws'
+require 'configliere/config_block'
+Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
+Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
+Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
+Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
+Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) }
+Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') }
+Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
+Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
+Settings.define :jobflow
+module Wukong
+  #
+  # EMR Options
+  #
+  module EmrCommand
+
+    def execute_emr_workflow
+      copy_script_to_cloud
+      execute_emr_runner
+    end
+
+    def copy_script_to_cloud
+      Log.info " Copying this script to the cloud."
+      S3Util.store(this_script_filename, mapper_s3_uri)
+      S3Util.store(this_script_filename, reducer_s3_uri)
+      S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+      S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
+      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
+    end
+
+    def execute_emr_runner
+      command_args = [
+        :hadoop_version, :availability_zone, :key_pair, :key_pair_file,
+      ].map{|args| Settings.dashed_flag_for(*args) }
+      command_args += [
+        %Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} },
+        "--stream",
+        "--mapper=#{mapper_s3_uri}",
+        "--reducer=#{reducer_s3_uri}",
+        "--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
+        #"--enable-debugging --log-uri=#{log_s3_uri}",
+        "--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
+        "--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
+        "--bootstrap-action=#{bootstrap_s3_uri}",
+      ]
+      if Settings.jobflow
+        command_args << "--jobflow=#{Settings[:jobflow]}"
+      else
+        command_args << '--alive --create'
+        command_args << "--name=#{job_name}"
+        command_args += [ [:instance_type, :slave_instance_type], :master_instance_type, :num_instances, ].map{|args| Settings.dashed_flag_for(*args) }
+      end
+      execute_command!( File.expand_path(Settings.emr_runner), *command_args )
+    end
+
+    # A short name for this job
+    def job_handle
+      File.basename($0,'.rb')
+    end
+
+    def mapper_s3_uri
+      s3_path(job_handle+'-mapper.rb')
+    end
+    def reducer_s3_uri
+      s3_path(job_handle+'-reducer.rb')
+    end
+    def log_s3_uri
+      s3_path('log', job_handle)
+    end
+    def bootstrap_s3_uri
+      s3_path('bin', "bootstrap-#{job_handle}.sh")
+    end
+    def wukong_libs_s3_uri
+      s3_path('bin', "wukong-libs.tar.bz2")
+    end
+
+    def s3_path *path_segs
+      File.join(Settings.emr_root, path_segs.flatten.compact)
+    end
+
+    module ClassMethods
+
+      # Standard hack to create ClassMethods-on-include
+      def self.included base
+        base.class_eval do
+          extend ClassMethods
+        end
+      end
+    end
+
+    class S3Util
+      # class methods
+      class << self
+        def s3
+          @s3 ||= RightAws::S3Interface.new(
+            Settings.access_key, Settings.secret_access_key,
+            {:multi_thread => true, :logger => Log})
+        end
+
+        def bucket_and_path_from_uri uri
+          uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
+        end
+
+        def store filename, uri
+          Log.debug " #{filename} => #{uri}"
+          dest_bucket, dest_key = bucket_and_path_from_uri(uri)
+          contents = File.open(filename)
+          s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
+        end
+
+      end
+    end
+  end
+  Script.class_eval do
+    include EmrCommand
+  end
+end
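
The new EmrCommand pulls its defaults from ~/.wukong/emr.yaml (the Settings.read call at the top of the file). A hypothetical minimal file covering the settings defined there, with placeholder values throughout, might look like:

    # ~/.wukong/emr.yaml -- every value below is a placeholder
    access_key:        XXXXXXXXXXXXXXXXXXXX
    secret_access_key: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
    emr_runner:        ~/elastic-mapreduce/elastic-mapreduce
    emr_root:          s3://my-wukong-bucket/emr
    key_pair_file:     ~/.ssh/my-emr-keypair.pem
    instance_type:     m1.small

The credentials can also come from the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables, per the :env_var bindings above.
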
data/lib/wukong/script/hadoop_command.rb
CHANGED
@@ -2,85 +2,108 @@
 module Wukong
   module HadoopCommand
 
-    # ===========================================================================
-    #
-    # Hadoop Environment
-    #
-
     # ===========================================================================
     #
     # Hadoop Options
     #
+    Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
+    Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true
 
     #
-    # Translate
+    # Translate simplified args to their hairy hadoop equivalents
     #
-    Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',
-    Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum',
-    Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks',
-    Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks',
-    Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields',
-    Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator',
-    Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition',
-    Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator',
-    Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',
-    Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout',
-    Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',
-    Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure',
-    Settings.define :
-    Settings.define :
+    Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
+    Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
+    Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
+    Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
+    Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
+    Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
+    Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
+    Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
+    Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
+    Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
+    Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
+    Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
+    Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
+    Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
+    Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
+    Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
+    Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
+    Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
+    Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
+    Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
     Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
-    Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
-    # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
-    Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
-    Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
-    Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
-    Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
 
-    #
-    #
-
-
-
-
+    #
+    # Assemble the hadoop command to execute
+    # and launch the hadoop runner to execute the script across all tasktrackers
+    #
+    def execute_hadoop_workflow
+      # If no reducer_klass and no reduce_command, then skip the reduce phase
+      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+      # Input paths join by ','
+      input_paths = @input_paths.join(',')
+      #
+      # Use Settings[:hadoop_home] to set the path your config install.
+      hadoop_commandline = [
+        hadoop_runner,
+        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+        hadoop_jobconf_options,
+        "-D mapred.job.name='#{job_name}'",
+        "-mapper '#{map_commandline}'",
+        "-reducer '#{reduce_commandline}'",
+        "-input '#{input_paths}'",
+        "-output '#{output_path}'",
+        hadoop_recycle_env,
+        hadoop_other_args(input_paths, output_path),
+      ].flatten.compact.join(" \t\\\n  ")
+      Log.info " Launching hadoop!"
+      execute_command!(hadoop_commandline)
     end
 
-
-
-
+    def hadoop_jobconf_options
+      jobconf_options = []
+      # The fields hadoop should treat as the keys
+      jobconf_options += [
        jobconf(:key_field_separator),
        jobconf(:sort_fields),
      ]
-
-
-      # Define what fields hadoop should use to distribute records to reducers
-    def hadoop_partition_args
+      # Fields hadoop should use to distribute records to reducers
      unless options[:partition_fields].blank?
-      [
+        jobconf_options += [
          '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
          jobconf(:output_field_separator),
          jobconf(:partition_fields),
        ]
      end
-
-
-    # Emit options for setting the number of mappers and reducers.
-    def hadoop_num_tasks_args
-      [
+      # Setting the number of mappers and reducers.
+      jobconf_options += [
        jobconf(:max_node_map_tasks),
        jobconf(:max_node_reduce_tasks),
+        jobconf(:max_reduces_per_node),
+        jobconf(:max_reduces_per_cluster),
+        jobconf(:max_maps_per_node),
+        jobconf(:max_maps_per_cluster),
        jobconf(:map_tasks),
        jobconf(:reduce_tasks)
      ]
+      jobconf_options.flatten.compact
    end
 
-
+    # emit a -jobconf hadoop option if the simplified command line arg is present
+    # if not, the resulting nil will be elided later
+    def jobconf option
+      if options[option]
+        "-D %s=%s" % [options.description_for(option), options[option]]
+      end
+    end
+
+    def hadoop_other_args input_paths, output_path
      extra_str_args  = [ options[:extra_args] ]
      extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
      options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
      options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-
-      extra_hsh_args = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
+      extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
      extra_str_args + extra_hsh_args
    end
 
@@ -95,29 +118,6 @@ module Wukong
      options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
    end
 
-    #
-    # Assemble the hadoop command to execute
-    #
-    def hadoop_command input_path, output_path
-      # If this is wrong, create a config/wukong-site.rb or
-      # otherwise set Settings[:hadoop_home] to the
-      # root of your config install.
-      [
-        hadoop_runner,
-        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
-        hadoop_partition_args,
-        hadoop_sort_args,
-        hadoop_num_tasks_args,
-        "-mapper '#{map_command}'",
-        "-reducer '#{reduce_command}'",
-        "-input '#{input_path}'",
-        "-output '#{output_path}'",
-        hadoop_recycle_env,
-        hadoop_other_args(input_path, output_path),
-      ].flatten.compact.join(" \t\\\n  ")
-    end
-
-
    module ClassMethods
      #
      # Via @pskomoroch via @tlipcon,
@@ -201,6 +201,7 @@ module Wukong
        ENV['stream_map_streamprocessor']
      end
    end
+
    # Standard ClassMethods-on-include trick
    def self.included base
      base.class_eval do
@@ -209,22 +210,3 @@ module Wukong
    end
  end
 end
-
-
-# -inputformat <name of inputformat (class)> (“auto” by default)
-# -input <additional DFS input path>
-# -python <python command to use on nodes> (“python” by default)
-# -name <job name> (“program.py” by default)
-# -numMapTasks <number>
-# -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
-# -priority <priority value> (“NORMAL” by default)
-# -libjar <path to jar> (this jar gets put in the class path)
-# -libegg <path to egg> (this egg gets put in the Python path)
-# -file <local file> (this file will be put in the dir where the python program gets executed)
-# -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
-# -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
-# -cmdenv <env var name>=<value>
-# -jobconf <property name>=<value>
-# -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
-# -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
-# -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
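
A consequence of the rewrite above is that any --option defined with :jobconf => true is turned into a -D property=value pair, with the hadoop property name carried in the setting's :description and nil results elided. A standalone sketch of that translation (not the gem's code; the names are illustrative):

    # Mimics the jobconf helper and the flatten/compact in hadoop_jobconf_options.
    DESCRIPTIONS = { :map_tasks => 'mapred.map.tasks', :timeout => 'mapred.task.timeout' }

    def jobconf_flag option, opts
      return unless opts[option]
      "-D %s=%s" % [DESCRIPTIONS[option], opts[option]]
    end

    opts = { :map_tasks => 10 }    # :timeout left unset, so it is dropped
    puts [:map_tasks, :timeout].map{|opt| jobconf_flag(opt, opts) }.compact
    # prints: -D mapred.map.tasks=10
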
data/lib/wukong/script/local_command.rb
CHANGED
@@ -1,21 +1,31 @@
 module Wukong
+  #
+  # Local execution Options
+  #
  module LocalCommand
 
-
-
-
-
+    def execute_local_workflow
+      Log.info " Reading STDIN / Writing STDOUT"
+      execute_command!(local_commandline)
+    end
 
    # program, including arg, to sort input between mapper and reducer in local
    # mode. You could override to for example run 'sort -n' (numeric sort).
-    def
+    def local_mode_sort_commandline
      'sort'
    end
 
-
-
+    #
+    # Commandline string to execute the job in local mode
+    #
+    # With an input path of '-', just uses $stdin
+    # With an output path of '-', just uses $stdout
+    #
+    def local_commandline
+      @input_paths = input_paths.map(&:strip).join(' ')
+      cmd_input_str  = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
      cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
-      %Q{ #{cmd_input_str} #{
+      %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
    end
 
  end
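
For reference, local_commandline above simply pipes the script into itself through a sort. A rough sketch of the string it produces, with hypothetical paths and script name:

    input_paths = "input.tsv"
    output_path = "output.tsv"
    mapper      = "/usr/bin/ruby word_count.rb --map"
    reducer     = "/usr/bin/ruby word_count.rb --reduce"

    cmd_input_str  = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
    cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
    puts %Q{ #{cmd_input_str}#{mapper} | sort | #{reducer} #{cmd_output_str} }
    # prints roughly:
    #   cat 'input.tsv' | /usr/bin/ruby word_count.rb --map | sort | /usr/bin/ruby word_count.rb --reduce > 'output.tsv'
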
data/lib/wukong/script.rb
CHANGED
@@ -4,7 +4,6 @@ require 'wukong/script/local_command'
 require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
 require 'rbconfig' # for uncovering ruby_interpreter_path
 module Wukong
-
  # == How to run a Wukong script
  #
  #   your/script.rb --run path/to/input_files path/to/output_dir
@@ -29,6 +28,13 @@ module Wukong
  # To use more than one file as input, you can use normal * ? [] wildcards or
  # give a comma-separated list -- see the hadoop documentation for syntax.
  #
+  # == Run in Elastic MapReduce Mode (--run=emr)
+  #
+  # Wukong can be used to start scripts on the amazon cloud
+  #
+  # * copies the script to s3 in two parts
+  # * invokes it using the amazon API
+  #
  # == Run locally (--run=local)
  #
  # To run your script locally, use --run=local
@@ -57,7 +63,8 @@ module Wukong
  class Script
    include Wukong::HadoopCommand
    include Wukong::LocalCommand
-
+    attr_reader :mapper_klass, :reducer_klass, :options
+    attr_reader :input_paths, :output_path
 
    # ---------------------------------------------------------------------------
    #
@@ -79,18 +86,14 @@ module Wukong
    # thus, requiring a working hadoop install), or to run in local mode
    # (script --map | sort | script --reduce)
    #
-    Settings.define :default_run_mode, :default => 'hadoop',
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :run, :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
-    Settings.define :local, :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
-    Settings.define :hadoop, :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
-    Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
+    Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
+    Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
+    Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
+    Settings.define :run, :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud.", :wukong => true
+    Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
+    Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
+    Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
+    Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
 
    #
    # Instantiate the Script with the Mapper and the Reducer class (each a
@@ -120,25 +123,46 @@ module Wukong
    #   MyScript.new(MyMapper, nil).run
    #
    def initialize mapper_klass, reducer_klass=nil, extra_options={}
-
-
-
-
-
-
-
-
+      Settings.resolve!
+      @options = Settings.dup
+      options.merge! extra_options
+      @mapper_klass  = mapper_klass
+      @reducer_klass = reducer_klass
+      @output_path   = options.rest.pop
+      @input_paths   = options.rest.reject(&:blank?)
+      if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
+        raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
+      end
    end
 
    #
-    #
-    #
-    #
-    #
-    #   super.merge :my_option => :val
+    # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
+    # the script as mapper, reducer, etc.
+    # If --map or --reduce, dispatch to the mapper or reducer.
    #
-    def
-
+    def run
+      case run_mode
+      when 'map'              then mapper_klass.new(self.options).stream
+      when 'reduce'           then reducer_klass.new(self.options).stream
+      when 'local'            then execute_local_workflow
+      when 'hadoop', 'mapred' then execute_hadoop_workflow
+      when 'emr'
+        require 'wukong/script/emr_command'
+        execute_emr_workflow
+      else dump_help
+      end
+    end
+
+    # if only --run is given, assume default run mode
+    def run_mode
+      case
+      when options[:map]           then 'map'
+      when options[:reduce]        then 'reduce'
+      when ($0 =~ /-mapper\.rb$/)  then 'map'
+      when ($0 =~ /-reducer\.rb$/) then 'reduce'
+      when (options[:run] == true) then options[:default_run_mode]
+      else options[:run].to_s
+      end
    end
 
    #
@@ -146,11 +170,11 @@ module Wukong
    # In hadoop mode, this is given to the hadoop streaming command.
    # In local mode, it's given to the system() call
    #
-    def
+    def mapper_commandline
      if mapper_klass
-
+        "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
      else
-        options[:map_command]
+        options[:map_command]
      end
    end
 
@@ -159,7 +183,7 @@ module Wukong
    # In hadoop mode, this is given to the hadoop streaming command.
    # In local mode, it's given to the system() call
    #
-    def
+    def reducer_commandline
      if reducer_klass
        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
      else
@@ -167,41 +191,37 @@ module Wukong
      end
    end
 
+    def job_name
+      options[:job_name] ||
+        "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
+    end
+
+
+    protected
+
    #
-    #
+    # Execute the runner phase:
+    # use the running framework to relaunch the script in map and in reduce mode
    #
-    def
-
-
-
-
-        command = local_command input_path, output_path
-      when 'hadoop', 'mapred'
-        $stderr.puts " Launching hadoop as"
-        command = hadoop_command input_path, output_path
+    def execute_command! *args
+      command = args.flatten.compact.join(" \\\n    ")
+      Log.info "Running\n\n#{command}\n"
+      if options[:dry_run]
+        Log.info '== [Not running preceding command: dry run] =='
      else
-
+        maybe_overwrite_output_paths! output_path
+        $stdout.puts `#{command}`
      end
    end
 
-
-
-
-
-
-        options[:run].to_s
-      end
-
-    def input_output_paths
-      output_path = options.rest.pop
-      input_paths = options.rest.reject(&:blank?)
-      raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
-      [input_paths, output_path]
-    end
-
+    #
+    # In hadoop mode only, removes the destination path before launching
+    #
+    # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
+    #
    def maybe_overwrite_output_paths! output_path
-      if (options[:overwrite] || options[:rm]) && (run_mode
-
+      if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
+        Log.info "Removing output file #{output_path}"
        `hdp-rm -r '#{output_path}'`
      end
    end
@@ -222,39 +242,15 @@ module Wukong
 
    # use the full ruby interpreter path to run slave processes
    def ruby_interpreter_path
-      Pathname.new(
-
-
-        ).realpath
-    end
-
-    #
-    # Execute the runner phase
-    #
-    def exec_hadoop_streaming
-      $stderr.puts "Streaming on self"
-      input_path, output_path = input_output_paths
-      command = runner_command(input_path, output_path)
-      $stderr.puts command
-      unless options[:dry_run]
-        maybe_overwrite_output_paths! output_path
-        $stdout.puts `#{command}`
-      end
+      Pathname.new(File.join(
+          Config::CONFIG["bindir"],
+          Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])).realpath
    end
 
    #
-    #
-    # Otherwise,
+    # Usage
    #
-    def
-      case
-      when options[:map]
-        mapper_klass.new(self.options).stream
-      when options[:reduce]
-        reducer_klass.new(self.options).stream
-      when options[:run]
-        exec_hadoop_streaming
-      else
+    def dump_help
      options.dump_help %Q{Please specify a run mode: you probably want to start with
  #{$0} --run --local input.tsv output.tsv
 although
@@ -262,8 +258,7 @@ although
 or
  cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
 can be useful for initial testing.}
-      end
    end
-    end
 
+  end
 end
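
The run/run_mode pair above replaces the old exec_hadoop_streaming dispatch. A toy version (not the gem's code) showing the order in which the mode is resolved; the filename checks are what let the copies that EmrCommand uploads as job-mapper.rb and job-reducer.rb pick the right phase on the cluster:

    def resolved_mode opts, script_name, default_mode = 'hadoop'
      case
      when opts[:map]                       then 'map'
      when opts[:reduce]                    then 'reduce'
      when (script_name =~ /-mapper\.rb$/)  then 'map'
      when (script_name =~ /-reducer\.rb$/) then 'reduce'
      when (opts[:run] == true)             then default_mode
      else                                       opts[:run].to_s
      end
    end

    puts resolved_mode({ :run => true  }, 'word_count.rb')          # hadoop (the :default_run_mode)
    puts resolved_mode({ :run => 'emr' }, 'word_count.rb')          # emr
    puts resolved_mode({},                'word_count-mapper.rb')   # map (how the uploaded EMR copy runs)
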