wukong 1.4.12 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.textile +20 -1
- data/bin/bootstrap.sh +32 -0
- data/bin/hdp-sort +3 -0
- data/bin/hdp-stream +3 -0
- data/docpages/README-elastic_map_reduce.textile +377 -0
- data/docpages/avro/avro_notes.textile +56 -0
- data/docpages/avro/tethering.textile +19 -0
- data/docpages/pig/commandline_params.txt +26 -0
- data/examples/emr/elastic_mapreduce_example.rb +35 -0
- data/lib/wukong/logger.rb +8 -1
- data/lib/wukong/script/avro_command.rb +5 -0
- data/lib/wukong/script/emr_command.rb +119 -0
- data/lib/wukong/script/hadoop_command.rb +72 -90
- data/lib/wukong/script/local_command.rb +18 -8
- data/lib/wukong/script.rb +87 -92
- data/wukong.gemspec +27 -18
- metadata +30 -21
data/lib/wukong/script/emr_command.rb ADDED
@@ -0,0 +1,119 @@
+require 'right_aws'
+require 'configliere/config_block'
+Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
+Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
+Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
+Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
+Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) }
+Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') }
+Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
+Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
+Settings.define :jobflow
+module Wukong
+  #
+  # EMR Options
+  #
+  module EmrCommand
+
+    def execute_emr_workflow
+      copy_script_to_cloud
+      execute_emr_runner
+    end
+
+    def copy_script_to_cloud
+      Log.info " Copying this script to the cloud."
+      S3Util.store(this_script_filename, mapper_s3_uri)
+      S3Util.store(this_script_filename, reducer_s3_uri)
+      S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+      S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
+      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
+    end
+
+    def execute_emr_runner
+      command_args = [
+        :hadoop_version, :availability_zone, :key_pair, :key_pair_file,
+      ].map{|args| Settings.dashed_flag_for(*args) }
+      command_args += [
+        %Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} },
+        "--stream",
+        "--mapper=#{mapper_s3_uri}",
+        "--reducer=#{reducer_s3_uri}",
+        "--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
+        #"--enable-debugging --log-uri=#{log_s3_uri}",
+        "--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
+        "--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
+        "--bootstrap-action=#{bootstrap_s3_uri}",
+      ]
+      if Settings.jobflow
+        command_args << "--jobflow=#{Settings[:jobflow]}"
+      else
+        command_args << '--alive --create'
+        command_args << "--name=#{job_name}"
+        command_args += [ [:instance_type, :slave_instance_type], :master_instance_type, :num_instances, ].map{|args| Settings.dashed_flag_for(*args) }
+      end
+      execute_command!( File.expand_path(Settings.emr_runner), *command_args )
+    end
+
+    # A short name for this job
+    def job_handle
+      File.basename($0,'.rb')
+    end
+
+    def mapper_s3_uri
+      s3_path(job_handle+'-mapper.rb')
+    end
+    def reducer_s3_uri
+      s3_path(job_handle+'-reducer.rb')
+    end
+    def log_s3_uri
+      s3_path('log', job_handle)
+    end
+    def bootstrap_s3_uri
+      s3_path('bin', "bootstrap-#{job_handle}.sh")
+    end
+    def wukong_libs_s3_uri
+      s3_path('bin', "wukong-libs.tar.bz2")
+    end
+
+    def s3_path *path_segs
+      File.join(Settings.emr_root, path_segs.flatten.compact)
+    end
+
+    module ClassMethods
+
+      # Standard hack to create ClassMethods-on-include
+      def self.included base
+        base.class_eval do
+          extend ClassMethods
+        end
+      end
+    end
+
+    class S3Util
+      # class methods
+      class << self
+        def s3
+          @s3 ||= RightAws::S3Interface.new(
+            Settings.access_key, Settings.secret_access_key,
+            {:multi_thread => true, :logger => Log})
+        end
+
+        def bucket_and_path_from_uri uri
+          uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
+        end
+
+        def store filename, uri
+          Log.debug " #{filename} => #{uri}"
+          dest_bucket, dest_key = bucket_and_path_from_uri(uri)
+          contents = File.open(filename)
+          s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
+        end
+
+      end
+    end
+  end
+  Script.class_eval do
+    include EmrCommand
+  end
+end
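For orientation, execute_emr_runner above stages the script on S3 and then shells out to Amazon's elastic-mapreduce command. A rough sketch of the invocation it assembles, assuming a hypothetical emr_root of s3://example-bucket/wukong and a script named wordcount.rb (the leading flags built by Settings.dashed_flag_for -- hadoop version, availability zone, key pair, instance types -- are omitted here because their exact rendering comes from configliere):

    /path/to/elastic-mapreduce \
      --verbose --debug --access-id <access_key> --private-key <secret_access_key> \
      --stream \
      --mapper=s3://example-bucket/wukong/wordcount-mapper.rb \
      --reducer=s3://example-bucket/wukong/wordcount-reducer.rb \
      --input=s3://example-bucket/wukong/wordcount-mapper.rb --output=s3://example-bucket/wukong/foo-out.tsv \
      --cache-archive=s3://example-bucket/wukong/bin/wukong-libs.jar#wukong-libs.jar \
      --cache=s3://example-bucket/wukong/bin/wukong-libs.tar.bz2#wukong-libs.tar.bz2 \
      --bootstrap-action=s3://example-bucket/wukong/bin/bootstrap-wordcount.sh \
      --alive --create --name=<job_name>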
data/lib/wukong/script/hadoop_command.rb CHANGED
@@ -2,85 +2,108 @@
 module Wukong
   module HadoopCommand
 
-    # ===========================================================================
-    #
-    # Hadoop Environment
-    #
-
     # ===========================================================================
     #
     # Hadoop Options
     #
+    Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
+    Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true
 
     #
-    # Translate
+    # Translate simplified args to their hairy hadoop equivalents
     #
-    Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',
-    Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum',
-    Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks',
-    Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks',
-    Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields',
-    Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator',
-    Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition',
-    Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator',
-    Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',
-    Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout',
-    Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',
-    Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure',
-    Settings.define :
-    Settings.define :
+    Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
+    Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
+    Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
+    Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
+    Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
+    Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
+    Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
+    Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
+    Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
+    Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
+    Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
+    Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
+    Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
+    Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
+    Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
+    Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
+    Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
+    Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
+    Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
+    Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
     Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
-    Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
-    # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
-    Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
-    Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
-    Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
-    Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
 
-    #
-    #
-
-
-
-
+    #
+    # Assemble the hadoop command to execute
+    # and launch the hadoop runner to execute the script across all tasktrackers
+    #
+    def execute_hadoop_workflow
+      # If no reducer_klass and no reduce_command, then skip the reduce phase
+      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+      # Input paths join by ','
+      input_paths = @input_paths.join(',')
+      #
+      # Use Settings[:hadoop_home] to set the path your config install.
+      hadoop_commandline = [
+        hadoop_runner,
+        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+        hadoop_jobconf_options,
+        "-D mapred.job.name '#{job_name}",
+        "-mapper '#{map_commandline}'",
+        "-reducer '#{reduce_commandline}'",
+        "-input '#{input_paths}'",
+        "-output '#{output_path}'",
+        hadoop_recycle_env,
+        hadoop_other_args(input_paths, output_path),
+      ].flatten.compact.join(" \t\\\n ")
+      Log.info " Launching hadoop!"
+      execute_command!(hadoop_commandline)
     end
 
-
-
-
+    def hadoop_jobconf_options
+      jobconf_options = []
+      # The fields should hadoop treat as the keys
+      jobconf_options += [
        jobconf(:key_field_separator),
        jobconf(:sort_fields),
      ]
-
-
-    # Define what fields hadoop should use to distribute records to reducers
-    def hadoop_partition_args
+      # Fields hadoop should use to distribute records to reducers
      unless options[:partition_fields].blank?
-        [
+        jobconf_options += [
          '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
          jobconf(:output_field_separator),
          jobconf(:partition_fields),
        ]
      end
-
-
-    # Emit options for setting the number of mappers and reducers.
-    def hadoop_num_tasks_args
-      [
+      # Setting the number of mappers and reducers.
+      jobconf_options += [
        jobconf(:max_node_map_tasks),
        jobconf(:max_node_reduce_tasks),
+        jobconf(:max_reduces_per_node),
+        jobconf(:max_reduces_per_cluster),
+        jobconf(:max_maps_per_node),
+        jobconf(:max_maps_per_cluster),
        jobconf(:map_tasks),
        jobconf(:reduce_tasks)
      ]
+      jobconf_options.flatten.compact
    end
 
-
+    # emit a -jobconf hadoop option if the simplified command line arg is present
+    # if not, the resulting nil will be elided later
+    def jobconf option
+      if options[option]
+        "-D %s=%s" % [options.description_for(option), options[option]]
+      end
+    end
+
+    def hadoop_other_args
      extra_str_args = [ options[:extra_args] ]
      extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
      options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
      options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-
-      extra_hsh_args = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
+      extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
      extra_str_args + extra_hsh_args
    end
 
@@ -95,29 +118,6 @@ module Wukong
       options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
     end
 
-    #
-    # Assemble the hadoop command to execute
-    #
-    def hadoop_command input_path, output_path
-      # If this is wrong, create a config/wukong-site.rb or
-      # otherwise set Settings[:hadoop_home] to the
-      # root of your config install.
-      [
-        hadoop_runner,
-        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
-        hadoop_partition_args,
-        hadoop_sort_args,
-        hadoop_num_tasks_args,
-        "-mapper '#{map_command}'",
-        "-reducer '#{reduce_command}'",
-        "-input '#{input_path}'",
-        "-output '#{output_path}'",
-        hadoop_recycle_env,
-        hadoop_other_args(input_path, output_path),
-      ].flatten.compact.join(" \t\\\n ")
-    end
-
-
     module ClassMethods
       #
       # Via @pskomoroch via @tlipcon,
@@ -201,6 +201,7 @@ module Wukong
         ENV['stream_map_streamprocessor']
       end
     end
+
     # Standard ClassMethods-on-include trick
     def self.included base
       base.class_eval do
@@ -209,22 +210,3 @@ module Wukong
     end
   end
 end
-
-
-# -inputformat <name of inputformat (class)> (“auto” by default)
-# -input <additional DFS input path>
-# -python <python command to use on nodes> (“python” by default)
-# -name <job name> (“program.py” by default)
-# -numMapTasks <number>
-# -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
-# -priority <priority value> (“NORMAL” by default)
-# -libjar <path to jar> (this jar gets put in the class path)
-# -libegg <path to egg> (this egg gets put in the Python path)
-# -file <local file> (this file will be put in the dir where the python program gets executed)
-# -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
-# -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
-# -cmdenv <env var name>=<value>
-# -jobconf <property name>=<value>
-# -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
-# -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
-# -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
data/lib/wukong/script/local_command.rb CHANGED
@@ -1,21 +1,31 @@
 module Wukong
+  #
+  # Local execution Options
+  #
   module LocalCommand
 
-
-
-
-
+    def execute_local_workflow
+      Log.info " Reading STDIN / Writing STDOUT"
+      execute_command!(local_commandline)
+    end
 
     # program, including arg, to sort input between mapper and reducer in local
     # mode. You could override to for example run 'sort -n' (numeric sort).
-    def
+    def local_mode_sort_commandline
       'sort'
     end
 
-
-
+    #
+    # Commandline string to execute the job in local mode
+    #
+    # With an input path of '-', just uses $stdin
+    # With an output path of '-', just uses $stdout
+    #
+    def local_commandline
+      @input_paths = input_paths.map(&:strip).join(' ')
+      cmd_input_str = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
       cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
-      %Q{ #{cmd_input_str} #{
+      %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
     end
 
   end
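For reference, local_commandline above stitches the map, sort, and reduce stages into a single shell pipeline. With hypothetical arguments input.tsv and output.tsv and a script wordcount.rb, the assembled command is roughly:

    cat 'input.tsv' | /usr/bin/ruby wordcount.rb --map <params> | sort | /usr/bin/ruby wordcount.rb --reduce <params> > 'output.tsv'

where the ruby path comes from ruby_interpreter_path and <params> stands for the non-wukong options passed through by mapper_commandline and reducer_commandline, both defined in script.rb below.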
data/lib/wukong/script.rb
CHANGED
@@ -4,7 +4,6 @@ require 'wukong/script/local_command'
 require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
 require 'rbconfig' # for uncovering ruby_interpreter_path
 module Wukong
-
   # == How to run a Wukong script
   #
   #   your/script.rb --run path/to/input_files path/to/output_dir
@@ -29,6 +28,13 @@ module Wukong
   # To use more than one file as input, you can use normal * ? [] wildcards or
   # give a comma-separated list -- see the hadoop documentation for syntax.
   #
+  # == Run in Elastic MapReduce Mode (--run=emr)
+  #
+  # Wukong can be used to start scripts on the amazon cloud
+  #
+  # * copies the script to s3 in two parts
+  # * invokes it using the amazon API
+  #
   # == Run locally (--run=local)
   #
   # To run your script locally, use --run=local
@@ -57,7 +63,8 @@ module Wukong
   class Script
     include Wukong::HadoopCommand
     include Wukong::LocalCommand
-
+    attr_reader :mapper_klass, :reducer_klass, :options
+    attr_reader :input_paths, :output_path
 
     # ---------------------------------------------------------------------------
     #
@@ -79,18 +86,14 @@ module Wukong
     # thus, requiring a working hadoop install), or to run in local mode
     # (script --map | sort | script --reduce)
     #
-    Settings.define :default_run_mode, :default => 'hadoop',
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :
-    Settings.define :run, :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
-    Settings.define :local, :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
-    Settings.define :hadoop, :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
-    Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
+    Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
+    Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
+    Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
+    Settings.define :run, :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud.", :wukong => true
+    Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
+    Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
+    Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
+    Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
 
     #
     # Instantiate the Script with the Mapper and the Reducer class (each a
@@ -120,25 +123,46 @@ module Wukong
     #   MyScript.new(MyMapper, nil).run
     #
     def initialize mapper_klass, reducer_klass=nil, extra_options={}
-
-
-
-
-
-
-
-
+      Settings.resolve!
+      @options = Settings.dup
+      options.merge! extra_options
+      @mapper_klass = mapper_klass
+      @reducer_klass = reducer_klass
+      @output_path = options.rest.pop
+      @input_paths = options.rest.reject(&:blank?)
+      if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
+        raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
+      end
     end
 
     #
-    #
-    #
-    #
-    #
-    #   super.merge :my_option => :val
+    # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
+    # the script as mapper, reducer, etc.
+    # If --map or --reduce, dispatch to the mapper or reducer.
     #
-    def
-
+    def run
+      case run_mode
+      when 'map'              then mapper_klass.new(self.options).stream
+      when 'reduce'           then reducer_klass.new(self.options).stream
+      when 'local'            then execute_local_workflow
+      when 'hadoop', 'mapred' then execute_hadoop_workflow
+      when 'emr'
+        require 'wukong/script/emr_command'
+        execute_emr_workflow
+      else dump_help
+      end
+    end
+
+    # if only --run is given, assume default run mode
+    def run_mode
+      case
+      when options[:map]    then 'map'
+      when options[:reduce] then 'reduce'
+      when ($0 =~ /-mapper\.rb$/)  then 'map'
+      when ($0 =~ /-reducer\.rb$/) then 'reduce'
+      when (options[:run] == true) then options[:default_run_mode]
+      else options[:run].to_s
+      end
     end
 
     #
@@ -146,11 +170,11 @@ module Wukong
     # In hadoop mode, this is given to the hadoop streaming command.
     # In local mode, it's given to the system() call
     #
-    def
+    def mapper_commandline
       if mapper_klass
-
+        "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
       else
-        options[:map_command]
+        options[:map_command]
       end
     end
 
@@ -159,7 +183,7 @@ module Wukong
     # In hadoop mode, this is given to the hadoop streaming command.
     # In local mode, it's given to the system() call
     #
-    def
+    def reducer_commandline
       if reducer_klass
         "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
       else
@@ -167,41 +191,37 @@ module Wukong
       end
     end
 
+    def job_name
+      options[:job_name] ||
+        "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
+    end
+
+
+    protected
+
     #
-    #
+    # Execute the runner phase:
+    # use the running framework to relaunch the script in map and in reduce mode
     #
-    def
-
-
-
-
-        command = local_command input_path, output_path
-      when 'hadoop', 'mapred'
-        $stderr.puts " Launching hadoop as"
-        command = hadoop_command input_path, output_path
+    def execute_command! *args
+      command = args.flatten.compact.join(" \\\n ")
+      Log.info "Running\n\n#{command}\n"
+      if options[:dry_run]
+        Log.info '== [Not running preceding command: dry run] =='
       else
-
+        maybe_overwrite_output_paths! output_path
+        $stdout.puts `#{command}`
       end
     end
 
-
-
-
-
-
-        options[:run].to_s
-      end
-
-    def input_output_paths
-      output_path = options.rest.pop
-      input_paths = options.rest.reject(&:blank?)
-      raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
-      [input_paths, output_path]
-    end
-
+    #
+    # In hadoop mode only, removes the destination path before launching
+    #
+    # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
+    #
     def maybe_overwrite_output_paths! output_path
-      if (options[:overwrite] || options[:rm]) && (run_mode
-
+      if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
+        Log.info "Removing output file #{output_path}"
         `hdp-rm -r '#{output_path}'`
       end
     end
@@ -222,39 +242,15 @@ module Wukong
 
     # use the full ruby interpreter path to run slave processes
     def ruby_interpreter_path
-      Pathname.new(
-
-
-        ).realpath
-    end
-
-    #
-    # Execute the runner phase
-    #
-    def exec_hadoop_streaming
-      $stderr.puts "Streaming on self"
-      input_path, output_path = input_output_paths
-      command = runner_command(input_path, output_path)
-      $stderr.puts command
-      unless options[:dry_run]
-        maybe_overwrite_output_paths! output_path
-        $stdout.puts `#{command}`
-      end
+      Pathname.new(File.join(
+        Config::CONFIG["bindir"],
+        Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])).realpath
     end
 
     #
-    #
-    # Otherwise,
+    # Usage
     #
-    def
-      case
-      when options[:map]
-        mapper_klass.new(self.options).stream
-      when options[:reduce]
-        reducer_klass.new(self.options).stream
-      when options[:run]
-        exec_hadoop_streaming
-      else
+    def dump_help
       options.dump_help %Q{Please specify a run mode: you probably want to start with
        #{$0} --run --local input.tsv output.tsv
      although
@@ -262,8 +258,7 @@ although
      or
        cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
      can be useful for initial testing.}
-      end
     end
-  end
 
+  end
 end