wukong 1.4.12 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wukong/script/emr_command.rb ADDED
@@ -0,0 +1,119 @@
+ require 'right_aws'
+ require 'configliere/config_block'
+ Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+ Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
+ Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
+ Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
+ Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
+ Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) }
+ Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') }
+ Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
+ Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
+ Settings.define :jobflow
+ module Wukong
+ #
+ # EMR Options
+ #
+ module EmrCommand
+
+ def execute_emr_workflow
+ copy_script_to_cloud
+ execute_emr_runner
+ end
+
+ def copy_script_to_cloud
+ Log.info " Copying this script to the cloud."
+ S3Util.store(this_script_filename, mapper_s3_uri)
+ S3Util.store(this_script_filename, reducer_s3_uri)
+ S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+ S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
+ S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
+ end
+
+ def execute_emr_runner
+ command_args = [
+ :hadoop_version, :availability_zone, :key_pair, :key_pair_file,
+ ].map{|args| Settings.dashed_flag_for(*args) }
+ command_args += [
+ %Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} },
+ "--stream",
+ "--mapper=#{mapper_s3_uri}",
+ "--reducer=#{reducer_s3_uri}",
+ "--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
+ #"--enable-debugging --log-uri=#{log_s3_uri}",
+ "--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
+ "--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
+ "--bootstrap-action=#{bootstrap_s3_uri}",
+ ]
+ if Settings.jobflow
+ command_args << "--jobflow=#{Settings[:jobflow]}"
+ else
+ command_args << '--alive --create'
+ command_args << "--name=#{job_name}"
+ command_args += [ [:instance_type, :slave_instance_type] , :master_instance_type, :num_instances, ].map{|args| Settings.dashed_flag_for(*args) }
+ end
+ execute_command!( File.expand_path(Settings.emr_runner), *command_args )
+ end
+
+ # A short name for this job
+ def job_handle
+ File.basename($0,'.rb')
+ end
+
+ def mapper_s3_uri
+ s3_path(job_handle+'-mapper.rb')
+ end
+ def reducer_s3_uri
+ s3_path(job_handle+'-reducer.rb')
+ end
+ def log_s3_uri
+ s3_path('log', job_handle)
+ end
+ def bootstrap_s3_uri
+ s3_path('bin', "bootstrap-#{job_handle}.sh")
+ end
+ def wukong_libs_s3_uri
+ s3_path('bin', "wukong-libs.tar.bz2")
+ end
+
+ def s3_path *path_segs
+ File.join(Settings.emr_root, path_segs.flatten.compact)
+ end
+
+ module ClassMethods
+
+ # Standard hack to create ClassMethods-on-include
+ def self.included base
+ base.class_eval do
+ extend ClassMethods
+ end
+ end
+ end
+
+ class S3Util
+ # class methods
+ class << self
+ def s3
+ @s3 ||= RightAws::S3Interface.new(
+ Settings.access_key, Settings.secret_access_key,
+ {:multi_thread => true, :logger => Log})
+ end
+
+ def bucket_and_path_from_uri uri
+ uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
+ end
+
+ def store filename, uri
+ Log.debug " #{filename} => #{uri}"
+ dest_bucket, dest_key = bucket_and_path_from_uri(uri)
+ contents = File.open(filename)
+ s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
+ end
+
+ end
+ end
+ end
+ Script.class_eval do
+ include EmrCommand
+ end
+ end
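
The new EmrCommand mixin above backs the --run=emr mode: it uploads the script to S3 as a -mapper.rb and a -reducer.rb copy, then shells out to Amazon's elastic-mapreduce runner using credentials read from ~/.wukong/emr.yaml or the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables. A minimal sketch of how it would be driven; the script name, bucket, and key-pair paths below are hypothetical, not taken from the gem:

  # word_count.rb -- an ordinary Wukong script
  require 'wukong'
  class Mapper < Wukong::Streamer::LineStreamer
    # emit a [word, 1] record for each word on the line
    def process line
      line.strip.split(/\W+/).reject(&:blank?).each{|word| yield [word, 1] }
    end
  end
  Wukong::Script.new(Mapper, nil).run

  # launched against Elastic MapReduce, roughly:
  #   ./word_count.rb --run=emr \
  #     --emr_root=s3://my-bucket/wukong \
  #     --emr_runner=~/elastic-mapreduce/elastic-mapreduce \
  #     --key_pair_file=~/.ssh/my-keypair.pem \
  #     s3://my-bucket/data/input s3://my-bucket/data/output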
data/lib/wukong/script/hadoop_command.rb CHANGED
@@ -2,85 +2,108 @@
  module Wukong
  module HadoopCommand

- # ===========================================================================
- #
- # Hadoop Environment
- #
-
  # ===========================================================================
  #
  # Hadoop Options
  #
+ Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
+ Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true

  #
- # Translate the simplified args to their hairy-assed hadoop equivalents
+ # Translate simplified args to their hairy hadoop equivalents
  #
- Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
- Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
- Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
- Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
- Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
- Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
- Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
- Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
- Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
- Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
- Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
- Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
- Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
- Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
+ Settings.define :max_node_map_tasks, :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum', :wukong => true
+ Settings.define :max_node_reduce_tasks, :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
+ Settings.define :map_tasks, :jobconf => true, :description => 'mapred.map.tasks', :wukong => true
+ Settings.define :reduce_tasks, :jobconf => true, :description => 'mapred.reduce.tasks', :wukong => true
+ Settings.define :sort_fields, :jobconf => true, :description => 'stream.num.map.output.key.fields', :wukong => true
+ Settings.define :key_field_separator, :jobconf => true, :description => 'map.output.key.field.separator', :wukong => true
+ Settings.define :partition_fields, :jobconf => true, :description => 'num.key.fields.for.partition', :wukong => true
+ Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator', :wukong => true
+ Settings.define :map_speculative, :jobconf => true, :description => 'mapred.map.tasks.speculative.execution', :wukong => true
+ Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
+ Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
+ Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
+ Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
+ Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
+ Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
+ Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
+ Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
+ Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
+ Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
+ Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
  Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
- Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
- # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
- Settings.define :max_reduces_per_node, :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
- Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
- Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
- Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true

- # emit a -jobconf hadoop option if the simplified command line arg is present
- # if not, the resulting nil will be elided later
- def jobconf option
- if options[option]
- "-jobconf %s=%s" % [options.description_for(option), options[option]]
- end
+ #
+ # Assemble the hadoop command to execute
+ # and launch the hadoop runner to execute the script across all tasktrackers
+ #
+ def execute_hadoop_workflow
+ # If no reducer_klass and no reduce_command, then skip the reduce phase
+ options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+ # Input paths join by ','
+ input_paths = @input_paths.join(',')
+ #
+ # Use Settings[:hadoop_home] to set the path your config install.
+ hadoop_commandline = [
+ hadoop_runner,
+ "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+ hadoop_jobconf_options,
+ "-D mapred.job.name '#{job_name}",
+ "-mapper '#{map_commandline}'",
+ "-reducer '#{reduce_commandline}'",
+ "-input '#{input_paths}'",
+ "-output '#{output_path}'",
+ hadoop_recycle_env,
+ hadoop_other_args(input_paths, output_path),
+ ].flatten.compact.join(" \t\\\n ")
+ Log.info " Launching hadoop!"
+ execute_command!(hadoop_commandline)
  end

- # Define what fields hadoop should treat as the keys
- def hadoop_sort_args
- [
+ def hadoop_jobconf_options
+ jobconf_options = []
+ # The fields should hadoop treat as the keys
+ jobconf_options += [
  jobconf(:key_field_separator),
  jobconf(:sort_fields),
  ]
- end
-
- # Define what fields hadoop should use to distribute records to reducers
- def hadoop_partition_args
+ # Fields hadoop should use to distribute records to reducers
  unless options[:partition_fields].blank?
- [
+ jobconf_options += [
  '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
  jobconf(:output_field_separator),
  jobconf(:partition_fields),
  ]
  end
- end
-
- # Emit options for setting the number of mappers and reducers.
- def hadoop_num_tasks_args
- [
+ # Setting the number of mappers and reducers.
+ jobconf_options += [
  jobconf(:max_node_map_tasks),
  jobconf(:max_node_reduce_tasks),
+ jobconf(:max_reduces_per_node),
+ jobconf(:max_reduces_per_cluster),
+ jobconf(:max_maps_per_node),
+ jobconf(:max_maps_per_cluster),
  jobconf(:map_tasks),
  jobconf(:reduce_tasks)
  ]
+ jobconf_options.flatten.compact
  end

- def hadoop_other_args input_path, output_path
+ # emit a -jobconf hadoop option if the simplified command line arg is present
+ # if not, the resulting nil will be elided later
+ def jobconf option
+ if options[option]
+ "-D %s=%s" % [options.description_for(option), options[option]]
+ end
+ end
+
+ def hadoop_other_args
  extra_str_args = [ options[:extra_args] ]
  extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
  options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
  options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
- options[:job_name] ||= "#{File.basename(this_script_filename)}---#{input_path}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
- extra_hsh_args = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
+ extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
  extra_str_args + extra_hsh_args
  end

@@ -95,29 +118,6 @@ module Wukong
  options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
  end

- #
- # Assemble the hadoop command to execute
- #
- def hadoop_command input_path, output_path
- # If this is wrong, create a config/wukong-site.rb or
- # otherwise set Settings[:hadoop_home] to the
- # root of your config install.
- [
- hadoop_runner,
- "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
- hadoop_partition_args,
- hadoop_sort_args,
- hadoop_num_tasks_args,
- "-mapper '#{map_command}'",
- "-reducer '#{reduce_command}'",
- "-input '#{input_path}'",
- "-output '#{output_path}'",
- hadoop_recycle_env,
- hadoop_other_args(input_path, output_path),
- ].flatten.compact.join(" \t\\\n ")
- end
-
-
  module ClassMethods
  #
  # Via @pskomoroch via @tlipcon,
@@ -201,6 +201,7 @@ module Wukong
  ENV['stream_map_streamprocessor']
  end
  end
+
  # Standard ClassMethods-on-include trick
  def self.included base
  base.class_eval do
@@ -209,22 +210,3 @@ module Wukong
  end
  end
  end
-
-
- # -inputformat <name of inputformat (class)> (“auto” by default)
- # -input <additional DFS input path>
- # -python <python command to use on nodes> (“python” by default)
- # -name <job name> (“program.py” by default)
- # -numMapTasks <number>
- # -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
- # -priority <priority value> (“NORMAL” by default)
- # -libjar <path to jar> (this jar gets put in the class path)
- # -libegg <path to egg> (this egg gets put in the Python path)
- # -file <local file> (this file will be put in the dir where the python program gets executed)
- # -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
- # -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
- # -cmdenv <env var name>=<value>
- # -jobconf <property name>=<value>
- # -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
- # -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
- # -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
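
With the hunks above, the simplified --flags defined via Settings.define are now emitted as Hadoop -D properties (previously -jobconf), and execute_hadoop_workflow assembles the whole streaming invocation that used to live in hadoop_command. A rough sketch of the translation, with a hypothetical script name and paths:

  ./word_count.rb --run=hadoop --reduce_tasks=12 --partition_fields=2 /data/input /data/output

  # expands to a streaming command along these lines (flag order per hadoop_jobconf_options above):
  /usr/lib/hadoop/bin/hadoop \
    jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -D num.key.fields.for.partition=2 \
    -D mapred.reduce.tasks=12 \
    -mapper  'ruby word_count.rb --map ...' \
    -reducer 'ruby word_count.rb --reduce ...' \
    -input   '/data/input' \
    -output  '/data/output'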
data/lib/wukong/script/local_command.rb CHANGED
@@ -1,21 +1,31 @@
  module Wukong
+ #
+ # Local execution Options
+ #
  module LocalCommand

- # ===========================================================================
- #
- # Local execution Options
- #
+ def execute_local_workflow
+ Log.info " Reading STDIN / Writing STDOUT"
+ execute_command!(local_commandline)
+ end

  # program, including arg, to sort input between mapper and reducer in local
  # mode. You could override to for example run 'sort -n' (numeric sort).
- def sort_command
+ def local_mode_sort_commandline
  'sort'
  end

- def local_command input_path, output_path
- cmd_input_str = (input_path == '-') ? "" : "cat '#{input_path}' | "
+ #
+ # Commandline string to execute the job in local mode
+ #
+ # With an input path of '-', just uses $stdin
+ # With an output path of '-', just uses $stdout
+ #
+ def local_commandline
+ @input_paths = input_paths.map(&:strip).join(' ')
+ cmd_input_str = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
  cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
- %Q{ #{cmd_input_str} #{map_command} | #{sort_command} | #{reduce_command} #{cmd_output_str} }
+ %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
  end

  end
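
Local mode now funnels through the same execute_command! helper: execute_local_workflow builds one shell pipeline with local_commandline, and a '-' input or output path streams through STDIN/STDOUT instead of a file. A sketch with hypothetical file names:

  ./word_count.rb --run=local data/sonnets.txt data/word_counts.tsv

  # assembles, roughly:
  cat 'data/sonnets.txt' | ruby word_count.rb --map | sort | ruby word_count.rb --reduce > 'data/word_counts.tsv'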
data/lib/wukong/script.rb CHANGED
@@ -4,7 +4,6 @@ require 'wukong/script/local_command'
  require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
  require 'rbconfig' # for uncovering ruby_interpreter_path
  module Wukong
-
  # == How to run a Wukong script
  #
  # your/script.rb --run path/to/input_files path/to/output_dir
@@ -29,6 +28,13 @@ module Wukong
  # To use more than one file as input, you can use normal * ? [] wildcards or
  # give a comma-separated list -- see the hadoop documentation for syntax.
  #
+ # == Run in Elastic MapReduce Mode (--run=emr)
+ #
+ # Wukong can be used to start scripts on the amazon cloud
+ #
+ # * copies the script to s3 in two parts
+ # * invokes it using the amazon API
+ #
  # == Run locally (--run=local)
  #
  # To run your script locally, use --run=local
@@ -57,7 +63,8 @@ module Wukong
  class Script
  include Wukong::HadoopCommand
  include Wukong::LocalCommand
- attr_accessor :mapper_klass, :reducer_klass, :options
+ attr_reader :mapper_klass, :reducer_klass, :options
+ attr_reader :input_paths, :output_path

  # ---------------------------------------------------------------------------
  #
@@ -79,18 +86,14 @@ module Wukong
  # thus, requiring a working hadoop install), or to run in local mode
  # (script --map | sort | script --reduce)
  #
- Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
- Settings.define :default_mapper, :default => '/bin/cat', :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
- Settings.define :default_reducer, :default => '/bin/cat', :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
- Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
- Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
- Settings.define :hadoop_runner, :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
- Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
- Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
- Settings.define :run, :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
- Settings.define :local, :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
- Settings.define :hadoop, :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
- Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
+ Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
+ Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
+ Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
+ Settings.define :run, :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud.", :wukong => true
+ Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
+ Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
+ Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
+ Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true

  #
  # Instantiate the Script with the Mapper and the Reducer class (each a
@@ -120,25 +123,46 @@ module Wukong
  # MyScript.new(MyMapper, nil).run
  #
  def initialize mapper_klass, reducer_klass=nil, extra_options={}
- self.options = Settings.dup
- self.options.resolve!
- self.options.merge! self.default_options
- self.options.merge! extra_options
- self.mapper_klass = mapper_klass
- self.reducer_klass = reducer_klass
- # If no reducer_klass and no reduce_command, then skip the reduce phase
- options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+ Settings.resolve!
+ @options = Settings.dup
+ options.merge! extra_options
+ @mapper_klass = mapper_klass
+ @reducer_klass = reducer_klass
+ @output_path = options.rest.pop
+ @input_paths = options.rest.reject(&:blank?)
+ if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
+ raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
+ end
  end

  #
- # Gives default options. Command line parameters take precedence
- #
- # MAKE SURE YOU CALL SUPER: write your script according to the pattern
- #
- # super.merge :my_option => :val
+ # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
+ # the script as mapper, reducer, etc.
+ # If --map or --reduce, dispatch to the mapper or reducer.
  #
- def default_options
- {}
+ def run
+ case run_mode
+ when 'map' then mapper_klass.new(self.options).stream
+ when 'reduce' then reducer_klass.new(self.options).stream
+ when 'local' then execute_local_workflow
+ when 'hadoop', 'mapred' then execute_hadoop_workflow
+ when 'emr'
+ require 'wukong/script/emr_command'
+ execute_emr_workflow
+ else dump_help
+ end
+ end
+
+ # if only --run is given, assume default run mode
+ def run_mode
+ case
+ when options[:map] then 'map'
+ when options[:reduce] then 'reduce'
+ when ($0 =~ /-mapper\.rb$/) then 'map'
+ when ($0 =~ /-reducer\.rb$/) then 'reduce'
+ when (options[:run] == true) then options[:default_run_mode]
+ else options[:run].to_s
+ end
  end

@@ -146,11 +170,11 @@ module Wukong
  # In hadoop mode, this is given to the hadoop streaming command.
  # In local mode, it's given to the system() call
  #
- def map_command
+ def mapper_commandline
  if mapper_klass
- "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+ "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
  else
- options[:map_command] || options[:default_mapper]
+ options[:map_command]
  end
  end

@@ -159,7 +183,7 @@ module Wukong
  # In hadoop mode, this is given to the hadoop streaming command.
  # In local mode, it's given to the system() call
  #
- def reduce_command
+ def reducer_commandline
  if reducer_klass
  "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
  else
@@ -167,41 +191,37 @@ module Wukong
  end
  end

+ def job_name
+ options[:job_name] ||
+ "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
+ end
+
+
+ protected
+
  #
- # Shell command to re-run in mapreduce mode using --map and --reduce
+ # Execute the runner phase:
+ # use the running framework to relaunch the script in map and in reduce mode
  #
- def runner_command input_path, output_path
- # run as either local or hadoop
- case run_mode
- when 'local'
- $stderr.puts " Reading STDIN / Writing STDOUT"
- command = local_command input_path, output_path
- when 'hadoop', 'mapred'
- $stderr.puts " Launching hadoop as"
- command = hadoop_command input_path, output_path
+ def execute_command! *args
+ command = args.flatten.compact.join(" \\\n ")
+ Log.info "Running\n\n#{command}\n"
+ if options[:dry_run]
+ Log.info '== [Not running preceding command: dry run] =='
  else
- raise "Need to use --run=local or --run=hadoop; or to use the :default_run_mode in config.yaml just say --run "
+ maybe_overwrite_output_paths! output_path
+ $stdout.puts `#{command}`
  end
  end

- def run_mode
- return 'local' if options[:local]
- return 'hadoop' if options[:hadoop]
- # if only --run is given, assume default run mode
- options[:run] = options[:default_run_mode] if (options[:run] == true)
- options[:run].to_s
- end
-
- def input_output_paths
- output_path = options.rest.pop
- input_paths = options.rest.reject(&:blank?)
- raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
- [input_paths, output_path]
- end
-
+ #
+ # In hadoop mode only, removes the destination path before launching
+ #
+ # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
+ #
  def maybe_overwrite_output_paths! output_path
- if (options[:overwrite] || options[:rm]) && (run_mode != 'local')
- $stderr.puts "Removing output file #{output_path}"
+ if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
+ Log.info "Removing output file #{output_path}"
  `hdp-rm -r '#{output_path}'`
  end
  end
@@ -222,39 +242,15 @@ module Wukong

  # use the full ruby interpreter path to run slave processes
  def ruby_interpreter_path
- Pathname.new(
- File.join(Config::CONFIG["bindir"],
- Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])
- ).realpath
- end
-
- #
- # Execute the runner phase
- #
- def exec_hadoop_streaming
- $stderr.puts "Streaming on self"
- input_path, output_path = input_output_paths
- command = runner_command(input_path, output_path)
- $stderr.puts command
- unless options[:dry_run]
- maybe_overwrite_output_paths! output_path
- $stdout.puts `#{command}`
- end
+ Pathname.new(File.join(
+ Config::CONFIG["bindir"],
+ Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])).realpath
  end

  #
- # If --map or --reduce, dispatch to the mapper or reducer.
- # Otherwise,
+ # Usage
  #
- def run
- case
- when options[:map]
- mapper_klass.new(self.options).stream
- when options[:reduce]
- reducer_klass.new(self.options).stream
- when options[:run]
- exec_hadoop_streaming
- else
+ def dump_help
  options.dump_help %Q{Please specify a run mode: you probably want to start with
  #{$0} --run --local input.tsv output.tsv
  although
@@ -262,8 +258,7 @@ although
  or
  cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
  can be useful for initial testing.}
- end
  end
- end

+ end
  end
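
Script#run now dispatches on run_mode instead of the old --local/--hadoop flags and exec_hadoop_streaming, and the input/output paths are captured once in initialize from the leftover command-line arguments. A sketch of how a few invocations resolve (hypothetical script name; see run_mode above):

  ./my_script.rb --map                           # 'map'    -- stream the mapper over STDIN/STDOUT
  ./my_script.rb --run       input.tsv out_dir   # bare --run falls back to :default_run_mode ('hadoop')
  ./my_script.rb --run=local input.tsv out_dir   # 'local'  -- cat | map | sort | reduce
  ./my_script.rb --run=emr   s3_input s3_output  # 'emr'    -- requires wukong/script/emr_command first
  ./my_script.rb --run --dry_run in_dir out_dir  # logs the assembled command without executing it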