wukong 1.4.12 → 1.5.0

data/lib/wukong/script/emr_command.rb ADDED
@@ -0,0 +1,119 @@
+ require 'right_aws'
+ require 'configliere/config_block'
+ Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+ Settings.define :access_key,           :description => 'AWS Access key',        :env_var => 'AWS_ACCESS_KEY_ID'
+ Settings.define :secret_access_key,    :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
+ Settings.define :emr_runner,           :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
+ Settings.define :emr_root,             :description => 'S3 url to use as the base for Elastic MapReduce storage'
+ Settings.define :key_pair_file,        :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) }
+ Settings.define :key_pair,             :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') }
+ Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
+ Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
+ Settings.define :jobflow
+ module Wukong
+   #
+   # EMR Options
+   #
+   module EmrCommand
+
+     def execute_emr_workflow
+       copy_script_to_cloud
+       execute_emr_runner
+     end
+
+     def copy_script_to_cloud
+       Log.info "  Copying this script to the cloud."
+       S3Util.store(this_script_filename, mapper_s3_uri)
+       S3Util.store(this_script_filename, reducer_s3_uri)
+       S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+       S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
+       S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
+     end
+
+     def execute_emr_runner
+       command_args = [
+         :hadoop_version, :availability_zone, :key_pair, :key_pair_file,
+       ].map{|args| Settings.dashed_flag_for(*args) }
+       command_args += [
+         %Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key}},
+         "--stream",
+         "--mapper=#{mapper_s3_uri}",
+         "--reducer=#{reducer_s3_uri}",
+         "--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
+         # "--enable-debugging --log-uri=#{log_s3_uri}",
+         "--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
+         "--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
+         "--bootstrap-action=#{bootstrap_s3_uri}",
+       ]
+       if Settings.jobflow
+         command_args << "--jobflow=#{Settings[:jobflow]}"
+       else
+         command_args << '--alive --create'
+         command_args << "--name=#{job_name}"
+         command_args += [ [:instance_type, :slave_instance_type], :master_instance_type, :num_instances ].map{|args| Settings.dashed_flag_for(*args) }
+       end
+       execute_command!( File.expand_path(Settings.emr_runner), *command_args )
+     end
+
+     # A short name for this job
+     def job_handle
+       File.basename($0, '.rb')
+     end
+
+     def mapper_s3_uri
+       s3_path(job_handle+'-mapper.rb')
+     end
+     def reducer_s3_uri
+       s3_path(job_handle+'-reducer.rb')
+     end
+     def log_s3_uri
+       s3_path('log', job_handle)
+     end
+     def bootstrap_s3_uri
+       s3_path('bin', "bootstrap-#{job_handle}.sh")
+     end
+     def wukong_libs_s3_uri
+       s3_path('bin', "wukong-libs.tar.bz2")
+     end
+
+     def s3_path *path_segs
+       File.join(Settings.emr_root, path_segs.flatten.compact)
+     end
+
+     module ClassMethods
+
+       # Standard hack to create ClassMethods-on-include
+       def self.included base
+         base.class_eval do
+           extend ClassMethods
+         end
+       end
+     end
+
+     class S3Util
+       # class methods
+       class << self
+         def s3
+           @s3 ||= RightAws::S3Interface.new(
+             Settings.access_key, Settings.secret_access_key,
+             {:multi_thread => true, :logger => Log})
+         end
+
+         def bucket_and_path_from_uri uri
+           uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
+         end
+
+         def store filename, uri
+           Log.debug "  #{filename} => #{uri}"
+           dest_bucket, dest_key = bucket_and_path_from_uri(uri)
+           contents = File.open(filename)
+           s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
+         end
+
+       end
+     end
+   end
+   Script.class_eval do
+     include EmrCommand
+   end
+ end
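In practice the new mode is driven from the command line. A sketch of an invocation (assuming ~/.wukong/emr.yaml or these flags supply your AWS credentials; the bucket, key pair, and runner paths are placeholders):

    your/script.rb --run=emr \
      --emr_runner=~/elastic-mapreduce/elastic-mapreduce \
      --emr_root=s3://my-wukong-bucket/emr \
      --key_pair_file=~/.ssh/my-keypair.pem \
      s3://my-wukong-bucket/data/input s3://my-wukong-bucket/data/output

Note that copy_script_to_cloud uploads the same script twice, once as job_handle-mapper.rb and once as job_handle-reducer.rb, so that on the cluster the run mode can be inferred from the filename alone (see run_mode in script.rb below).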
data/lib/wukong/script/hadoop_command.rb CHANGED
@@ -2,85 +2,108 @@
  module Wukong
    module HadoopCommand

-     # ===========================================================================
-     #
-     # Hadoop Environment
-     #
-
      # ===========================================================================
      #
      # Hadoop Options
      #
+     Settings.define :hadoop_home,   :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
+     Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true

      #
-     # Translate the simplified args to their hairy-assed hadoop equivalents
+     # Translate simplified args to their hairy hadoop equivalents
      #
-     Settings.define :max_node_map_tasks,      :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',    :wukong => true
-     Settings.define :max_node_reduce_tasks,   :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
-     Settings.define :map_tasks,               :jobconf => true, :description => 'mapred.map.tasks',                        :wukong => true
-     Settings.define :reduce_tasks,            :jobconf => true, :description => 'mapred.reduce.tasks',                     :wukong => true
-     Settings.define :sort_fields,             :jobconf => true, :description => 'stream.num.map.output.key.fields',        :wukong => true
-     Settings.define :key_field_separator,     :jobconf => true, :description => 'map.output.key.field.separator',          :wukong => true
-     Settings.define :partition_fields,        :jobconf => true, :description => 'num.key.fields.for.partition',            :wukong => true
-     Settings.define :output_field_separator,  :jobconf => true, :description => 'stream.map.output.field.separator',       :wukong => true
-     Settings.define :map_speculative,         :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',  :wukong => true
-     Settings.define :timeout,                 :jobconf => true, :description => 'mapred.task.timeout',                     :wukong => true
-     Settings.define :reuse_jvms,              :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',          :wukong => true
-     Settings.define :respect_exit_status,     :jobconf => true, :description => 'stream.non.zero.exit.is.failure',         :wukong => true
-     Settings.define :io_sort_record_percent,  :jobconf => true, :description => 'io.sort.record.percent',                  :wukong => true
-     Settings.define :io_sort_mb,              :jobconf => true, :description => 'io.sort.mb',                              :wukong => true
+     Settings.define :max_node_map_tasks,      :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',    :wukong => true
+     Settings.define :max_node_reduce_tasks,   :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
+     Settings.define :map_tasks,               :jobconf => true, :description => 'mapred.map.tasks',                        :wukong => true
+     Settings.define :reduce_tasks,            :jobconf => true, :description => 'mapred.reduce.tasks',                     :wukong => true
+     Settings.define :sort_fields,             :jobconf => true, :description => 'stream.num.map.output.key.fields',        :wukong => true
+     Settings.define :key_field_separator,     :jobconf => true, :description => 'map.output.key.field.separator',          :wukong => true
+     Settings.define :partition_fields,        :jobconf => true, :description => 'num.key.fields.for.partition',            :wukong => true
+     Settings.define :output_field_separator,  :jobconf => true, :description => 'stream.map.output.field.separator',       :wukong => true
+     Settings.define :map_speculative,         :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',  :wukong => true
+     Settings.define :timeout,                 :jobconf => true, :description => 'mapred.task.timeout',                     :wukong => true
+     Settings.define :reuse_jvms,              :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',          :wukong => true
+     Settings.define :respect_exit_status,     :jobconf => true, :description => 'stream.non.zero.exit.is.failure',         :wukong => true
+     Settings.define :io_sort_mb,              :jobconf => true, :description => 'io.sort.mb',                              :wukong => true
+     Settings.define :io_sort_record_percent,  :jobconf => true, :description => 'io.sort.record.percent',                  :wukong => true
+     Settings.define :job_name,                :jobconf => true, :description => 'mapred.job.name',                         :wukong => true
+     Settings.define :max_reduces_per_node,    :jobconf => true, :description => 'mapred.max.reduces.per.node',             :wukong => true
+     Settings.define :max_reduces_per_cluster, :jobconf => true, :description => 'mapred.max.reduces.per.cluster',          :wukong => true
+     Settings.define :max_maps_per_node,       :jobconf => true, :description => 'mapred.max.maps.per.node',                :wukong => true
+     Settings.define :max_maps_per_cluster,    :jobconf => true, :description => 'mapred.max.maps.per.cluster',             :wukong => true
+     Settings.define :max_record_length,       :jobconf => true, :description => 'mapred.linerecordreader.maxlength',       :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
      Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
-     Settings.define :job_name,                :jobconf => true, :description => 'mapred.job.name',                         :wukong => true
-     # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
-     Settings.define :max_reduces_per_node,    :jobconf => true, :description => 'mapred.max.reduces.per.node',             :wukong => true
-     Settings.define :max_reduces_per_cluster, :jobconf => true, :description => 'mapred.max.reduces.per.cluster',          :wukong => true
-     Settings.define :max_maps_per_node,       :jobconf => true, :description => 'mapred.max.maps.per.node',                :wukong => true
-     Settings.define :max_maps_per_cluster,    :jobconf => true, :description => 'mapred.max.maps.per.cluster',             :wukong => true

-     # emit a -jobconf hadoop option if the simplified command line arg is present
-     # if not, the resulting nil will be elided later
-     def jobconf option
-       if options[option]
-         "-jobconf %s=%s" % [options.description_for(option), options[option]]
-       end
+     #
+     # Assemble the hadoop command to execute,
+     # and launch the hadoop runner to execute the script across all tasktrackers
+     #
+     def execute_hadoop_workflow
+       # If there is neither a reducer_klass nor a reduce_command, skip the reduce phase
+       options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+       # Input paths, joined by ','
+       input_paths = @input_paths.join(',')
+       #
+       # Use Settings[:hadoop_home] to set the path to your hadoop install.
+       hadoop_commandline = [
+         hadoop_runner,
+         "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+         hadoop_jobconf_options,
+         "-D mapred.job.name='#{job_name}'",
+         "-mapper  '#{mapper_commandline}'",
+         "-reducer '#{reducer_commandline}'",
+         "-input   '#{input_paths}'",
+         "-output  '#{output_path}'",
+         hadoop_recycle_env,
+         hadoop_other_args,
+       ].flatten.compact.join(" \t\\\n  ")
+       Log.info "  Launching hadoop!"
+       execute_command!(hadoop_commandline)
      end

-     # Define what fields hadoop should treat as the keys
-     def hadoop_sort_args
-       [
+     def hadoop_jobconf_options
+       jobconf_options = []
+       # The fields hadoop should treat as the keys
+       jobconf_options += [
          jobconf(:key_field_separator),
          jobconf(:sort_fields),
        ]
-     end
-
-     # Define what fields hadoop should use to distribute records to reducers
-     def hadoop_partition_args
+       # Fields hadoop should use to distribute records to reducers
        unless options[:partition_fields].blank?
-         [
+         jobconf_options += [
            '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
            jobconf(:output_field_separator),
            jobconf(:partition_fields),
          ]
        end
-     end
-
-     # Emit options for setting the number of mappers and reducers.
-     def hadoop_num_tasks_args
-       [
+       # Set the number of mappers and reducers.
+       jobconf_options += [
          jobconf(:max_node_map_tasks),
          jobconf(:max_node_reduce_tasks),
+         jobconf(:max_reduces_per_node),
+         jobconf(:max_reduces_per_cluster),
+         jobconf(:max_maps_per_node),
+         jobconf(:max_maps_per_cluster),
          jobconf(:map_tasks),
          jobconf(:reduce_tasks)
        ]
+       jobconf_options.flatten.compact
      end

-     def hadoop_other_args input_path, output_path
+     # Emit a -D hadoop option if the simplified command line arg is present;
+     # if not, the resulting nil will be elided later
+     def jobconf option
+       if options[option]
+         "-D %s=%s" % [options.description_for(option), options[option]]
+       end
+     end
+
+     def hadoop_other_args
        extra_str_args  = [ options[:extra_args] ]
        extra_str_args << '-lazyOutput' if options[:noempty]  # don't create a reduce file if there are no records
        options[:reuse_jvms]          = '-1'    if (options[:reuse_jvms] == true)
        options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-       options[:job_name] ||= "#{File.basename(this_script_filename)}---#{input_path}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
-       extra_hsh_args  = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
+       extra_hsh_args  = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
        extra_str_args + extra_hsh_args
      end

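Each simplified flag expands through jobconf into a -D option whose property name is the flag's description; for example (flag values illustrative):

    # given --reduce_tasks=20 --partition_fields=2 on the command line:
    jobconf(:reduce_tasks)      # => "-D mapred.reduce.tasks=20"
    jobconf(:partition_fields)  # => "-D num.key.fields.for.partition=2"
    jobconf(:map_speculative)   # => nil (flag absent; compacted away later)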
@@ -95,29 +118,6 @@ module Wukong
        options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
      end

-     #
-     # Assemble the hadoop command to execute
-     #
-     def hadoop_command input_path, output_path
-       # If this is wrong, create a config/wukong-site.rb or
-       # otherwise set Settings[:hadoop_home] to the
-       # root of your config install.
-       [
-         hadoop_runner,
-         "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
-         hadoop_partition_args,
-         hadoop_sort_args,
-         hadoop_num_tasks_args,
-         "-mapper '#{map_command}'",
-         "-reducer '#{reduce_command}'",
-         "-input '#{input_path}'",
-         "-output '#{output_path}'",
-         hadoop_recycle_env,
-         hadoop_other_args(input_path, output_path),
-       ].flatten.compact.join(" \t\\\n  ")
-     end
-
-
      module ClassMethods
        #
        # Via @pskomoroch via @tlipcon,
@@ -201,6 +201,7 @@ module Wukong
          ENV['stream_map_streamprocessor']
        end
      end
+
      # Standard ClassMethods-on-include trick
      def self.included base
        base.class_eval do
@@ -209,22 +210,3 @@ module Wukong
        end
      end
    end
-
-
-   # -inputformat <name of inputformat (class)> (“auto” by default)
-   # -input <additional DFS input path>
-   # -python <python command to use on nodes> (“python” by default)
-   # -name <job name> (“program.py” by default)
-   # -numMapTasks <number>
-   # -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
-   # -priority <priority value> (“NORMAL” by default)
-   # -libjar <path to jar> (this jar gets put in the class path)
-   # -libegg <path to egg> (this egg gets put in the Python path)
-   # -file <local file> (this file will be put in the dir where the python program gets executed)
-   # -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
-   # -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
-   # -cmdenv <env var name>=<value>
-   # -jobconf <property name>=<value>
-   # -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
-   # -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
-   # -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
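For reference, the streaming invocation that execute_hadoop_workflow (above) assembles comes out roughly like this (the paths, task count, and job name are illustrative):

    /usr/lib/hadoop/bin/hadoop \
      jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
      -D stream.num.map.output.key.fields=2 \
      -D mapred.reduce.tasks=20 \
      -D mapred.job.name='my_script.rb---data/in---data/out' \
      -mapper  '/usr/bin/ruby my_script.rb --map' \
      -reducer '/usr/bin/ruby my_script.rb --reduce' \
      -input   'data/in' \
      -output  'data/out'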
data/lib/wukong/script/local_command.rb CHANGED
@@ -1,21 +1,31 @@
  module Wukong
+   #
+   # Local execution Options
+   #
    module LocalCommand

-     # ===========================================================================
-     #
-     # Local execution Options
-     #
+     def execute_local_workflow
+       Log.info "  Reading STDIN / Writing STDOUT"
+       execute_command!(local_commandline)
+     end

      # Program, including args, to sort input between mapper and reducer in local
      # mode. You could override this to run, say, 'sort -n' (numeric sort) instead.
-     def sort_command
+     def local_mode_sort_commandline
        'sort'
      end

-     def local_command input_path, output_path
-       cmd_input_str  = (input_path == '-') ? "" : "cat '#{input_path}' | "
+     #
+     # Commandline string to execute the job in local mode
+     #
+     # With an input path of '-', just uses $stdin
+     # With an output path of '-', just uses $stdout
+     #
+     def local_commandline
+       @input_paths   = input_paths.map(&:strip).join("' '")
+       cmd_input_str  = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
        cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
-       %Q{ #{cmd_input_str} #{map_command} | #{sort_command} | #{reduce_command} #{cmd_output_str} }
+       %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
      end

    end
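In local mode the whole job collapses to a plain shell pipeline; with illustrative paths, local_commandline comes out as:

    cat 'data/input.tsv' | /usr/bin/ruby my_script.rb --map | sort | /usr/bin/ruby my_script.rb --reduce > 'data/output.tsv'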
data/lib/wukong/script.rb CHANGED
@@ -4,7 +4,6 @@ require 'wukong/script/local_command'
  require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
  require 'rbconfig'    # for uncovering ruby_interpreter_path
  module Wukong
-
    # == How to run a Wukong script
    #
    #   your/script.rb --run path/to/input_files path/to/output_dir
@@ -29,6 +28,13 @@ module Wukong
    # To use more than one file as input, you can use normal * ? [] wildcards or
    # give a comma-separated list -- see the hadoop documentation for syntax.
    #
+   # == Run in Elastic MapReduce Mode (--run=emr)
+   #
+   # Wukong can also launch your script on Amazon's Elastic MapReduce cloud:
+   #
+   # * it copies the script to S3 twice, once as the mapper and once as the reducer
+   # * it then starts the job through the elastic-mapreduce runner given by --emr_runner
+   #
    # == Run locally (--run=local)
    #
    # To run your script locally, use --run=local
@@ -57,7 +63,8 @@ module Wukong
    class Script
      include Wukong::HadoopCommand
      include Wukong::LocalCommand
-     attr_accessor :mapper_klass, :reducer_klass, :options
+     attr_reader   :mapper_klass, :reducer_klass, :options
+     attr_reader   :input_paths,  :output_path

      # ---------------------------------------------------------------------------
      #
@@ -79,18 +86,14 @@ module Wukong
      # thus, requiring a working hadoop install), or to run in local mode
      # (script --map | sort | script --reduce)
      #
-     Settings.define :default_run_mode, :default => 'hadoop',   :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
-     Settings.define :default_mapper,   :default => '/bin/cat', :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
-     Settings.define :default_reducer,  :default => '/bin/cat', :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
-     Settings.define :map_command,      :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
-     Settings.define :hadoop_home,      :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
-     Settings.define :hadoop_runner,    :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
-     Settings.define :map,              :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
-     Settings.define :reduce,           :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
-     Settings.define :run,              :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
-     Settings.define :local,            :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
-     Settings.define :hadoop,           :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
-     Settings.define :dry_run,          :description => "echo the command that will be run, but don't run it", :wukong => true
+     Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
+     Settings.define :map_command,      :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
+     Settings.define :reduce_command,   :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
+     Settings.define :run,              :description => "run the script's workflow: specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud.", :wukong => true
+     Settings.define :map,              :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
+     Settings.define :reduce,           :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
+     Settings.define :dry_run,          :description => "echo the command that will be run, but don't run it", :wukong => true
+     Settings.define :rm,               :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true

      #
      # Instantiate the Script with the Mapper and the Reducer class (each a
@@ -120,25 +123,46 @@ module Wukong
      #   MyScript.new(MyMapper, nil).run
      #
      def initialize mapper_klass, reducer_klass=nil, extra_options={}
-       self.options = Settings.dup
-       self.options.resolve!
-       self.options.merge! self.default_options
-       self.options.merge! extra_options
-       self.mapper_klass  = mapper_klass
-       self.reducer_klass = reducer_klass
-       # If no reducer_klass and no reduce_command, then skip the reduce phase
-       options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+       Settings.resolve!
+       @options = Settings.dup
+       options.merge! extra_options
+       @mapper_klass  = mapper_klass
+       @reducer_klass = reducer_klass
+       @output_path   = options.rest.pop
+       @input_paths   = options.rest.reject(&:blank?)
+       if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
+         raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
+       end
      end

      #
-     # Gives default options. Command line parameters take precedence
-     #
-     # MAKE SURE YOU CALL SUPER: write your script according to the pattern
-     #
-     #   super.merge :my_option => :val
+     # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
+     # the script as mapper, reducer, etc.
+     # If --map or --reduce, dispatch to the mapper or reducer.
      #
-     def default_options
-       {}
+     def run
+       case run_mode
+       when 'map'              then mapper_klass.new(self.options).stream
+       when 'reduce'           then reducer_klass.new(self.options).stream
+       when 'local'            then execute_local_workflow
+       when 'hadoop', 'mapred' then execute_hadoop_workflow
+       when 'emr'
+         require 'wukong/script/emr_command'
+         execute_emr_workflow
+       else dump_help
+       end
+     end
+
+     # If only --run is given, assume the default run mode
+     def run_mode
+       case
+       when options[:map]           then 'map'
+       when options[:reduce]        then 'reduce'
+       when ($0 =~ /-mapper\.rb$/)  then 'map'
+       when ($0 =~ /-reducer\.rb$/) then 'reduce'
+       when (options[:run] == true) then options[:default_run_mode]
+       else options[:run].to_s
+       end
      end

      #
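A quick trace of the new dispatch, for a hypothetical wordcount.rb:

    # wordcount.rb --map                  -> run_mode == 'map'  (streams the mapper on STDIN/STDOUT)
    # wordcount-mapper.rb                 -> 'map'              ($0 matches /-mapper\.rb$/, as staged by EMR)
    # wordcount.rb --run in.tsv out.tsv   -> 'hadoop'           (the :default_run_mode)
    # wordcount.rb --run=local in out     -> 'local'
    # wordcount.rb in.tsv out.tsv         -> dump_help          (no mode given)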
@@ -146,11 +170,11 @@ module Wukong
      # In hadoop mode, this is given to the hadoop streaming command.
      # In local mode, it's given to the system() call
      #
-     def map_command
+     def mapper_commandline
        if mapper_klass
-         "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+         "#{ruby_interpreter_path} #{this_script_filename} --map    " + non_wukong_params
        else
-         options[:map_command] || options[:default_mapper]
+         options[:map_command]
        end
      end

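mapper_commandline either relaunches this script with --map (passing along any non-wukong flags) or falls back to --map_command; illustrative results (the path and the flag --my_flag are hypothetical):

    mapper_commandline   # => "/usr/bin/ruby /path/to/wordcount.rb --map    --my_flag=3"
    # with no mapper class and --map_command='cut -f2':
    mapper_commandline   # => "cut -f2"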
@@ -159,7 +183,7 @@ module Wukong
      # In hadoop mode, this is given to the hadoop streaming command.
      # In local mode, it's given to the system() call
      #
-     def reduce_command
+     def reducer_commandline
        if reducer_klass
          "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
        else
167
191
  end
168
192
  end
169
193
 
194
+ def job_name
195
+ options[:job_name] ||
196
+ "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
197
+ end
198
+
199
+
200
+ protected
201
+
170
202
  #
171
- # Shell command to re-run in mapreduce mode using --map and --reduce
203
+ # Execute the runner phase:
204
+ # use the running framework to relaunch the script in map and in reduce mode
172
205
  #
173
- def runner_command input_path, output_path
174
- # run as either local or hadoop
175
- case run_mode
176
- when 'local'
177
- $stderr.puts " Reading STDIN / Writing STDOUT"
178
- command = local_command input_path, output_path
179
- when 'hadoop', 'mapred'
180
- $stderr.puts " Launching hadoop as"
181
- command = hadoop_command input_path, output_path
206
+ def execute_command! *args
207
+ command = args.flatten.compact.join(" \\\n ")
208
+ Log.info "Running\n\n#{command}\n"
209
+ if options[:dry_run]
210
+ Log.info '== [Not running preceding command: dry run] =='
182
211
  else
183
- raise "Need to use --run=local or --run=hadoop; or to use the :default_run_mode in config.yaml just say --run "
212
+ maybe_overwrite_output_paths! output_path
213
+ $stdout.puts `#{command}`
184
214
  end
185
215
  end
186
216
 
187
- def run_mode
188
- return 'local' if options[:local]
189
- return 'hadoop' if options[:hadoop]
190
- # if only --run is given, assume default run mode
191
- options[:run] = options[:default_run_mode] if (options[:run] == true)
192
- options[:run].to_s
193
- end
194
-
195
- def input_output_paths
196
- output_path = options.rest.pop
197
- input_paths = options.rest.reject(&:blank?)
198
- raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
199
- [input_paths, output_path]
200
- end
201
-
217
+ #
218
+ # In hadoop mode only, removes the destination path before launching
219
+ #
220
+ # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
221
+ #
202
222
  def maybe_overwrite_output_paths! output_path
203
- if (options[:overwrite] || options[:rm]) && (run_mode != 'local')
204
- $stderr.puts "Removing output file #{output_path}"
223
+ if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
224
+ Log.info "Removing output file #{output_path}"
205
225
  `hdp-rm -r '#{output_path}'`
206
226
  end
207
227
  end
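Absent an explicit --job_name, the generated name squeezes out anything shell-unsafe; for a hypothetical wordcount.rb reading data/in and writing data/out:

    job_name  # => "wordcount.rb---data/in---data/out"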
@@ -222,39 +242,15 @@ module Wukong

      # use the full ruby interpreter path to run slave processes
      def ruby_interpreter_path
-       Pathname.new(
-         File.join(Config::CONFIG["bindir"],
-           Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])
-       ).realpath
-     end
-
-     #
-     # Execute the runner phase
-     #
-     def exec_hadoop_streaming
-       $stderr.puts "Streaming on self"
-       input_path, output_path = input_output_paths
-       command = runner_command(input_path, output_path)
-       $stderr.puts command
-       unless options[:dry_run]
-         maybe_overwrite_output_paths! output_path
-         $stdout.puts `#{command}`
-       end
+       Pathname.new(File.join(
+         Config::CONFIG["bindir"],
+         Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])).realpath
      end

      #
-     # If --map or --reduce, dispatch to the mapper or reducer.
-     # Otherwise,
+     # Usage
      #
-     def run
-       case
-       when options[:map]
-         mapper_klass.new(self.options).stream
-       when options[:reduce]
-         reducer_klass.new(self.options).stream
-       when options[:run]
-         exec_hadoop_streaming
-       else
+     def dump_help
        options.dump_help %Q{Please specify a run mode: you probably want to start with
    #{$0} --run --local input.tsv output.tsv
  although
@@ -262,8 +258,7 @@ module Wukong
  or
      cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
  can be useful for initial testing.}
-       end
      end
-   end

+   end
  end
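Taken together, a minimal script under the reworked API needs only a mapper class and the dispatching run call; a sketch (the class body is hypothetical, in the style of the wukong examples):

    # wordcount.rb
    require 'wukong'

    class Mapper < Wukong::Streamer::LineStreamer
      # called once per input line; whatever is yielded is emitted as a record
      def process line
        yield [line.split.length, line]
      end
    end

    Wukong::Script.new(Mapper, nil).run  # dispatches on --map / --reduce / --run

With a nil reducer and no --reduce_command, the reduce phase is skipped outright: execute_hadoop_workflow forces reduce_tasks to 0.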