wukong 1.4.12 → 1.5.0

data/lib/wukong/script/emr_command.rb ADDED
@@ -0,0 +1,119 @@
+ require 'right_aws'
+ require 'configliere/config_block'
+ Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+ Settings.define :access_key,           :description => 'AWS Access key',        :env_var => 'AWS_ACCESS_KEY_ID'
+ Settings.define :secret_access_key,    :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
+ Settings.define :emr_runner,           :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
+ Settings.define :emr_root,             :description => 'S3 url to use as the base for Elastic MapReduce storage'
+ Settings.define :key_pair_file,        :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) }
+ Settings.define :key_pair,             :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') }
+ Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
+ Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
+ Settings.define :jobflow
+ module Wukong
+   #
+   # EMR Options
+   #
+   module EmrCommand
+
+     def execute_emr_workflow
+       copy_script_to_cloud
+       execute_emr_runner
+     end
+
+     def copy_script_to_cloud
+       Log.info "  Copying this script to the cloud."
+       S3Util.store(this_script_filename, mapper_s3_uri)
+       S3Util.store(this_script_filename, reducer_s3_uri)
+       S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+       S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
+       S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
+     end
+
+     def execute_emr_runner
+       command_args = [
+         :hadoop_version, :availability_zone, :key_pair, :key_pair_file,
+       ].map{|args| Settings.dashed_flag_for(*args) }
+       command_args += [
+         %Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key}},
+         "--stream",
+         "--mapper=#{mapper_s3_uri}",
+         "--reducer=#{reducer_s3_uri}",
+         "--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
+         # "--enable-debugging --log-uri=#{log_s3_uri}",
+         "--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
+         "--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
+         "--bootstrap-action=#{bootstrap_s3_uri}",
+       ]
+       if Settings.jobflow
+         command_args << "--jobflow=#{Settings[:jobflow]}"
+       else
+         command_args << '--alive --create'
+         command_args << "--name=#{job_name}"
+         command_args += [ [:instance_type, :slave_instance_type], :master_instance_type, :num_instances ].map{|args| Settings.dashed_flag_for(*args) }
+       end
+       execute_command!( File.expand_path(Settings.emr_runner), *command_args )
+     end
+
+     # A short name for this job
+     def job_handle
+       File.basename($0, '.rb')
+     end
+
+     def mapper_s3_uri
+       s3_path(job_handle+'-mapper.rb')
+     end
+     def reducer_s3_uri
+       s3_path(job_handle+'-reducer.rb')
+     end
+     def log_s3_uri
+       s3_path('log', job_handle)
+     end
+     def bootstrap_s3_uri
+       s3_path('bin', "bootstrap-#{job_handle}.sh")
+     end
+     def wukong_libs_s3_uri
+       s3_path('bin', "wukong-libs.tar.bz2")
+     end
+
+     def s3_path *path_segs
+       File.join(Settings.emr_root, path_segs.flatten.compact)
+     end
+
+     module ClassMethods
+
+       # Standard hack to create ClassMethods-on-include
+       def self.included base
+         base.class_eval do
+           extend ClassMethods
+         end
+       end
+     end
+
+     class S3Util
+       # class methods
+       class << self
+         def s3
+           @s3 ||= RightAws::S3Interface.new(
+             Settings.access_key, Settings.secret_access_key,
+             {:multi_thread => true, :logger => Log})
+         end
+
+         def bucket_and_path_from_uri uri
+           uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
+         end
+
+         def store filename, uri
+           Log.debug "  #{filename} => #{uri}"
+           dest_bucket, dest_key = bucket_and_path_from_uri(uri)
+           contents = File.open(filename)
+           s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
+         end
+
+       end
+     end
+   end
+   Script.class_eval do
+     include EmrCommand
+   end
+ end
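In practice the new mode is driven from the command line. A sketch of an invocation (assuming ~/.wukong/emr.yaml or these flags supply your AWS credentials; the bucket, key pair, and runner paths are placeholders):

    your/script.rb --run=emr \
      --emr_runner=~/elastic-mapreduce/elastic-mapreduce \
      --emr_root=s3://my-wukong-bucket/emr \
      --key_pair_file=~/.ssh/my-keypair.pem \
      s3://my-wukong-bucket/data/input s3://my-wukong-bucket/data/output

Note that copy_script_to_cloud uploads the same script twice, once as job_handle-mapper.rb and once as job_handle-reducer.rb, so that on the cluster the run mode can be inferred from the filename alone (see run_mode in script.rb below).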
data/lib/wukong/script/hadoop_command.rb CHANGED
@@ -2,85 +2,108 @@
  module Wukong
    module HadoopCommand

-     # ===========================================================================
-     #
-     # Hadoop Environment
-     #
-
      # ===========================================================================
      #
      # Hadoop Options
      #
+     Settings.define :hadoop_home,   :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
+     Settings.define :hadoop_runner, :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true

      #
-     # Translate the simplified args to their hairy-assed hadoop equivalents
+     # Translate simplified args to their hairy hadoop equivalents
      #
-     Settings.define :max_node_map_tasks,      :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',    :wukong => true
-     Settings.define :max_node_reduce_tasks,   :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
-     Settings.define :map_tasks,               :jobconf => true, :description => 'mapred.map.tasks',                        :wukong => true
-     Settings.define :reduce_tasks,            :jobconf => true, :description => 'mapred.reduce.tasks',                     :wukong => true
-     Settings.define :sort_fields,             :jobconf => true, :description => 'stream.num.map.output.key.fields',        :wukong => true
-     Settings.define :key_field_separator,     :jobconf => true, :description => 'map.output.key.field.separator',          :wukong => true
-     Settings.define :partition_fields,        :jobconf => true, :description => 'num.key.fields.for.partition',            :wukong => true
-     Settings.define :output_field_separator,  :jobconf => true, :description => 'stream.map.output.field.separator',       :wukong => true
-     Settings.define :map_speculative,         :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',  :wukong => true
-     Settings.define :timeout,                 :jobconf => true, :description => 'mapred.task.timeout',                     :wukong => true
-     Settings.define :reuse_jvms,              :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',          :wukong => true
-     Settings.define :respect_exit_status,     :jobconf => true, :description => 'stream.non.zero.exit.is.failure',         :wukong => true
-     Settings.define :io_sort_record_percent,  :jobconf => true, :description => 'io.sort.record.percent',                  :wukong => true
-     Settings.define :io_sort_mb,              :jobconf => true, :description => 'io.sort.mb',                              :wukong => true
+     Settings.define :max_node_map_tasks,      :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',    :wukong => true
+     Settings.define :max_node_reduce_tasks,   :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum', :wukong => true
+     Settings.define :map_tasks,               :jobconf => true, :description => 'mapred.map.tasks',                        :wukong => true
+     Settings.define :reduce_tasks,            :jobconf => true, :description => 'mapred.reduce.tasks',                     :wukong => true
+     Settings.define :sort_fields,             :jobconf => true, :description => 'stream.num.map.output.key.fields',        :wukong => true
+     Settings.define :key_field_separator,     :jobconf => true, :description => 'map.output.key.field.separator',          :wukong => true
+     Settings.define :partition_fields,        :jobconf => true, :description => 'num.key.fields.for.partition',            :wukong => true
+     Settings.define :output_field_separator,  :jobconf => true, :description => 'stream.map.output.field.separator',       :wukong => true
+     Settings.define :map_speculative,         :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',  :wukong => true
+     Settings.define :timeout,                 :jobconf => true, :description => 'mapred.task.timeout',                     :wukong => true
+     Settings.define :reuse_jvms,              :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',          :wukong => true
+     Settings.define :respect_exit_status,     :jobconf => true, :description => 'stream.non.zero.exit.is.failure',         :wukong => true
+     Settings.define :io_sort_mb,              :jobconf => true, :description => 'io.sort.mb',                              :wukong => true
+     Settings.define :io_sort_record_percent,  :jobconf => true, :description => 'io.sort.record.percent',                  :wukong => true
+     Settings.define :job_name,                :jobconf => true, :description => 'mapred.job.name',                         :wukong => true
+     Settings.define :max_reduces_per_node,    :jobconf => true, :description => 'mapred.max.reduces.per.node',             :wukong => true
+     Settings.define :max_reduces_per_cluster, :jobconf => true, :description => 'mapred.max.reduces.per.cluster',          :wukong => true
+     Settings.define :max_maps_per_node,       :jobconf => true, :description => 'mapred.max.maps.per.node',                :wukong => true
+     Settings.define :max_maps_per_cluster,    :jobconf => true, :description => 'mapred.max.maps.per.cluster',             :wukong => true
+     Settings.define :max_record_length,       :jobconf => true, :description => 'mapred.linerecordreader.maxlength',       :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
      Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
-     Settings.define :job_name,                :jobconf => true, :description => 'mapred.job.name',                         :wukong => true
-     # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
-     Settings.define :max_reduces_per_node,    :jobconf => true, :description => 'mapred.max.reduces.per.node',             :wukong => true
-     Settings.define :max_reduces_per_cluster, :jobconf => true, :description => 'mapred.max.reduces.per.cluster',          :wukong => true
-     Settings.define :max_maps_per_node,       :jobconf => true, :description => 'mapred.max.maps.per.node',                :wukong => true
-     Settings.define :max_maps_per_cluster,    :jobconf => true, :description => 'mapred.max.maps.per.cluster',             :wukong => true

-     # emit a -jobconf hadoop option if the simplified command line arg is present
-     # if not, the resulting nil will be elided later
-     def jobconf option
-       if options[option]
-         "-jobconf %s=%s" % [options.description_for(option), options[option]]
-       end
+     #
+     # Assemble the hadoop command to execute,
+     # and launch the hadoop runner to execute the script across all tasktrackers
+     #
+     def execute_hadoop_workflow
+       # If there is neither a reducer_klass nor a reduce_command, skip the reduce phase
+       options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+       # Input paths, joined by ','
+       input_paths = @input_paths.join(',')
+       #
+       # Use Settings[:hadoop_home] to set the path to your hadoop install.
+       hadoop_commandline = [
+         hadoop_runner,
+         "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+         hadoop_jobconf_options,
+         "-D mapred.job.name='#{job_name}'",
+         "-mapper  '#{mapper_commandline}'",
+         "-reducer '#{reducer_commandline}'",
+         "-input   '#{input_paths}'",
+         "-output  '#{output_path}'",
+         hadoop_recycle_env,
+         hadoop_other_args,
+       ].flatten.compact.join(" \t\\\n  ")
+       Log.info "  Launching hadoop!"
+       execute_command!(hadoop_commandline)
      end

-     # Define what fields hadoop should treat as the keys
-     def hadoop_sort_args
-       [
+     def hadoop_jobconf_options
+       jobconf_options = []
+       # The fields hadoop should treat as the keys
+       jobconf_options += [
          jobconf(:key_field_separator),
          jobconf(:sort_fields),
        ]
-     end
-
-     # Define what fields hadoop should use to distribute records to reducers
-     def hadoop_partition_args
+       # Fields hadoop should use to distribute records to reducers
        unless options[:partition_fields].blank?
-         [
+         jobconf_options += [
            '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
            jobconf(:output_field_separator),
            jobconf(:partition_fields),
          ]
        end
-     end
-
-     # Emit options for setting the number of mappers and reducers.
-     def hadoop_num_tasks_args
-       [
+       # Set the number of mappers and reducers.
+       jobconf_options += [
          jobconf(:max_node_map_tasks),
          jobconf(:max_node_reduce_tasks),
+         jobconf(:max_reduces_per_node),
+         jobconf(:max_reduces_per_cluster),
+         jobconf(:max_maps_per_node),
+         jobconf(:max_maps_per_cluster),
          jobconf(:map_tasks),
          jobconf(:reduce_tasks)
        ]
+       jobconf_options.flatten.compact
      end

-     def hadoop_other_args input_path, output_path
+     # Emit a -D hadoop option if the simplified command line arg is present;
+     # if not, the resulting nil will be elided later
+     def jobconf option
+       if options[option]
+         "-D %s=%s" % [options.description_for(option), options[option]]
+       end
+     end
+
+     def hadoop_other_args
        extra_str_args  = [ options[:extra_args] ]
        extra_str_args << '-lazyOutput' if options[:noempty]  # don't create a reduce file if there are no records
        options[:reuse_jvms]          = '-1'    if (options[:reuse_jvms] == true)
        options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-       options[:job_name] ||= "#{File.basename(this_script_filename)}---#{input_path}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
-       extra_hsh_args  = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
+       extra_hsh_args  = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
        extra_str_args + extra_hsh_args
      end

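Each simplified flag expands through jobconf into a -D option whose property name is the flag's description; for example (flag values illustrative):

    # given --reduce_tasks=20 --partition_fields=2 on the command line:
    jobconf(:reduce_tasks)      # => "-D mapred.reduce.tasks=20"
    jobconf(:partition_fields)  # => "-D num.key.fields.for.partition=2"
    jobconf(:map_speculative)   # => nil (flag absent; compacted away later)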
@@ -95,29 +118,6 @@ module Wukong
        options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
      end

-     #
-     # Assemble the hadoop command to execute
-     #
-     def hadoop_command input_path, output_path
-       # If this is wrong, create a config/wukong-site.rb or
-       # otherwise set Settings[:hadoop_home] to the
-       # root of your config install.
-       [
-         hadoop_runner,
-         "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
-         hadoop_partition_args,
-         hadoop_sort_args,
-         hadoop_num_tasks_args,
-         "-mapper '#{map_command}'",
-         "-reducer '#{reduce_command}'",
-         "-input '#{input_path}'",
-         "-output '#{output_path}'",
-         hadoop_recycle_env,
-         hadoop_other_args(input_path, output_path),
-       ].flatten.compact.join(" \t\\\n  ")
-     end
-
-
      module ClassMethods
        #
        # Via @pskomoroch via @tlipcon,
@@ -201,6 +201,7 @@ module Wukong
          ENV['stream_map_streamprocessor']
        end
      end
+
      # Standard ClassMethods-on-include trick
      def self.included base
        base.class_eval do
@@ -209,22 +210,3 @@ module Wukong
        end
      end
    end
-
-
-   # -inputformat <name of inputformat (class)> (“auto” by default)
-   # -input <additional DFS input path>
-   # -python <python command to use on nodes> (“python” by default)
-   # -name <job name> (“program.py” by default)
-   # -numMapTasks <number>
-   # -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
-   # -priority <priority value> (“NORMAL” by default)
-   # -libjar <path to jar> (this jar gets put in the class path)
-   # -libegg <path to egg> (this egg gets put in the Python path)
-   # -file <local file> (this file will be put in the dir where the python program gets executed)
-   # -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
-   # -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
-   # -cmdenv <env var name>=<value>
-   # -jobconf <property name>=<value>
-   # -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
-   # -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
-   # -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
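For reference, the streaming invocation that execute_hadoop_workflow (above) assembles comes out roughly like this (the paths, task count, and job name are illustrative):

    /usr/lib/hadoop/bin/hadoop \
      jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
      -D stream.num.map.output.key.fields=2 \
      -D mapred.reduce.tasks=20 \
      -D mapred.job.name='my_script.rb---data/in---data/out' \
      -mapper  '/usr/bin/ruby my_script.rb --map' \
      -reducer '/usr/bin/ruby my_script.rb --reduce' \
      -input   'data/in' \
      -output  'data/out'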
data/lib/wukong/script/local_command.rb CHANGED
@@ -1,21 +1,31 @@
  module Wukong
+   #
+   # Local execution Options
+   #
    module LocalCommand

-     # ===========================================================================
-     #
-     # Local execution Options
-     #
+     def execute_local_workflow
+       Log.info "  Reading STDIN / Writing STDOUT"
+       execute_command!(local_commandline)
+     end

      # Program, including args, to sort input between mapper and reducer in local
      # mode. You could override this to run, say, 'sort -n' (numeric sort) instead.
-     def sort_command
+     def local_mode_sort_commandline
        'sort'
      end

-     def local_command input_path, output_path
-       cmd_input_str  = (input_path == '-') ? "" : "cat '#{input_path}' | "
+     #
+     # Commandline string to execute the job in local mode
+     #
+     # With an input path of '-', just uses $stdin
+     # With an output path of '-', just uses $stdout
+     #
+     def local_commandline
+       @input_paths   = input_paths.map(&:strip).join("' '")
+       cmd_input_str  = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
        cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
-       %Q{ #{cmd_input_str} #{map_command} | #{sort_command} | #{reduce_command} #{cmd_output_str} }
+       %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
      end

    end
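In local mode the whole job collapses to a plain shell pipeline; with illustrative paths, local_commandline comes out as:

    cat 'data/input.tsv' | /usr/bin/ruby my_script.rb --map | sort | /usr/bin/ruby my_script.rb --reduce > 'data/output.tsv'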
data/lib/wukong/script.rb CHANGED
@@ -4,7 +4,6 @@ require 'wukong/script/local_command'
  require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
  require 'rbconfig'    # for uncovering ruby_interpreter_path
  module Wukong
-
    # == How to run a Wukong script
    #
    #   your/script.rb --run path/to/input_files path/to/output_dir
@@ -29,6 +28,13 @@ module Wukong
    # To use more than one file as input, you can use normal * ? [] wildcards or
    # give a comma-separated list -- see the hadoop documentation for syntax.
    #
+   # == Run in Elastic MapReduce Mode (--run=emr)
+   #
+   # Wukong can also launch your script on Amazon's Elastic MapReduce cloud:
+   #
+   # * it copies the script to S3 twice, once as the mapper and once as the reducer
+   # * it then starts the job through the elastic-mapreduce runner given by --emr_runner
+   #
    # == Run locally (--run=local)
    #
    # To run your script locally, use --run=local
@@ -57,7 +63,8 @@ module Wukong
    class Script
      include Wukong::HadoopCommand
      include Wukong::LocalCommand
-     attr_accessor :mapper_klass, :reducer_klass, :options
+     attr_reader   :mapper_klass, :reducer_klass, :options
+     attr_reader   :input_paths,  :output_path

      # ---------------------------------------------------------------------------
      #
@@ -79,18 +86,14 @@ module Wukong
      # thus, requiring a working hadoop install), or to run in local mode
      # (script --map | sort | script --reduce)
      #
-     Settings.define :default_run_mode, :default => 'hadoop',   :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
-     Settings.define :default_mapper,   :default => '/bin/cat', :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
-     Settings.define :default_reducer,  :default => '/bin/cat', :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
-     Settings.define :map_command,      :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
-     Settings.define :hadoop_home,      :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
-     Settings.define :hadoop_runner,    :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
-     Settings.define :map,              :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
-     Settings.define :reduce,           :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
-     Settings.define :run,              :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
-     Settings.define :local,            :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
-     Settings.define :hadoop,           :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
-     Settings.define :dry_run,          :description => "echo the command that will be run, but don't run it", :wukong => true
+     Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
+     Settings.define :map_command,      :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
+     Settings.define :reduce_command,   :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
+     Settings.define :run,              :description => "run the script's workflow: specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud.", :wukong => true
+     Settings.define :map,              :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
+     Settings.define :reduce,           :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
+     Settings.define :dry_run,          :description => "echo the command that will be run, but don't run it", :wukong => true
+     Settings.define :rm,               :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true

      #
      # Instantiate the Script with the Mapper and the Reducer class (each a
@@ -120,25 +123,46 @@ module Wukong
      #   MyScript.new(MyMapper, nil).run
      #
      def initialize mapper_klass, reducer_klass=nil, extra_options={}
-       self.options = Settings.dup
-       self.options.resolve!
-       self.options.merge! self.default_options
-       self.options.merge! extra_options
-       self.mapper_klass  = mapper_klass
-       self.reducer_klass = reducer_klass
-       # If no reducer_klass and no reduce_command, then skip the reduce phase
-       options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+       Settings.resolve!
+       @options = Settings.dup
+       options.merge! extra_options
+       @mapper_klass  = mapper_klass
+       @reducer_klass = reducer_klass
+       @output_path   = options.rest.pop
+       @input_paths   = options.rest.reject(&:blank?)
+       if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
+         raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
+       end
      end

      #
-     # Gives default options. Command line parameters take precedence
-     #
-     # MAKE SURE YOU CALL SUPER: write your script according to the pattern
-     #
-     #   super.merge :my_option => :val
+     # In --run mode, use the framework (local, hadoop, emr, etc) to re-launch
+     # the script as mapper, reducer, etc.
+     # If --map or --reduce, dispatch to the mapper or reducer.
      #
-     def default_options
-       {}
+     def run
+       case run_mode
+       when 'map'              then mapper_klass.new(self.options).stream
+       when 'reduce'           then reducer_klass.new(self.options).stream
+       when 'local'            then execute_local_workflow
+       when 'hadoop', 'mapred' then execute_hadoop_workflow
+       when 'emr'
+         require 'wukong/script/emr_command'
+         execute_emr_workflow
+       else dump_help
+       end
+     end
+
+     # If only --run is given, assume the default run mode
+     def run_mode
+       case
+       when options[:map]           then 'map'
+       when options[:reduce]        then 'reduce'
+       when ($0 =~ /-mapper\.rb$/)  then 'map'
+       when ($0 =~ /-reducer\.rb$/) then 'reduce'
+       when (options[:run] == true) then options[:default_run_mode]
+       else options[:run].to_s
+       end
      end

      #
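A quick trace of the new dispatch, for a hypothetical wordcount.rb:

    # wordcount.rb --map                  -> run_mode == 'map'  (streams the mapper on STDIN/STDOUT)
    # wordcount-mapper.rb                 -> 'map'              ($0 matches /-mapper\.rb$/, as staged by EMR)
    # wordcount.rb --run in.tsv out.tsv   -> 'hadoop'           (the :default_run_mode)
    # wordcount.rb --run=local in out     -> 'local'
    # wordcount.rb in.tsv out.tsv         -> dump_help          (no mode given)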
@@ -146,11 +170,11 @@ module Wukong
      # In hadoop mode, this is given to the hadoop streaming command.
      # In local mode, it's given to the system() call
      #
-     def map_command
+     def mapper_commandline
        if mapper_klass
-         "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+         "#{ruby_interpreter_path} #{this_script_filename} --map    " + non_wukong_params
        else
-         options[:map_command] || options[:default_mapper]
+         options[:map_command]
        end
      end

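mapper_commandline either relaunches this script with --map (passing along any non-wukong flags) or falls back to --map_command; illustrative results (the path and the flag --my_flag are hypothetical):

    mapper_commandline   # => "/usr/bin/ruby /path/to/wordcount.rb --map    --my_flag=3"
    # with no mapper class and --map_command='cut -f2':
    mapper_commandline   # => "cut -f2"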
@@ -159,7 +183,7 @@ module Wukong
      # In hadoop mode, this is given to the hadoop streaming command.
      # In local mode, it's given to the system() call
      #
-     def reduce_command
+     def reducer_commandline
        if reducer_klass
          "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
        else
167
191
  end
168
192
  end
169
193
 
194
+ def job_name
195
+ options[:job_name] ||
196
+ "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
197
+ end
198
+
199
+
200
+ protected
201
+
170
202
  #
171
- # Shell command to re-run in mapreduce mode using --map and --reduce
203
+ # Execute the runner phase:
204
+ # use the running framework to relaunch the script in map and in reduce mode
172
205
  #
173
- def runner_command input_path, output_path
174
- # run as either local or hadoop
175
- case run_mode
176
- when 'local'
177
- $stderr.puts " Reading STDIN / Writing STDOUT"
178
- command = local_command input_path, output_path
179
- when 'hadoop', 'mapred'
180
- $stderr.puts " Launching hadoop as"
181
- command = hadoop_command input_path, output_path
206
+ def execute_command! *args
207
+ command = args.flatten.compact.join(" \\\n ")
208
+ Log.info "Running\n\n#{command}\n"
209
+ if options[:dry_run]
210
+ Log.info '== [Not running preceding command: dry run] =='
182
211
  else
183
- raise "Need to use --run=local or --run=hadoop; or to use the :default_run_mode in config.yaml just say --run "
212
+ maybe_overwrite_output_paths! output_path
213
+ $stdout.puts `#{command}`
184
214
  end
185
215
  end
186
216
 
187
- def run_mode
188
- return 'local' if options[:local]
189
- return 'hadoop' if options[:hadoop]
190
- # if only --run is given, assume default run mode
191
- options[:run] = options[:default_run_mode] if (options[:run] == true)
192
- options[:run].to_s
193
- end
194
-
195
- def input_output_paths
196
- output_path = options.rest.pop
197
- input_paths = options.rest.reject(&:blank?)
198
- raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
199
- [input_paths, output_path]
200
- end
201
-
217
+ #
218
+ # In hadoop mode only, removes the destination path before launching
219
+ #
220
+ # To the panic-stricken: look in .Trash/current/path/to/accidentally_deleted_files
221
+ #
202
222
  def maybe_overwrite_output_paths! output_path
203
- if (options[:overwrite] || options[:rm]) && (run_mode != 'local')
204
- $stderr.puts "Removing output file #{output_path}"
223
+ if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
224
+ Log.info "Removing output file #{output_path}"
205
225
  `hdp-rm -r '#{output_path}'`
206
226
  end
207
227
  end
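Absent an explicit --job_name, the generated name squeezes out anything shell-unsafe; for a hypothetical wordcount.rb reading data/in and writing data/out:

    job_name  # => "wordcount.rb---data/in---data/out"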
@@ -222,39 +242,15 @@ module Wukong

      # use the full ruby interpreter path to run slave processes
      def ruby_interpreter_path
-       Pathname.new(
-         File.join(Config::CONFIG["bindir"],
-           Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])
-       ).realpath
-     end
-
-     #
-     # Execute the runner phase
-     #
-     def exec_hadoop_streaming
-       $stderr.puts "Streaming on self"
-       input_path, output_path = input_output_paths
-       command = runner_command(input_path, output_path)
-       $stderr.puts command
-       unless options[:dry_run]
-         maybe_overwrite_output_paths! output_path
-         $stdout.puts `#{command}`
-       end
+       Pathname.new(File.join(
+         Config::CONFIG["bindir"],
+         Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])).realpath
      end

      #
-     # If --map or --reduce, dispatch to the mapper or reducer.
-     # Otherwise,
+     # Usage
      #
-     def run
-       case
-       when options[:map]
-         mapper_klass.new(self.options).stream
-       when options[:reduce]
-         reducer_klass.new(self.options).stream
-       when options[:run]
-         exec_hadoop_streaming
-       else
+     def dump_help
        options.dump_help %Q{Please specify a run mode: you probably want to start with
    #{$0} --run --local input.tsv output.tsv
  although
@@ -262,8 +258,7 @@ module Wukong
  or
      cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
  can be useful for initial testing.}
-       end
      end
-   end

+   end
  end
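Taken together, a minimal script under the reworked API needs only a mapper class and the dispatching run call; a sketch (the class body is hypothetical, in the style of the wukong examples):

    # wordcount.rb
    require 'wukong'

    class Mapper < Wukong::Streamer::LineStreamer
      # called once per input line; whatever is yielded is emitted as a record
      def process line
        yield [line.split.length, line]
      end
    end

    Wukong::Script.new(Mapper, nil).run  # dispatches on --map / --reduce / --run

With a nil reducer and no --reduce_command, the reduce phase is skipped outright: execute_hadoop_workflow forces reduce_tasks to 0.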