wukong 1.5.3 → 1.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. data/CHANGELOG.textile +4 -0
  2. data/bin/hdp-bin +44 -0
  3. data/bin/hdp-ls +2 -1
  4. data/docpages/avro/performance.textile +36 -0
  5. data/examples/cassandra_streaming/avromapper.rb +85 -0
  6. data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
  7. data/examples/cassandra_streaming/cassandra.avpr +468 -0
  8. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
  9. data/examples/cassandra_streaming/catter.sh +45 -0
  10. data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
  11. data/examples/cassandra_streaming/client_schema.avpr +211 -0
  12. data/examples/cassandra_streaming/client_schema.textile +318 -0
  13. data/examples/cassandra_streaming/foofile.avr +0 -0
  14. data/examples/cassandra_streaming/pymap.sh +1 -0
  15. data/examples/cassandra_streaming/pyreduce.sh +1 -0
  16. data/examples/cassandra_streaming/smutation.avpr +188 -0
  17. data/examples/cassandra_streaming/streamer.sh +51 -0
  18. data/examples/cassandra_streaming/struct_loader.rb +24 -0
  19. data/examples/cassandra_streaming/tuning.textile +73 -0
  20. data/examples/emr/README-elastic_map_reduce.textile +26 -0
  21. data/examples/emr/dot_wukong_dir/credentials.json +7 -0
  22. data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
  23. data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
  24. data/examples/emr/elastic_mapreduce_example.rb +1 -0
  25. data/lib/wukong/encoding/asciize.rb +108 -0
  26. data/lib/wukong/extensions/date_time.rb +33 -7
  27. data/lib/wukong/extensions/emittable.rb +12 -25
  28. data/lib/wukong/extensions/hash_like.rb +13 -6
  29. data/lib/wukong/filename_pattern.rb +8 -7
  30. data/lib/wukong/schema.rb +47 -0
  31. data/lib/wukong/script.rb +7 -0
  32. data/lib/wukong/script/cassandra_loader_script.rb +40 -0
  33. data/lib/wukong/script/emr_command.rb +74 -43
  34. data/lib/wukong/script/hadoop_command.rb +89 -72
  35. data/lib/wukong/store.rb +2 -7
  36. data/lib/wukong/store/cassandra.rb +10 -0
  37. data/lib/wukong/store/cassandra/streaming.rb +75 -0
  38. data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
  39. data/lib/wukong/store/cassandra_model.rb +90 -0
  40. data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
  41. data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
  42. data/wukong.gemspec +32 -4
  43. metadata +33 -14
data/lib/wukong/extensions/emittable.rb
@@ -1,4 +1,3 @@
-
 Object.class_eval do
   def to_flat() [to_s] end
 end
@@ -54,29 +53,17 @@ Hash.class_eval do
   end
 end
 
-class Time
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d%H%M%S"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
-  end
-end
-
-class Date
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
-  end
-end
-
-class DateTime < Date
-  # strftime() format to flatten a date
-  FLAT_FORMAT = "%Y%m%d%H%M%S"
-  # Flatten
-  def to_flat
-    strftime(FLAT_FORMAT)
+class Integer
+  #
+  # Express boolean as 1 (true) or 0 (false). In contravention of typical ruby
+  # semantics (but in a way that is more robust for wukong-like batch
+  # processing), the number 0, the string '0', nil and false are all considered
+  # false. (This also makes the method idempotent: repeated calls give same result.)
+  #
+  def self.unbooleanize bool
+    case bool
+    when 0, '0', false, nil then 0
+    else 1
+    end
   end
 end
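
The new Integer.unbooleanize collapses truthiness to 0/1; a quick sketch of its behavior (runnable once the class above is loaded):

    Integer.unbooleanize(false)  # => 0
    Integer.unbooleanize('0')    # => 0
    Integer.unbooleanize(nil)    # => 0
    Integer.unbooleanize(0)      # => 0
    Integer.unbooleanize(true)   # => 1
    Integer.unbooleanize(17)     # => 1
    Integer.unbooleanize(Integer.unbooleanize('0'))  # => 0 -- idempotent, as the comment promises
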
data/lib/wukong/extensions/hash_like.rb
@@ -103,16 +103,23 @@ module Wukong
     # otherwise they must be uniformly strings
     #
     def from_hash(hsh, has_symbol_keys=false)
-      keys = self.keys
-      keys = keys.map(&:to_sym) if has_symbol_keys
-      self.new *hsh.values_of(*keys)
+      extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
+      self.new *hsh.values_of(*extract_keys)
     end
     #
     # The last portion of the class in underscored form
-    # note memoization
+    # memoized
     #
-    def self.resource_name
-      @resource_name ||= self.to_s.gsub(%r{.*::}, '').underscore.to_sym
+    def resource_name
+      @resource_name ||= self.class_basename.underscore.to_sym
+    end
+    # The last portion of the class name
+    # memoized
+    #
+    # @example
+    #   This::That::TheOther.new.class_basename # => TheOther
+    def class_basename
+      @class_basename ||= self.to_s.gsub(%r{.*::}, '')
     end
   end
 
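
A rough illustration of the from_hash change (the Point struct is invented, the mixin name is assumed from this file's path, and Hash#values_of is wukong's values_at-style extension):

    Point = Struct.new(:x, :y)
    Point.send(:include, Wukong::HashLike)        # assumed module name
    Point.from_hash({ 'x' => 1, 'y' => 2 })       # string keys -- the default
    Point.from_hash({ :x => 1, :y => 2 }, true)   # symbol keys must now be flagged
    # => #<struct Point x=1, y=2> either way; keys are normalized with to_s/to_sym explicitly
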
data/lib/wukong/filename_pattern.rb
@@ -16,12 +16,12 @@ module Wukong
     # walk through pattern, replacing tokens (eg :time or :pid) with the
     # corresponding value.
     #
+    # Don't use ':' in a pattern except to introduce a token
+    # and separate tokens with '-', '+' '/' or '.'
+    #
     def make token_vals={}
       token_vals = token_val_defaults.merge token_vals
       token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
-      # CHH_NOTE: The following is broken for patterns that need a ":" or
-      # patterns that need text following a token with no special chars in
-      # between.
       val = pattern.gsub(/:(\w+)/){ replace($1, token_vals) }
       val
     end
@@ -39,7 +39,7 @@ module Wukong
       case token
       when :pid           then pid
       when :hostname      then hostname
-      when :handle then token_vals[:handle]
+      when :handle        then token_vals[:handle]
       when :handle_prefix then token_vals[:handle].to_s[0..5]
       when :timestamp     then token_vals[:timestamp]
       when :date          then token_vals[:timestamp][ 0..7]
@@ -56,7 +56,7 @@ module Wukong
 
     # Memoized: the hostname for the machine running this script.
     def hostname
-      @hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
+      @hostname ||= ENV['HOSTNAME'] || `hostname`.chomp
     end
     # Memoized: the Process ID for this invocation.
     def pid
@@ -64,9 +64,10 @@ module Wukong
     end
 
     # Characters deemed safe in a filename;
-    SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/\;'
+    SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/'
+    RE_SAFE_FILENAME = %r{[^#{SAFE_CHARS}]+}moxi
     def self.sanitize str
-      str.gsub(%r{[^#{SAFE_CHARS}]+}, '-')
+      str.gsub(RE_SAFE_FILENAME, '-')
     end
 
   end
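
The token substitution in #make is ultimately one gsub over the pattern; here is a standalone sketch of the mechanism (pattern and values invented, not wukong's API):

    pattern    = ':handle/:date/:handle-:timestamp.tsv'
    token_vals = { :handle => 'twitter_search', :timestamp => '20101101123456' }
    token_vals[:date] = token_vals[:timestamp][0..7]  # :date is the first 8 chars of :timestamp
    puts pattern.gsub(/:(\w+)/){ token_vals[$1.to_sym] }
    # => twitter_search/20101101/twitter_search-20101101123456.tsv
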
data/lib/wukong/schema.rb
@@ -50,6 +50,35 @@ class << Yaml       ; def to_pig() 'chararray' end ; end if defined?(Yaml)
 class << Json       ; def to_pig() 'chararray' end ; end if defined?(Json)
 class << Regex      ; def to_pig() 'chararray' end ; end if defined?(Regex)
 
+
+#
+# Basic types: Avro conversion
+#
+class << Integer    ; def to_avro() 'int'       end ; end
+class << Bignum     ; def to_avro() 'long'      end ; end
+class << Float      ; def to_avro() 'float'     end ; end
+class << Symbol     ; def to_avro() 'string'    end ; end
+class << Date       ; def to_avro() 'long'      end ; end
+class << Time       ; def to_avro() 'long'      end ; end
+class << DateTime   ; def to_avro() 'long'      end ; end
+class << String     ; def to_avro() 'string'    end ; end
+class << Text       ; def to_avro() 'string'    end ; end if defined?(Text)
+class << Blob       ; def to_avro() 'bytearray' end ; end if defined?(Blob)
+class << Boolean    ; def to_avro() 'bytearray' end ; end if defined?(Boolean)
+class String        ; def to_avro() self.to_s ; end ; end
+class Symbol        ; def to_avro() self.to_s ; end ; end
+
+class << BigDecimal ; def to_avro() 'long'      end ; end if defined?(BigDecimal)
+class << EpochTime  ; def to_avro() 'integer'   end ; end if defined?(EpochTime)
+class << FilePath   ; def to_avro() 'string'    end ; end if defined?(FilePath)
+class << Flag       ; def to_avro() 'string'    end ; end if defined?(Flag)
+class << IPAddress  ; def to_avro() 'string'    end ; end if defined?(IPAddress)
+class << URI        ; def to_avro() 'string'    end ; end if defined?(URI)
+class << Csv        ; def to_avro() 'string'    end ; end if defined?(Csv)
+class << Yaml       ; def to_avro() 'string'    end ; end if defined?(Yaml)
+class << Json       ; def to_avro() 'string'    end ; end if defined?(Json)
+class << Regex      ; def to_avro() 'string'    end ; end if defined?(Regex)
+
 module Wukong
   #
   # Export model's structure for loading and manipulating in other frameworks,
@@ -208,6 +237,24 @@ module Wukong
       str.join("\n")
     end
 
+
+
+
+    #
+    # Avro
+    #
+    def to_avro
+      require 'json' # yikes
+      h = {}
+      h[:name]   = self.name
+      h[:type]   = "record"
+      h[:fields] = []
+      members.zip(mtypes).each do |member, type|
+        h[:fields] << {:name => member.to_s, :type => type.to_avro}
+      end
+      h.to_json
+    end
+
   end
   # standard stanza for making methods appear on the class itself on include
   def self.included base
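
Given the type mappings above, to_avro renders a model as an Avro record schema in JSON. A sketch for a hypothetical model (class, fields, and mixin name invented; members/mtypes are the accessors this file already uses):

    # class TwitterUser < Struct.new(:id, :screen_name) ; include Wukong::Schema ; end
    # members == [:id, :screen_name], mtypes == [Integer, String]
    TwitterUser.to_avro
    # => '{"name":"TwitterUser","type":"record","fields":[
    #       {"name":"id","type":"int"},{"name":"screen_name","type":"string"}]}'
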
data/lib/wukong/script.rb
@@ -145,6 +145,7 @@ module Wukong
       when 'map'              then mapper_klass.new(self.options).stream
       when 'reduce'           then reducer_klass.new(self.options).stream
       when 'local'            then execute_local_workflow
+      when 'cassandra'        then execute_hadoop_workflow
       when 'hadoop', 'mapred' then execute_hadoop_workflow
       when 'emr'
         require 'wukong/script/emr_command'
@@ -196,6 +197,12 @@ module Wukong
       "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
     end
 
+    # Wrapper for dangerous operations to catch errors
+    def safely action, &block
+      begin
+        block.call
+      rescue StandardError => e ; handle_error(action, e); end
+    end
 
     protected
 
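
A sketch of how the new #safely wrapper reads at a call site (the S3 copy is illustrative, not from this diff):

    safely("copying script to s3") do
      S3Util.store(this_script_filename, mapper_s3_uri)
    end
    # any StandardError raised in the block becomes
    # handle_error("copying script to s3", err) rather than aborting the run
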
data/lib/wukong/script/cassandra_loader_script.rb (new file)
@@ -0,0 +1,40 @@
+Settings.define :cassandra_keyspace,   :required => true, :description => "The keyspace to bulk load"
+Settings.define :cassandra_col_family, :required => true, :description => "The column family to bulk load"
+Settings.define :cassandra_home,       :env_var => 'CASSANDRA_HOME', :default => '/usr/local/share/cassandra'
+
+module Wukong
+  class CassandraScript < Wukong::Script
+    def hadoop_other_args *args
+      opts = super(*args)
+      opts << "-D stream.map.output=\'cassandra_avro_output\'"
+      opts << "-D stream.io.identifier.resolver.class=\'org.apache.cassandra.hadoop.streaming.AvroResolver\'"
+      opts << "-D cassandra.output.keyspace=\'#{Settings.cassandra_keyspace}\'"
+      opts << "-D cassandra.output.columnfamily=\'#{Settings.cassandra_col_family}\'"
+      opts << "-D cassandra.partitioner.class=\'org.apache.cassandra.dht.RandomPartitioner\'"
+      opts << "-D cassandra.thrift.address=\'#{[Settings.cassandra_hosts].flatten.map{|s| s.gsub(/:.*/, '')}.join(",")}\'"
+      opts << "-D cassandra.thrift.port=\'9160\'"
+      # opts << "-D mapreduce.output.columnfamilyoutputformat.batch.threshold=\'1024\'"
+      # ORDER MATTERS
+      opts << "-libjars \'#{cassandra_jars}\'"
+      opts << "-file \'#{avro_schema}\'"
+      opts << "-outputformat \'org.apache.cassandra.hadoop.ColumnFamilyOutputFormat\'"
+      opts
+    end
+
+    #
+    # Return paths to cassandra jars as a string
+    #
+    def cassandra_jars
+      jars = []
+      Dir["#{Settings.cassandra_home}/build/apache-cassandra*.jar", "#{Settings.cassandra_home}/build/lib/jars/*.jar", "#{Settings.cassandra_home}/lib/*.jar"].each do |jar|
+        jars << jar
+      end
+      jars.join(',')
+    end
+
+    def avro_schema
+      File.join(Settings.cassandra_home, "interface/avro/cassandra.avpr")
+    end
+
+  end
+end
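
Combined with the new 'cassandra' run mode in script.rb above, a loader built on CassandraScript would be launched roughly like this (the example script ships in this gem's examples directory; keyspace and column-family values are invented):

    ruby examples/cassandra_streaming/struct_loader.rb --run=cassandra \
        --cassandra_keyspace=Twitter --cassandra_col_family=Users \
        input_path output_path
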
data/lib/wukong/script/emr_command.rb
@@ -1,16 +1,26 @@
 require 'right_aws'
 require 'configliere/config_block'
-Settings.read(File.expand_path('~/.wukong/emr.yaml'))
+#
+EMR_CONFIG_DIR = '~/.wukong' unless defined?(EMR_CONFIG_DIR)
+#
 Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
 Settings.define :access_key,           :description => 'AWS Access key',        :env_var => 'AWS_ACCESS_KEY_ID'
 Settings.define :secret_access_key,    :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
 Settings.define :emr_runner,           :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
-Settings.define :emr_root,             :description => 'S3 url to use as the base for Elastic MapReduce storage'
-Settings.define :key_pair_file,        :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) if Settings.key_pair_file }
-Settings.define :key_pair,             :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
-Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
+Settings.define :emr_root,             :description => 'S3 bucket and path to use as the base for Elastic MapReduce storage, organized by job name'
+Settings.define :emr_data_root,        :description => 'Optional '
+Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for Elastic Map Reduce machine provisioning', :default => EMR_CONFIG_DIR+'/emr_bootstrap.sh', :type => :filename, :finally => lambda{ Settings.emr_bootstrap_script = File.expand_path(Settings.emr_bootstrap_script) }
+Settings.define :emr_extra_args,       :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
+Settings.define :alive,                :description => 'Whether to keep machine running after job invocation', :type => :boolean
+#
+Settings.define :keypair_file,         :description => 'AWS Key pair file', :type => :filename
+Settings.define :keypair,              :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
+Settings.define :instance_type,        :description => 'AWS instance type to use', :default => 'm1.small'
 Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
-Settings.define :jobflow
+Settings.define :jobflow,              :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
+#
+Settings.read(File.expand_path(EMR_CONFIG_DIR+'/emr.yaml'))
+
 module Wukong
   #
   # EMR Options
@@ -26,39 +36,46 @@ module Wukong
       Log.info "  Copying this script to the cloud."
       S3Util.store(this_script_filename, mapper_s3_uri)
       S3Util.store(this_script_filename, reducer_s3_uri)
-      S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
+      S3Util.store(File.expand_path(Settings.emr_bootstrap_script), bootstrap_s3_uri)
+    end
+
+    def copy_jars_to_cloud
+      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
+      # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
+    end
+
+    def hadoop_options_for_emr_runner
+      [hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
     end
 
     def execute_emr_runner
       command_args = []
-      command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
-      command_args += emr_credentials
       if Settings.jobflow
         command_args << Settings.dashed_flag_for(:jobflow)
       else
-        command_args << Settings.dashed_flag_for(:alive)
         command_args << "--create --name=#{job_name}"
-        command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type).join(' ')
+        command_args << Settings.dashed_flag_for(:alive)
+        command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
+        command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
+        command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
       end
+      command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
+      command_args += emr_credentials
      command_args += [
-        "--bootstrap-action=#{bootstrap_s3_uri}",
         "--log-uri=#{log_s3_uri}",
         "--stream",
         "--mapper=#{mapper_s3_uri} ",
         "--reducer=#{reducer_s3_uri} ",
-        "--input=#{input_paths} --output=#{output_path}",
-        # to specify zero reducers:
-        # "--arg '-D mapred.reduce.tasks=0'"
+        "--input=#{input_paths.join(",")} --output=#{output_path}",
       ]
+      # eg to specify zero reducers:
+      #   Settings[:emr_extra_args] = "--arg '-D mapred.reduce.tasks=0'"
+      command_args += Settings[:emr_extra_args] unless Settings[:emr_extra_args].blank?
+      command_args += hadoop_options_for_emr_runner
       Log.info 'Follow along at http://localhost:9000/job'
       execute_command!( File.expand_path(Settings.emr_runner), *command_args )
     end
 
-    def emr_ship_jars
-      S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
-      # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
-    end
-
     def emr_credentials
       command_args = []
       if Settings.emr_credentials_file
@@ -66,7 +83,6 @@ module Wukong
       else
         command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
       end
-      command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
       command_args
     end
 
@@ -75,58 +91,73 @@ module Wukong
       File.basename($0,'.rb')
     end
 
+    # Produces an s3 URI within the Wukong emr sandbox from a set of path
+    # segments
+    #
+    # @example
+    #   Settings.emr_root = 's3://emr.yourmom.com/wukong'
+    #   emr_s3_path('log', 'my_happy_job', 'run-97.log')
+    #   # => "s3://emr.yourmom.com/wukong/log/my_happy_job/run-97.log"
+    #
+    def emr_s3_path *path_segs
+      File.join(Settings.emr_root, path_segs.flatten.compact)
+    end
+
     def mapper_s3_uri
-      emr_s3_path(job_handle+'-mapper.rb')
+      emr_s3_path(job_handle, 'code', job_handle+'-mapper.rb')
     end
     def reducer_s3_uri
-      emr_s3_path(job_handle+'-reducer.rb')
+      emr_s3_path(job_handle, 'code', job_handle+'-reducer.rb')
     end
     def log_s3_uri
-      emr_s3_path('log', job_handle)
+      emr_s3_path(job_handle, 'log', 'emr_jobs')
     end
     def bootstrap_s3_uri
-      emr_s3_path('bin', "bootstrap-#{job_handle}.sh")
+      emr_s3_path(job_handle, 'bin', "emr_bootstrap.sh")
     end
     def wukong_libs_s3_uri
-      emr_s3_path('bin', "wukong-libs.jar")
-    end
-
-    def emr_s3_path *path_segs
-      File.join(Settings.emr_root, path_segs.flatten.compact)
+      emr_s3_path(job_handle, 'code', "wukong-libs.jar")
     end
 
-    module ClassMethods
-
-      # Standard hack to create ClassMethods-on-include
-      def self.included base
-        base.class_eval do
-          extend ClassMethods
-        end
+    ABSOLUTE_URI = %r{^/|^\w+://}
+    #
+    # Walk through the input paths and the output path. Prepends
+    # Settings.emr_data_root to any that does NOT look like
+    # an absolute path ("/foo") or a URI ("s3://yourmom/data")
+    #
+    def fix_paths!
+      return if Settings.emr_data_root.blank?
+      unless input_paths.blank?
+        @input_paths = input_paths.map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
+      end
+      unless output_path.blank?
+        @output_path = [output_path].map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
       end
     end
 
+    #
+    # Simple class to coordinate s3 operations
+    #
     class S3Util
      # class methods
      class << self
        def s3
          @s3 ||= RightAws::S3Interface.new(
            Settings.access_key, Settings.secret_access_key,
-            {:multi_thread => true, :logger => Log})
+            {:multi_thread => true, :logger => Log, :port => 80, :protocol => 'http' })
        end
-
        def bucket_and_path_from_uri uri
          uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
        end
-
        def store filename, uri
-          Log.debug "  #{filename} => #{uri}"
          dest_bucket, dest_key = bucket_and_path_from_uri(uri)
-          contents = File.open(filename)
+          Log.debug "  #{filename} => #{dest_bucket} / #{dest_key}"
+          contents = File.read(filename)
          s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
        end
-
      end
    end
+
  end
 Script.class_eval do
   include EmrCommand
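
To make fix_paths! concrete (values invented; the bucket reuses the yourmom example from the comments above):

    Settings.emr_data_root = 's3://emr.yourmom.com/wukong/data'
    # --input ripd/twitter          => s3://emr.yourmom.com/wukong/data/ripd/twitter
    # --input /fixed/path           => unchanged (matches ABSOLUTE_URI: leading '/')
    # --input s3://elsewhere/input  => unchanged (matches ABSOLUTE_URI: scheme://)
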
data/lib/wukong/script/hadoop_command.rb
@@ -32,16 +32,28 @@ module Wukong
    Settings.define :max_maps_per_node,    :jobconf => true, :description => 'mapred.max.maps.per.node',    :wukong => true
    Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
    Settings.define :max_record_length,    :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
-    Settings.define :min_input_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
+    Settings.define :min_split_size,       :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
    Settings.define :noempty,              :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
+    Settings.define :split_on_xml_tag,     :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
+
+    # emit a -jobconf hadoop option if the simplified command line arg is present
+    # if not, the resulting nil will be elided later
+    def jobconf option
+      if options[option]
+        # "-jobconf %s=%s" % [options.description_for(option), options[option]]
+        "-D %s=%s" % [options.description_for(option), options[option]]
+      end
+    end
 
    #
    # Assemble the hadoop command to execute
    # and launch the hadoop runner to execute the script across all tasktrackers
    #
+    # FIXME: Should add some simple logic to ensure that commands are in the
+    # right order or hadoop will complain. ie. -D options MUST come before
+    # others
+    #
    def execute_hadoop_workflow
-      # If no reducer_klass and no reduce_command, then skip the reduce phase
-      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
      # Input paths join by ','
      input_paths = @input_paths.join(',')
      #
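
The relocated #jobconf helper turns a simplified wukong flag into a hadoop -D option via the option's :description. A sketch (the --map_tasks value is invented; :map_tasks is one of the :jobconf => true settings defined in this file):

    # with --map_tasks=20 on the command line:
    jobconf(:map_tasks)     # => "-D mapred.map.tasks=20"
    # with the flag absent:
    jobconf(:reduce_tasks)  # => nil -- compacted away in hadoop_jobconf_options
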
@@ -49,14 +61,14 @@ module Wukong
      hadoop_commandline = [
        hadoop_runner,
        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+        hadoop_jobconf_options,
+        "-D mapred.job.name='#{job_name}'",
+        hadoop_other_args,
        "-mapper  '#{mapper_commandline}'",
        "-reducer '#{reducer_commandline}'",
        "-input   '#{input_paths}'",
        "-output  '#{output_path}'",
-        hadoop_jobconf_options,
-        "-jobconf mapred.job.name='#{job_name}'",
        hadoop_recycle_env,
-        hadoop_other_args,
      ].flatten.compact.join(" \t\\\n  ")
      Log.info "  Launching hadoop!"
      execute_command!(hadoop_commandline)
@@ -64,48 +76,40 @@ module Wukong
 
    def hadoop_jobconf_options
      jobconf_options = []
-      # The fields should hadoop treat as the keys
-      jobconf_options += [
-        jobconf(:key_field_separator),
-        jobconf(:sort_fields),
-      ]
+      # Fixup these options
+      options[:reuse_jvms]          = '-1'    if (options[:reuse_jvms] == true)
+      options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
+      # If no reducer_klass and no reduce_command, then skip the reduce phase
+      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
      # Fields hadoop should use to distribute records to reducers
      unless options[:partition_fields].blank?
        jobconf_options += [
-          '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
-          jobconf(:output_field_separator),
          jobconf(:partition_fields),
+          jobconf(:output_field_separator),
        ]
      end
-      # Setting the number of mappers and reducers.
      jobconf_options += [
-        jobconf(:max_node_map_tasks),
-        jobconf(:max_node_reduce_tasks),
-        jobconf(:max_reduces_per_node),
-        jobconf(:max_reduces_per_cluster),
-        jobconf(:max_maps_per_node),
-        jobconf(:max_maps_per_cluster),
-        jobconf(:map_tasks),
-        jobconf(:reduce_tasks)
-      ]
+        :key_field_separator, :sort_fields,
+        :map_tasks, :reduce_tasks,
+        :max_node_map_tasks, :max_node_reduce_tasks,
+        :max_reduces_per_node, :max_reduces_per_cluster,
+        :max_maps_per_node, :max_maps_per_cluster,
+        :min_split_size,
+        :map_speculative,
+        :timeout,
+        :reuse_jvms, :respect_exit_status
+      ].map{|opt| jobconf(opt)}
      jobconf_options.flatten.compact
    end
 
-    # emit a -jobconf hadoop option if the simplified command line arg is present
-    # if not, the resulting nil will be elided later
-    def jobconf option
-      if options[option]
-        "-jobconf %s=%s" % [options.description_for(option), options[option]]
-      end
-    end
-
    def hadoop_other_args
      extra_str_args = [ options[:extra_args] ]
-      extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
-      options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
-      options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-      extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
-      extra_str_args + extra_hsh_args
+      if Settings.split_on_xml_tag
+        extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
+      end
+      extra_str_args << ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
+      extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
+      extra_str_args
    end
 
    def hadoop_recycle_env
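
With the new --split_on_xml_tag flag, hadoop_other_args emits a StreamXmlRecordReader stanza; for instance (tag name invented):

    # --split_on_xml_tag=page adds:
    #   -inputreader 'StreamXmlRecordReader,begin=<page>,end=</page>'
    # so everything between <page> and </page> is handed to the mapper as one record
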
@@ -135,42 +139,6 @@ module Wukong
    # Thanks to Todd Lipcon for directing me to that hack.
    #
 
-    # "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
-    # "HADOOP_IDENT_STRING" =>"hadoop",
-    # "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
-    # "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
-    # "HOME" =>"/var/run/hadoop-0.20",
-    # "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
-    # "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
-    # "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
-    # "USER" =>"hadoop",
-    #
-    # "dfs_block_size" =>"134217728",
-    # "map_input_start" =>"0",
-    # "map_input_length" =>"125726898",
-    # "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
-    # "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
-    # "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
-    # "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
-    # "mapred_output_compression_type" =>"BLOCK",
-    # "mapred_task_partition" =>"0",
-    # "mapred_tasktracker_map_tasks_maximum" =>"4",
-    # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
-    # "mapred_tip_id" =>"task_200910221152_0023_m_000000",
-    # "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
-    # "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
-    #
-    # "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
-    # "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
-    # "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
-    # "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
-    # "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
-    # "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
-    # "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
-    # "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
-    # "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
-    # "user_name" =>"flip",
-
    # HDFS pathname to the input file currently being processed.
    def input_file
      ENV['map_input_file']
@@ -211,3 +179,52 @@ module Wukong
    end
  end
 end
+
+# -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
+# -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
+# -D mapred.text.key.comparator.options=-k2,2nr\
+# -D mapred.text.key.partitioner.options=-k1,2\
+# -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+# -D stream.num.map.output.key.fields=\"$sortfields\"
+#
+# -D stream.map.output.field.separator=\"'/t'\"
+# -D map.output.key.field.separator=. \
+# -D mapred.data.field.separator=. \
+# -D map.output.key.value.fields.spec=6,5,1-3:0- \
+# -D reduce.output.key.value.fields.spec=0-2:5- \
+
+# "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
+# "HADOOP_IDENT_STRING" =>"hadoop",
+# "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
+# "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
+# "HOME" =>"/var/run/hadoop-0.20",
+# "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
+# "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
+# "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
+# "USER" =>"hadoop",
+#
+# "dfs_block_size" =>"134217728",
+# "map_input_start" =>"0",
+# "map_input_length" =>"125726898",
+# "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
+# "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
+# "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
+# "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
+# "mapred_output_compression_type" =>"BLOCK",
+# "mapred_task_partition" =>"0",
+# "mapred_tasktracker_map_tasks_maximum" =>"4",
+# "mapred_tasktracker_reduce_tasks_maximum" =>"2",
+# "mapred_tip_id" =>"task_200910221152_0023_m_000000",
+# "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
+# "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
+#
+# "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
+# "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
+# "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
+# "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
+# "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
+# "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
+# "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
+# "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
+# "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
+# "user_name" =>"flip",