wukong 1.5.3 → 1.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.textile +4 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-ls +2 -1
- data/docpages/avro/performance.textile +36 -0
- data/examples/cassandra_streaming/avromapper.rb +85 -0
- data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
- data/examples/cassandra_streaming/cassandra.avpr +468 -0
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
- data/examples/cassandra_streaming/catter.sh +45 -0
- data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
- data/examples/cassandra_streaming/client_schema.avpr +211 -0
- data/examples/cassandra_streaming/client_schema.textile +318 -0
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +1 -0
- data/examples/cassandra_streaming/pyreduce.sh +1 -0
- data/examples/cassandra_streaming/smutation.avpr +188 -0
- data/examples/cassandra_streaming/streamer.sh +51 -0
- data/examples/cassandra_streaming/struct_loader.rb +24 -0
- data/examples/cassandra_streaming/tuning.textile +73 -0
- data/examples/emr/README-elastic_map_reduce.textile +26 -0
- data/examples/emr/dot_wukong_dir/credentials.json +7 -0
- data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
- data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
- data/examples/emr/elastic_mapreduce_example.rb +1 -0
- data/lib/wukong/encoding/asciize.rb +108 -0
- data/lib/wukong/extensions/date_time.rb +33 -7
- data/lib/wukong/extensions/emittable.rb +12 -25
- data/lib/wukong/extensions/hash_like.rb +13 -6
- data/lib/wukong/filename_pattern.rb +8 -7
- data/lib/wukong/schema.rb +47 -0
- data/lib/wukong/script.rb +7 -0
- data/lib/wukong/script/cassandra_loader_script.rb +40 -0
- data/lib/wukong/script/emr_command.rb +74 -43
- data/lib/wukong/script/hadoop_command.rb +89 -72
- data/lib/wukong/store.rb +2 -7
- data/lib/wukong/store/cassandra.rb +10 -0
- data/lib/wukong/store/cassandra/streaming.rb +75 -0
- data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
- data/lib/wukong/store/cassandra_model.rb +90 -0
- data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
- data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
- data/wukong.gemspec +32 -4
- metadata +33 -14
| @@ -1,4 +1,3 @@ | |
| 1 | 
            -
             | 
| 2 1 | 
             
            Object.class_eval do
         | 
| 3 2 | 
             
              def to_flat() [to_s] end
         | 
| 4 3 | 
             
            end
         | 
| @@ -54,29 +53,17 @@ Hash.class_eval do | |
| 54 53 | 
             
              end
         | 
| 55 54 | 
             
            end
         | 
| 56 55 |  | 
| 57 | 
            -
            class  | 
| 58 | 
            -
              # | 
| 59 | 
            -
               | 
| 60 | 
            -
              #  | 
| 61 | 
            -
               | 
| 62 | 
            -
             | 
| 63 | 
            -
               | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
              # Flatten
         | 
| 70 | 
            -
              def to_flat
         | 
| 71 | 
            -
                strftime(FLAT_FORMAT)
         | 
| 72 | 
            -
              end
         | 
| 73 | 
            -
            end
         | 
| 74 | 
            -
             | 
| 75 | 
            -
            class DateTime < Date
         | 
| 76 | 
            -
              # strftime() format to flatten a date
         | 
| 77 | 
            -
              FLAT_FORMAT = "%Y%m%d%H%M%S"
         | 
| 78 | 
            -
              # Flatten
         | 
| 79 | 
            -
              def to_flat
         | 
| 80 | 
            -
                strftime(FLAT_FORMAT)
         | 
| 56 | 
            +
            class Integer
         | 
| 57 | 
            +
              #
         | 
| 58 | 
            +
              # Express boolean as 1 (true) or 0 (false).  In contravention of typical ruby
         | 
| 59 | 
            +
              # semantics (but in a way that is more robust for wukong-like batch
         | 
| 60 | 
            +
              # processing), the number 0, the string '0', nil and false are all considered
         | 
| 61 | 
            +
              # false. (This also makes the method idempotent: repeated calls give same result.)
         | 
| 62 | 
            +
              #
         | 
| 63 | 
            +
              def self.unbooleanize bool
         | 
| 64 | 
            +
                case bool
         | 
| 65 | 
            +
                when 0, '0', false, nil then 0
         | 
| 66 | 
            +
                else                         1
         | 
| 67 | 
            +
                end
         | 
| 81 68 | 
             
              end
         | 
| 82 69 | 
             
            end
         | 
| @@ -103,16 +103,23 @@ module Wukong | |
| 103 103 | 
             
                  # otherwise they must be uniformly strings
         | 
| 104 104 | 
             
                  #
         | 
| 105 105 | 
             
                  def from_hash(hsh, has_symbol_keys=false)
         | 
| 106 | 
            -
                     | 
| 107 | 
            -
                     | 
| 108 | 
            -
                    self.new *hsh.values_of(*keys)
         | 
| 106 | 
            +
                    extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
         | 
| 107 | 
            +
                    self.new *hsh.values_of(*extract_keys)
         | 
| 109 108 | 
             
                  end
         | 
| 110 109 | 
             
                  #
         | 
| 111 110 | 
             
                  # The last portion of the class in underscored form
         | 
| 112 | 
            -
                  #  | 
| 111 | 
            +
                  # memoized
         | 
| 113 112 | 
             
                  #
         | 
| 114 | 
            -
                  def  | 
| 115 | 
            -
                    @resource_name ||= self. | 
| 113 | 
            +
                  def resource_name
         | 
| 114 | 
            +
                    @resource_name ||= self.class_basename.underscore.to_sym
         | 
| 115 | 
            +
                  end
         | 
| 116 | 
            +
                  # The last portion of the class name
         | 
| 117 | 
            +
                  # memoized
         | 
| 118 | 
            +
                  #
         | 
| 119 | 
            +
                  # @example
         | 
| 120 | 
            +
                  #   This::That::TheOther.class_basename   # => "TheOther"
         | 
| 121 | 
            +
                  def class_basename
         | 
| 122 | 
            +
                    @class_basename ||= self.to_s.gsub(%r{.*::}, '')
         | 
| 116 123 | 
             
                  end
         | 
| 117 124 | 
             
                end
         | 
| 118 125 |  | 
| @@ -16,12 +16,12 @@ module Wukong | |
| 16 16 | 
             
                  # walk through pattern, replacing tokens (eg :time or :pid) with the
         | 
| 17 17 | 
             
                  # corresponding value.
         | 
| 18 18 | 
             
                  #
         | 
| 19 | 
            +
                  # Don't use ':' in a pattern except to introduce a token
         | 
| 20 | 
            +
                  # and separate tokens with '-', '+', '/' or '.'
         | 
| 21 | 
            +
                  #
         | 
| 19 22 | 
             
                  def make token_vals={}
         | 
| 20 23 | 
             
                    token_vals = token_val_defaults.merge token_vals
         | 
| 21 24 | 
             
                    token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
         | 
| 22 | 
            -
                    # CHH_NOTE: The following is broken for patterns that need a ":" or 
         | 
| 23 | 
            -
                    # patterns that need text following a token with no special chars in 
         | 
| 24 | 
            -
                    # between.
         | 
| 25 25 | 
             
                    val = pattern.gsub(/:(\w+)/){ replace($1, token_vals)  }
         | 
| 26 26 | 
             
                    val
         | 
| 27 27 | 
             
                  end
         | 
| @@ -39,7 +39,7 @@ module Wukong | |
| 39 39 | 
             
                    case token
         | 
| 40 40 | 
             
                    when :pid           then pid
         | 
| 41 41 | 
             
                    when :hostname      then hostname
         | 
| 42 | 
            -
                    when :handle        then token_vals[:handle] | 
| 42 | 
            +
                    when :handle        then token_vals[:handle]
         | 
| 43 43 | 
             
                    when :handle_prefix then token_vals[:handle].to_s[0..5]
         | 
| 44 44 | 
             
                    when :timestamp     then token_vals[:timestamp]
         | 
| 45 45 | 
             
                    when :date          then token_vals[:timestamp][ 0..7]
         | 
| @@ -56,7 +56,7 @@ module Wukong | |
| 56 56 |  | 
| 57 57 | 
             
                  # Memoized: the hostname for the machine running this script.
         | 
| 58 58 | 
             
                  def hostname
         | 
| 59 | 
            -
                    @hostname ||= ENV['HOSTNAME'] || `hostname`. | 
| 59 | 
            +
                    @hostname ||= ENV['HOSTNAME'] || `hostname`.chomp
         | 
| 60 60 | 
             
                  end
         | 
| 61 61 | 
             
                  # Memoized: the Process ID for this invocation.
         | 
| 62 62 | 
             
                  def pid
         | 
| @@ -64,9 +64,10 @@ module Wukong | |
| 64 64 | 
             
                  end
         | 
| 65 65 |  | 
| 66 66 | 
             
                  # Characters deemed safe in a filename;
         | 
| 67 | 
            -
                  SAFE_CHARS = 'a-zA-Z0-9_ | 
| 67 | 
            +
                  SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/'
         | 
| 68 | 
            +
                  RE_SAFE_FILENAME = %r{[^#{SAFE_CHARS}]+}moxi
         | 
| 68 69 | 
             
                  def self.sanitize str
         | 
| 69 | 
            -
                    str.gsub( | 
| 70 | 
            +
                    str.gsub(RE_SAFE_FILENAME, '-')
         | 
| 70 71 | 
             
                  end
         | 
| 71 72 |  | 
| 72 73 | 
             
                end
         | 
    
        data/lib/wukong/schema.rb
    CHANGED
    
    | @@ -50,6 +50,35 @@ class << Yaml       ; def to_pig() 'chararray'     end ; end if defined?(Yaml) | |
| 50 50 | 
             
            class << Json       ; def to_pig() 'chararray'     end ; end if defined?(Json)
         | 
| 51 51 | 
             
            class << Regex      ; def to_pig() 'chararray'     end ; end if defined?(Regex)
         | 
| 52 52 |  | 
| 53 | 
            +
             | 
| 54 | 
            +
            #
         | 
| 55 | 
            +
            # Basic types: Avro conversion
         | 
| 56 | 
            +
            #
         | 
| 57 | 
            +
            class << Integer    ; def to_avro() 'int'           end ; end
         | 
| 58 | 
            +
            class << Bignum     ; def to_avro() 'long'          end ; end
         | 
| 59 | 
            +
            class << Float      ; def to_avro() 'float'         end ; end
         | 
| 60 | 
            +
            class << Symbol     ; def to_avro() 'string'        end ; end
         | 
| 61 | 
            +
            class << Date       ; def to_avro() 'long'          end ; end
         | 
| 62 | 
            +
            class << Time       ; def to_avro() 'long'          end ; end
         | 
| 63 | 
            +
            class << DateTime   ; def to_avro() 'long'          end ; end
         | 
| 64 | 
            +
            class << String     ; def to_avro() 'string'        end ; end
         | 
| 65 | 
            +
            class << Text       ; def to_avro() 'string'        end ; end if defined?(Text)
         | 
| 66 | 
            +
            class << Blob       ; def to_avro() 'bytearray'     end ; end if defined?(Blob)
         | 
| 67 | 
            +
            class << Boolean    ; def to_avro() 'bytearray'     end ; end if defined?(Boolean)
         | 
| 68 | 
            +
            class String        ; def to_avro() self.to_s ;     end ; end
         | 
| 69 | 
            +
            class Symbol        ; def to_avro() self.to_s ;     end ; end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            class << BigDecimal ; def to_avro() 'long'          end ; end if defined?(BigDecimal)
         | 
| 72 | 
            +
            class << EpochTime  ; def to_avro() 'integer'       end ; end if defined?(EpochTime)
         | 
| 73 | 
            +
            class << FilePath   ; def to_avro() 'string'        end ; end if defined?(FilePath)
         | 
| 74 | 
            +
            class << Flag       ; def to_avro() 'string'        end ; end if defined?(Flag)
         | 
| 75 | 
            +
            class << IPAddress  ; def to_avro() 'string'        end ; end if defined?(IPAddress)
         | 
| 76 | 
            +
            class << URI        ; def to_avro() 'string'        end ; end if defined?(URI)
         | 
| 77 | 
            +
            class << Csv        ; def to_avro() 'string'        end ; end if defined?(Csv)
         | 
| 78 | 
            +
            class << Yaml       ; def to_avro() 'string'        end ; end if defined?(Yaml)
         | 
| 79 | 
            +
            class << Json       ; def to_avro() 'string'        end ; end if defined?(Json)
         | 
| 80 | 
            +
            class << Regex      ; def to_avro() 'string'        end ; end if defined?(Regex)
         | 
| 81 | 
            +
             | 
| 53 82 | 
             
            module Wukong
         | 
| 54 83 | 
             
              #
         | 
| 55 84 | 
             
              # Export model's structure for loading and manipulating in other frameworks,
         | 
| @@ -208,6 +237,24 @@ module Wukong | |
| 208 237 | 
             
                    str.join("\n")
         | 
| 209 238 | 
             
                  end
         | 
| 210 239 |  | 
| 240 | 
            +
             | 
| 241 | 
            +
             | 
| 242 | 
            +
             | 
| 243 | 
            +
                  #
         | 
| 244 | 
            +
                  # Avro
         | 
| 245 | 
            +
                  #
         | 
| 246 | 
            +
                  def to_avro
         | 
| 247 | 
            +
                    require 'json' # yikes
         | 
| 248 | 
            +
                    h = {}
         | 
| 249 | 
            +
                    h[:name]   = self.name
         | 
| 250 | 
            +
                    h[:type]   = "record"
         | 
| 251 | 
            +
                    h[:fields] =  []
         | 
| 252 | 
            +
                    members.zip(mtypes).each do |member, type|
         | 
| 253 | 
            +
                      h[:fields] << {:name => member.to_s, :type => type.to_avro}
         | 
| 254 | 
            +
                    end
         | 
| 255 | 
            +
                    h.to_json
         | 
| 256 | 
            +
                  end
         | 
| 257 | 
            +
                  
         | 
| 211 258 | 
             
                end
         | 
| 212 259 | 
             
                # standard stanza for making methods appear on the class itself on include
         | 
| 213 260 | 
             
                def self.included base
         | 
    
        data/lib/wukong/script.rb
    CHANGED
    
    | @@ -145,6 +145,7 @@ module Wukong | |
| 145 145 | 
             
                  when 'map'              then mapper_klass.new(self.options).stream
         | 
| 146 146 | 
             
                  when 'reduce'           then reducer_klass.new(self.options).stream
         | 
| 147 147 | 
             
                  when 'local'            then execute_local_workflow
         | 
| 148 | 
            +
                  when 'cassandra'        then execute_hadoop_workflow
         | 
| 148 149 | 
             
                  when 'hadoop', 'mapred' then execute_hadoop_workflow
         | 
| 149 150 | 
             
                  when 'emr'
         | 
| 150 151 | 
             
                    require 'wukong/script/emr_command'
         | 
| @@ -196,6 +197,12 @@ module Wukong | |
| 196 197 | 
             
                    "#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
         | 
| 197 198 | 
             
                end
         | 
| 198 199 |  | 
| 200 | 
            +
                # Wrapper for dangerous operations to catch errors
         | 
| 201 | 
            +
                def safely action, &block
         | 
| 202 | 
            +
                  begin
         | 
| 203 | 
            +
                    block.call
         | 
| 204 | 
            +
                  rescue StandardError => e ; handle_error(action, e); end
         | 
| 205 | 
            +
                end
         | 
| 199 206 |  | 
| 200 207 | 
             
              protected
         | 
| 201 208 |  | 
| @@ -0,0 +1,40 @@ | |
| 1 | 
            +
            Settings.define :cassandra_keyspace,   :required => true, :description => "The keyspace to bulk load"
         | 
| 2 | 
            +
            Settings.define :cassandra_col_family, :required => true, :description => "The column family to bulk load"
         | 
| 3 | 
            +
            Settings.define :cassandra_home,  :env_var => 'CASSANDRA_HOME', :default => '/usr/local/share/cassandra'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Wukong
         | 
| 6 | 
            +
              class CassandraScript < Wukong::Script
         | 
| 7 | 
            +
                def hadoop_other_args *args
         | 
| 8 | 
            +
                  opts = super(*args)
         | 
| 9 | 
            +
                  opts << "-D stream.map.output=\'cassandra_avro_output\'"
         | 
| 10 | 
            +
                  opts << "-D stream.io.identifier.resolver.class=\'org.apache.cassandra.hadoop.streaming.AvroResolver\'"
         | 
| 11 | 
            +
                  opts << "-D cassandra.output.keyspace=\'#{Settings.cassandra_keyspace}\'"
         | 
| 12 | 
            +
                  opts << "-D cassandra.output.columnfamily=\'#{Settings.cassandra_col_family}\'"
         | 
| 13 | 
            +
                  opts << "-D cassandra.partitioner.class=\'org.apache.cassandra.dht.RandomPartitioner\'"
         | 
| 14 | 
            +
                  opts << "-D cassandra.thrift.address=\'#{[Settings.cassandra_hosts].flatten.map{|s| s.gsub(/:.*/, '')}.join(",")}\'"
         | 
| 15 | 
            +
                  opts << "-D cassandra.thrift.port=\'9160\'"
         | 
| 16 | 
            +
                  # opts << "-D mapreduce.output.columnfamilyoutputformat.batch.threshold=\'1024\'"
         | 
| 17 | 
            +
                  # ORDER MATTERS
         | 
| 18 | 
            +
                  opts << "-libjars \'#{cassandra_jars}\'"
         | 
| 19 | 
            +
                  opts << "-file    \'#{avro_schema}\'"
         | 
| 20 | 
            +
                  opts << "-outputformat \'org.apache.cassandra.hadoop.ColumnFamilyOutputFormat\'"
         | 
| 21 | 
            +
                  opts
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                #
         | 
| 25 | 
            +
                # Return paths to cassandra jars as a string
         | 
| 26 | 
            +
                #
         | 
| 27 | 
            +
                def cassandra_jars
         | 
| 28 | 
            +
                  jars = []
         | 
| 29 | 
            +
                  Dir["#{Settings.cassandra_home}/build/apache-cassandra*.jar", "#{Settings.cassandra_home}/build/lib/jars/*.jar", "#{Settings.cassandra_home}/lib/*.jar"].each do |jar|
         | 
| 30 | 
            +
                    jars << jar
         | 
| 31 | 
            +
                  end
         | 
| 32 | 
            +
                  jars.join(',')
         | 
| 33 | 
            +
                end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                def avro_schema
         | 
| 36 | 
            +
                  File.join(Settings.cassandra_home, "interface/avro/cassandra.avpr")
         | 
| 37 | 
            +
                end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
              end
         | 
| 40 | 
            +
            end
         | 
| @@ -1,16 +1,26 @@ | |
| 1 1 | 
             
            require 'right_aws'
         | 
| 2 2 | 
             
            require 'configliere/config_block'
         | 
| 3 | 
            -
             | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            EMR_CONFIG_DIR = '~/.wukong' unless defined?(EMR_CONFIG_DIR)
         | 
| 5 | 
            +
            #
         | 
| 4 6 | 
             
            Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
         | 
| 5 7 | 
             
            Settings.define :access_key,           :description => 'AWS Access key',        :env_var => 'AWS_ACCESS_KEY_ID'
         | 
| 6 8 | 
             
            Settings.define :secret_access_key,    :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
         | 
| 7 9 | 
             
            Settings.define :emr_runner,           :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
         | 
| 8 | 
            -
            Settings.define :emr_root,             :description => 'S3  | 
| 9 | 
            -
            Settings.define : | 
| 10 | 
            -
            Settings.define : | 
| 11 | 
            -
            Settings.define : | 
| 10 | 
            +
            Settings.define :emr_root,             :description => 'S3 bucket and path to use as the base for Elastic MapReduce storage, organized by job name'
         | 
| 11 | 
            +
            Settings.define :emr_data_root,        :description => 'Optional '
         | 
| 12 | 
            +
            Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for Elastic Map Reduce machine provisioning', :default => EMR_CONFIG_DIR+'/emr_bootstrap.sh', :type => :filename, :finally => lambda{ Settings.emr_bootstrap_script = File.expand_path(Settings.emr_bootstrap_script) }
         | 
| 13 | 
            +
            Settings.define :emr_extra_args,       :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
         | 
| 14 | 
            +
            Settings.define :alive,                :description => 'Whether to keep machine running after job invocation', :type => :boolean
         | 
| 15 | 
            +
            #
         | 
| 16 | 
            +
            Settings.define :keypair_file,        :description => 'AWS Key pair file',                               :type => :filename
         | 
| 17 | 
            +
            Settings.define :keypair,             :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
         | 
| 18 | 
            +
            Settings.define :instance_type,        :description => 'AWS instance type to use',                        :default => 'm1.small'
         | 
| 12 19 | 
             
            Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
         | 
| 13 | 
            -
            Settings.define :jobflow
         | 
| 20 | 
            +
            Settings.define :jobflow,              :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
         | 
| 21 | 
            +
            #
         | 
| 22 | 
            +
            Settings.read(File.expand_path(EMR_CONFIG_DIR+'/emr.yaml'))
         | 
| 23 | 
            +
             | 
| 14 24 | 
             
            module Wukong
         | 
| 15 25 | 
             
              #
         | 
| 16 26 | 
             
              # EMR Options
         | 
| @@ -26,39 +36,46 @@ module Wukong | |
| 26 36 | 
             
                  Log.info "  Copying this script to the cloud."
         | 
| 27 37 | 
             
                  S3Util.store(this_script_filename, mapper_s3_uri)
         | 
| 28 38 | 
             
                  S3Util.store(this_script_filename, reducer_s3_uri)
         | 
| 29 | 
            -
                  S3Util.store(File.expand_path( | 
| 39 | 
            +
                  S3Util.store(File.expand_path(Settings.emr_bootstrap_script), bootstrap_s3_uri)
         | 
| 40 | 
            +
                end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                def copy_jars_to_cloud
         | 
| 43 | 
            +
                  S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
         | 
| 44 | 
            +
                  # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
         | 
| 45 | 
            +
                end
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                def hadoop_options_for_emr_runner
         | 
| 48 | 
            +
                  [hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
         | 
| 30 49 | 
             
                end
         | 
| 31 50 |  | 
| 32 51 | 
             
                def execute_emr_runner
         | 
| 33 52 | 
             
                  command_args = []
         | 
| 34 | 
            -
                  command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
         | 
| 35 | 
            -
                  command_args += emr_credentials
         | 
| 36 53 | 
             
                  if Settings.jobflow
         | 
| 37 54 | 
             
                    command_args << Settings.dashed_flag_for(:jobflow)
         | 
| 38 55 | 
             
                  else
         | 
| 39 | 
            -
                    command_args << Settings.dashed_flag_for(:alive)
         | 
| 40 56 | 
             
                    command_args << "--create --name=#{job_name}"
         | 
| 41 | 
            -
                    command_args << Settings. | 
| 57 | 
            +
                    command_args << Settings.dashed_flag_for(:alive)
         | 
| 58 | 
            +
                    command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
         | 
| 59 | 
            +
                    command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
         | 
| 60 | 
            +
                    command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
         | 
| 42 61 | 
             
                  end
         | 
| 62 | 
            +
                  command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
         | 
| 63 | 
            +
                  command_args += emr_credentials
         | 
| 43 64 | 
             
                  command_args += [
         | 
| 44 | 
            -
                    "--bootstrap-action=#{bootstrap_s3_uri}",
         | 
| 45 65 | 
             
                    "--log-uri=#{log_s3_uri}",
         | 
| 46 66 | 
             
                    "--stream",
         | 
| 47 67 | 
             
                    "--mapper=#{mapper_s3_uri} ",
         | 
| 48 68 | 
             
                    "--reducer=#{reducer_s3_uri} ",
         | 
| 49 | 
            -
                    "--input=#{input_paths} --output=#{output_path}",
         | 
| 50 | 
            -
                    # to specify zero reducers:
         | 
| 51 | 
            -
                    # "--arg '-D mapred.reduce.tasks=0'"
         | 
| 69 | 
            +
                    "--input=#{input_paths.join(",")} --output=#{output_path}",
         | 
| 52 70 | 
             
                  ]
         | 
| 71 | 
            +
                  # eg to specify zero reducers:
         | 
| 72 | 
            +
                  # Settings[:emr_extra_args] = ["--arg '-D mapred.reduce.tasks=0'"]
         | 
| 73 | 
            +
                  command_args += Settings[:emr_extra_args] unless Settings[:emr_extra_args].blank?
         | 
| 74 | 
            +
                  command_args += hadoop_options_for_emr_runner
         | 
| 53 75 | 
             
                  Log.info 'Follow along at http://localhost:9000/job'
         | 
| 54 76 | 
             
                  execute_command!( File.expand_path(Settings.emr_runner), *command_args )
         | 
| 55 77 | 
             
                end
         | 
| 56 78 |  | 
| 57 | 
            -
                def emr_ship_jars
         | 
| 58 | 
            -
                  S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
         | 
| 59 | 
            -
                  # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
         | 
| 60 | 
            -
                end
         | 
| 61 | 
            -
             | 
| 62 79 | 
             
                def emr_credentials
         | 
| 63 80 | 
             
                  command_args = []
         | 
| 64 81 | 
             
                  if Settings.emr_credentials_file
         | 
| @@ -66,7 +83,6 @@ module Wukong | |
| 66 83 | 
             
                  else
         | 
| 67 84 | 
             
                    command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
         | 
| 68 85 | 
             
                  end
         | 
| 69 | 
            -
                  command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
         | 
| 70 86 | 
             
                  command_args
         | 
| 71 87 | 
             
                end
         | 
| 72 88 |  | 
| @@ -75,58 +91,73 @@ module Wukong | |
| 75 91 | 
             
                  File.basename($0,'.rb')
         | 
| 76 92 | 
             
                end
         | 
| 77 93 |  | 
| 94 | 
            +
                # Produces an s3 URI within the Wukong emr sandbox from a set of path
         | 
| 95 | 
            +
                # segments
         | 
| 96 | 
            +
                #
         | 
| 97 | 
            +
                # @example
         | 
| 98 | 
            +
                #   Settings.emr_root = 's3://emr.yourmom.com/wukong'
         | 
| 99 | 
            +
                #   emr_s3_path('log', 'my_happy_job', 'run-97.log')
         | 
| 100 | 
            +
                #   # => "s3://emr.yourmom.com/wukong/log/my_happy_job/run-97.log"
         | 
| 101 | 
            +
                #
         | 
| 102 | 
            +
                def emr_s3_path *path_segs
         | 
| 103 | 
            +
                  File.join(Settings.emr_root, path_segs.flatten.compact)
         | 
| 104 | 
            +
                end
         | 
| 105 | 
            +
             | 
| 78 106 | 
             
                def mapper_s3_uri
         | 
| 79 | 
            -
                  emr_s3_path(job_handle+'-mapper.rb')
         | 
| 107 | 
            +
                  emr_s3_path(job_handle, 'code', job_handle+'-mapper.rb')
         | 
| 80 108 | 
             
                end
         | 
| 81 109 | 
             
                def reducer_s3_uri
         | 
| 82 | 
            -
                  emr_s3_path(job_handle+'-reducer.rb')
         | 
| 110 | 
            +
                  emr_s3_path(job_handle, 'code', job_handle+'-reducer.rb')
         | 
| 83 111 | 
             
                end
         | 
| 84 112 | 
             
                def log_s3_uri
         | 
| 85 | 
            -
                  emr_s3_path('log',  | 
| 113 | 
            +
                  emr_s3_path(job_handle, 'log', 'emr_jobs')
         | 
| 86 114 | 
             
                end
         | 
| 87 115 | 
             
                def bootstrap_s3_uri
         | 
| 88 | 
            -
                  emr_s3_path('bin', " | 
| 116 | 
            +
                  emr_s3_path(job_handle, 'bin', "emr_bootstrap.sh")
         | 
| 89 117 | 
             
                end
         | 
| 90 118 | 
             
                def wukong_libs_s3_uri
         | 
| 91 | 
            -
                  emr_s3_path(' | 
| 92 | 
            -
                end
         | 
| 93 | 
            -
             | 
| 94 | 
            -
                def emr_s3_path *path_segs
         | 
| 95 | 
            -
                  File.join(Settings.emr_root, path_segs.flatten.compact)
         | 
| 119 | 
            +
                  emr_s3_path(job_handle, 'code', "wukong-libs.jar")
         | 
| 96 120 | 
             
                end
         | 
| 97 121 |  | 
| 98 | 
            -
                 | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 122 | 
            +
                ABSOLUTE_URI = %r{^/|^\w+://}
         | 
| 123 | 
            +
                #
         | 
| 124 | 
            +
                # Walk through the input paths and the output path. Prepends
         | 
| 125 | 
            +
                # Settings.emr_data_root to any that does NOT look like
         | 
| 126 | 
            +
                # an absolute path ("/foo") or a URI ("s3://yourmom/data")
         | 
| 127 | 
            +
                #
         | 
| 128 | 
            +
                def fix_paths!
         | 
| 129 | 
            +
                  return if Settings.emr_data_root.blank?
         | 
| 130 | 
            +
                  unless input_paths.blank?
         | 
| 131 | 
            +
                    @input_paths = input_paths.map{|path|   (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
         | 
| 132 | 
            +
                  end
         | 
| 133 | 
            +
                  unless output_path.blank?
         | 
| 134 | 
            +
                    @output_path = [output_path].map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
         | 
| 105 135 | 
             
                  end
         | 
| 106 136 | 
             
                end
         | 
| 107 137 |  | 
| 138 | 
            +
                #
         | 
| 139 | 
            +
                # Simple class to coordinate s3 operations
         | 
| 140 | 
            +
                #
         | 
| 108 141 | 
             
                class S3Util
         | 
| 109 142 | 
             
                  # class methods
         | 
| 110 143 | 
             
                  class << self
         | 
| 111 144 | 
             
                    def s3
         | 
| 112 145 | 
             
                      @s3 ||= RightAws::S3Interface.new(
         | 
| 113 146 | 
             
                        Settings.access_key, Settings.secret_access_key,
         | 
| 114 | 
            -
                        {:multi_thread => true, :logger => Log})
         | 
| 147 | 
            +
                        {:multi_thread => true, :logger => Log, :port => 80, :protocol => 'http' })
         | 
| 115 148 | 
             
                    end
         | 
| 116 | 
            -
             | 
| 117 149 | 
             
                    def bucket_and_path_from_uri uri
         | 
| 118 150 | 
             
                      uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
         | 
| 119 151 | 
             
                    end
         | 
| 120 | 
            -
             | 
| 121 152 | 
             
                    def store filename, uri
         | 
| 122 | 
            -
                      Log.debug "    #{filename} => #{uri}"
         | 
| 123 153 | 
             
                      dest_bucket, dest_key = bucket_and_path_from_uri(uri)
         | 
| 124 | 
            -
                       | 
| 154 | 
            +
                      Log.debug "    #{filename} => #{dest_bucket} / #{dest_key}"
         | 
| 155 | 
            +
                      contents = File.read(filename)
         | 
| 125 156 | 
             
                      s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
         | 
| 126 157 | 
             
                    end
         | 
| 127 | 
            -
             | 
| 128 158 | 
             
                  end
         | 
| 129 159 | 
             
                end
         | 
| 160 | 
            +
             | 
| 130 161 | 
             
              end
         | 
| 131 162 | 
             
              Script.class_eval do
         | 
| 132 163 | 
             
                include EmrCommand
         | 
| @@ -32,16 +32,28 @@ module Wukong | |
| 32 32 | 
             
                Settings.define :max_maps_per_node,      :jobconf => true, :description => 'mapred.max.maps.per.node',                               :wukong => true
         | 
| 33 33 | 
             
                Settings.define :max_maps_per_cluster,   :jobconf => true, :description => 'mapred.max.maps.per.cluster',                            :wukong => true
         | 
| 34 34 | 
             
                Settings.define :max_record_length,      :jobconf => true, :description => 'mapred.linerecordreader.maxlength',                      :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
         | 
| 35 | 
            -
                Settings.define : | 
| 35 | 
            +
                Settings.define :min_split_size,         :jobconf => true, :description => 'mapred.min.split.size',                                  :wukong => true
         | 
| 36 36 | 
             
                Settings.define :noempty,                                  :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
         | 
| 37 | 
            +
                Settings.define :split_on_xml_tag,                         :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                # emit a "-D name=value" hadoop option (formerly "-jobconf name=value") if the simplified command line arg is present
         | 
| 40 | 
            +
                # if not, the resulting nil will be elided later
         | 
| 41 | 
            +
                def jobconf option
         | 
| 42 | 
            +
                  if options[option]
         | 
| 43 | 
            +
                    # "-jobconf %s=%s" % [options.description_for(option), options[option]]
         | 
| 44 | 
            +
                    "-D %s=%s" % [options.description_for(option), options[option]]
         | 
| 45 | 
            +
                  end
         | 
| 46 | 
            +
                end
         | 
| 37 47 |  | 
| 38 48 | 
             
                #
         | 
| 39 49 | 
             
                # Assemble the hadoop command to execute
         | 
| 40 50 | 
             
                # and launch the hadoop runner to execute the script across all tasktrackers
         | 
| 41 51 | 
             
                #
         | 
| 52 | 
            +
                # FIXME: Should add some simple logic to ensure that commands are in the
         | 
| 53 | 
            +
                # right order or hadoop will complain, i.e. -D options MUST come before
         | 
| 54 | 
            +
                # others
         | 
| 55 | 
            +
                #
         | 
| 42 56 | 
             
                def execute_hadoop_workflow
         | 
| 43 | 
            -
                  # If no reducer_klass and no reduce_command, then skip the reduce phase
         | 
| 44 | 
            -
                  options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
         | 
| 45 57 | 
             
                  # Input paths join by ','
         | 
| 46 58 | 
             
                  input_paths = @input_paths.join(',')
         | 
| 47 59 | 
             
                  #
         | 
| @@ -49,14 +61,14 @@ module Wukong | |
| 49 61 | 
             
                  hadoop_commandline = [
         | 
| 50 62 | 
             
                    hadoop_runner,
         | 
| 51 63 | 
             
                    "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
         | 
| 64 | 
            +
                    hadoop_jobconf_options,
         | 
| 65 | 
            +
                    "-D mapred.job.name='#{job_name}'",
         | 
| 66 | 
            +
                    hadoop_other_args,
         | 
| 52 67 | 
             
                    "-mapper  '#{mapper_commandline}'",
         | 
| 53 68 | 
             
                    "-reducer '#{reducer_commandline}'",
         | 
| 54 69 | 
             
                    "-input   '#{input_paths}'",
         | 
| 55 70 | 
             
                    "-output  '#{output_path}'",
         | 
| 56 | 
            -
                    hadoop_jobconf_options,
         | 
| 57 | 
            -
                    "-jobconf mapred.job.name='#{job_name}'",
         | 
| 58 71 | 
             
                    hadoop_recycle_env,
         | 
| 59 | 
            -
                    hadoop_other_args,
         | 
| 60 72 | 
             
                  ].flatten.compact.join(" \t\\\n  ")
         | 
| 61 73 | 
             
                  Log.info "  Launching hadoop!"
         | 
| 62 74 | 
             
                  execute_command!(hadoop_commandline)
         | 
| @@ -64,48 +76,40 @@ module Wukong | |
| 64 76 |  | 
| 65 77 | 
             
                def hadoop_jobconf_options
         | 
| 66 78 | 
             
                  jobconf_options = []
         | 
| 67 | 
            -
                  #  | 
| 68 | 
            -
                   | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
                  ]
         | 
| 79 | 
            +
                  # Fixup these options
         | 
| 80 | 
            +
                  options[:reuse_jvms] = '-1'             if (options[:reuse_jvms] == true)
         | 
| 81 | 
            +
                  options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
         | 
| 82 | 
            +
                  # If no reducer_klass and no reduce_command, then skip the reduce phase
         | 
| 83 | 
            +
                  options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
         | 
| 72 84 | 
             
                  # Fields hadoop should use to distribute records to reducers
         | 
| 73 85 | 
             
                  unless options[:partition_fields].blank?
         | 
| 74 86 | 
             
                    jobconf_options += [
         | 
| 75 | 
            -
                      '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
         | 
| 76 | 
            -
                      jobconf(:output_field_separator),
         | 
| 77 87 | 
             
                      jobconf(:partition_fields),
         | 
| 88 | 
            +
                      jobconf(:output_field_separator),
         | 
| 78 89 | 
             
                    ]
         | 
| 79 90 | 
             
                  end
         | 
| 80 | 
            -
                  # Setting the number of mappers and reducers.
         | 
| 81 91 | 
             
                  jobconf_options += [
         | 
| 82 | 
            -
                     | 
| 83 | 
            -
                     | 
| 84 | 
            -
                     | 
| 85 | 
            -
                     | 
| 86 | 
            -
                     | 
| 87 | 
            -
                     | 
| 88 | 
            -
                     | 
| 89 | 
            -
                     | 
| 90 | 
            -
             | 
| 92 | 
            +
                    :key_field_separator,  :sort_fields,
         | 
| 93 | 
            +
                    :map_tasks,            :reduce_tasks,
         | 
| 94 | 
            +
                    :max_node_map_tasks,   :max_node_reduce_tasks,
         | 
| 95 | 
            +
                    :max_reduces_per_node, :max_reduces_per_cluster,
         | 
| 96 | 
            +
                    :max_maps_per_node,    :max_maps_per_cluster,
         | 
| 97 | 
            +
                    :min_split_size,
         | 
| 98 | 
            +
                    :map_speculative,
         | 
| 99 | 
            +
                    :timeout,
         | 
| 100 | 
            +
                    :reuse_jvms, :respect_exit_status
         | 
| 101 | 
            +
                  ].map{|opt| jobconf(opt)}
         | 
| 91 102 | 
             
                  jobconf_options.flatten.compact
         | 
| 92 103 | 
             
                end
         | 
| 93 104 |  | 
| 94 | 
            -
                # emit a -jobconf hadoop option if the simplified command line arg is present
         | 
| 95 | 
            -
                # if not, the resulting nil will be elided later
         | 
| 96 | 
            -
                def jobconf option
         | 
| 97 | 
            -
                  if options[option]
         | 
| 98 | 
            -
                    "-jobconf %s=%s" % [options.description_for(option), options[option]]
         | 
| 99 | 
            -
                  end
         | 
| 100 | 
            -
                end
         | 
| 101 | 
            -
             | 
| 102 105 | 
             
                def hadoop_other_args
         | 
| 103 106 | 
             
                  extra_str_args  = [ options[:extra_args] ]
         | 
| 104 | 
            -
                   | 
| 105 | 
            -
             | 
| 106 | 
            -
                   | 
| 107 | 
            -
                   | 
| 108 | 
            -
                  extra_str_args  | 
| 107 | 
            +
                  if Settings.split_on_xml_tag
         | 
| 108 | 
            +
                    extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
         | 
| 109 | 
            +
                  end
         | 
| 110 | 
            +
                  extra_str_args   << ' -lazyOutput' if options[:noempty]  # don't create reduce file if no records
         | 
| 111 | 
            +
                  extra_str_args   << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
         | 
| 112 | 
            +
                  extra_str_args
         | 
| 109 113 | 
             
                end
         | 
| 110 114 |  | 
| 111 115 | 
             
                def hadoop_recycle_env
         | 
| @@ -135,42 +139,6 @@ module Wukong | |
| 135 139 | 
             
                  #   Thanks to Todd Lipcon for directing me to that hack.
         | 
| 136 140 | 
             
                  #
         | 
| 137 141 |  | 
| 138 | 
            -
                  # "HADOOP_HOME"                             =>"/usr/lib/hadoop-0.20/bin/..",
         | 
| 139 | 
            -
                  # "HADOOP_IDENT_STRING"                     =>"hadoop",
         | 
| 140 | 
            -
                  # "HADOOP_LOGFILE"                          =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
         | 
| 141 | 
            -
                  # "HADOOP_LOG_DIR"                          =>"/usr/lib/hadoop-0.20/bin/../logs",
         | 
| 142 | 
            -
                  # "HOME"                                    =>"/var/run/hadoop-0.20",
         | 
| 143 | 
            -
                  # "JAVA_HOME"                               =>"/usr/lib/jvm/java-6-sun",
         | 
| 144 | 
            -
                  # "LD_LIBRARY_PATH"                         =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
         | 
| 145 | 
            -
                  # "PATH"                                    =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
         | 
| 146 | 
            -
                  # "USER"                                    =>"hadoop",
         | 
| 147 | 
            -
                  #
         | 
| 148 | 
            -
                  # "dfs_block_size"                          =>"134217728",
         | 
| 149 | 
            -
                  # "map_input_start"                         =>"0",
         | 
| 150 | 
            -
                  # "map_input_length"                        =>"125726898",
         | 
| 151 | 
            -
                  # "mapred_output_key_class"                 =>"org.apache.hadoop.io.Text",
         | 
| 152 | 
            -
                  # "mapred_output_value_class"               =>"org.apache.hadoop.io.Text",
         | 
| 153 | 
            -
                  # "mapred_output_format_class"              =>"org.apache.hadoop.mapred.TextOutputFormat",
         | 
| 154 | 
            -
                  # "mapred_output_compression_codec"         =>"org.apache.hadoop.io.compress.DefaultCodec",
         | 
| 155 | 
            -
                  # "mapred_output_compression_type"          =>"BLOCK",
         | 
| 156 | 
            -
                  # "mapred_task_partition"                   =>"0",
         | 
| 157 | 
            -
                  # "mapred_tasktracker_map_tasks_maximum"    =>"4",
         | 
| 158 | 
            -
                  # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
         | 
| 159 | 
            -
                  # "mapred_tip_id"                           =>"task_200910221152_0023_m_000000",
         | 
| 160 | 
            -
                  # "mapred_task_id"                          =>"attempt_200910221152_0023_m_000000_0",
         | 
| 161 | 
            -
                  # "mapred_job_tracker"                      =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
         | 
| 162 | 
            -
                  #
         | 
| 163 | 
            -
                  # "mapred_input_dir"                        =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
         | 
| 164 | 
            -
                  # "map_input_file"                          =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
         | 
| 165 | 
            -
                  # "mapred_working_dir"                      =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
         | 
| 166 | 
            -
                  # "mapred_work_output_dir"                  =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
         | 
| 167 | 
            -
                  # "mapred_output_dir"                       =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
         | 
| 168 | 
            -
                  # "mapred_temp_dir"                         =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
         | 
| 169 | 
            -
                  # "PWD"                                     =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
         | 
| 170 | 
            -
                  # "TMPDIR"                                  =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
         | 
| 171 | 
            -
                  # "stream_map_streamprocessor"              =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
         | 
| 172 | 
            -
                  # "user_name"                               =>"flip",
         | 
| 173 | 
            -
             | 
| 174 142 | 
             
                  # HDFS pathname to the input file currently being processed.
         | 
| 175 143 | 
             
                  def input_file
         | 
| 176 144 | 
             
                    ENV['map_input_file']
         | 
| @@ -211,3 +179,52 @@ module Wukong | |
| 211 179 | 
             
                end
         | 
| 212 180 | 
             
              end
         | 
| 213 181 | 
             
            end
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                    # -partitioner                          org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
         | 
| 184 | 
            +
                    # -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
         | 
| 185 | 
            +
                    # -D mapred.text.key.comparator.options=-k2,2nr\
         | 
| 186 | 
            +
                    # -D mapred.text.key.partitioner.options=-k1,2\
         | 
| 187 | 
            +
                    # -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
         | 
| 188 | 
            +
                    # -D stream.num.map.output.key.fields=\"$sortfields\"
         | 
| 189 | 
            +
                    #
         | 
| 190 | 
            +
                    # -D stream.map.output.field.separator=\"'/t'\"
         | 
| 191 | 
            +
                    # -D    map.output.key.field.separator=. \
         | 
| 192 | 
            +
                    # -D       mapred.data.field.separator=. \
         | 
| 193 | 
            +
                    # -D map.output.key.value.fields.spec=6,5,1-3:0- \
         | 
| 194 | 
            +
                    # -D reduce.output.key.value.fields.spec=0-2:5- \
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                  # "HADOOP_HOME"                             =>"/usr/lib/hadoop-0.20/bin/..",
         | 
| 197 | 
            +
                  # "HADOOP_IDENT_STRING"                     =>"hadoop",
         | 
| 198 | 
            +
                  # "HADOOP_LOGFILE"                          =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
         | 
| 199 | 
            +
                  # "HADOOP_LOG_DIR"                          =>"/usr/lib/hadoop-0.20/bin/../logs",
         | 
| 200 | 
            +
                  # "HOME"                                    =>"/var/run/hadoop-0.20",
         | 
| 201 | 
            +
                  # "JAVA_HOME"                               =>"/usr/lib/jvm/java-6-sun",
         | 
| 202 | 
            +
                  # "LD_LIBRARY_PATH"                         =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
         | 
| 203 | 
            +
                  # "PATH"                                    =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
         | 
| 204 | 
            +
                  # "USER"                                    =>"hadoop",
         | 
| 205 | 
            +
                  #
         | 
| 206 | 
            +
                  # "dfs_block_size"                          =>"134217728",
         | 
| 207 | 
            +
                  # "map_input_start"                         =>"0",
         | 
| 208 | 
            +
                  # "map_input_length"                        =>"125726898",
         | 
| 209 | 
            +
                  # "mapred_output_key_class"                 =>"org.apache.hadoop.io.Text",
         | 
| 210 | 
            +
                  # "mapred_output_value_class"               =>"org.apache.hadoop.io.Text",
         | 
| 211 | 
            +
                  # "mapred_output_format_class"              =>"org.apache.hadoop.mapred.TextOutputFormat",
         | 
| 212 | 
            +
                  # "mapred_output_compression_codec"         =>"org.apache.hadoop.io.compress.DefaultCodec",
         | 
| 213 | 
            +
                  # "mapred_output_compression_type"          =>"BLOCK",
         | 
| 214 | 
            +
                  # "mapred_task_partition"                   =>"0",
         | 
| 215 | 
            +
                  # "mapred_tasktracker_map_tasks_maximum"    =>"4",
         | 
| 216 | 
            +
                  # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
         | 
| 217 | 
            +
                  # "mapred_tip_id"                           =>"task_200910221152_0023_m_000000",
         | 
| 218 | 
            +
                  # "mapred_task_id"                          =>"attempt_200910221152_0023_m_000000_0",
         | 
| 219 | 
            +
                  # "mapred_job_tracker"                      =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
         | 
| 220 | 
            +
                  #
         | 
| 221 | 
            +
                  # "mapred_input_dir"                        =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
         | 
| 222 | 
            +
                  # "map_input_file"                          =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
         | 
| 223 | 
            +
                  # "mapred_working_dir"                      =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
         | 
| 224 | 
            +
                  # "mapred_work_output_dir"                  =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
         | 
| 225 | 
            +
                  # "mapred_output_dir"                       =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
         | 
| 226 | 
            +
                  # "mapred_temp_dir"                         =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
         | 
| 227 | 
            +
                  # "PWD"                                     =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
         | 
| 228 | 
            +
                  # "TMPDIR"                                  =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
         | 
| 229 | 
            +
                  # "stream_map_streamprocessor"              =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
         | 
| 230 | 
            +
                  # "user_name"                               =>"flip",
         |