RubyGems - wukong - Versions diffs - 1.4.2 → 1.4.5 - Mend

wukong 1.4.2 → 1.4.5

Files changed (25) hide show

data/CHANGELOG.textile +9 -0
data/bin/hdp-stream2 +2 -2
data/examples/contrib/jeans/README.markdown +165 -0
data/examples/contrib/jeans/data/normalized_sizes +3 -0
data/examples/contrib/jeans/data/orders.tsv +1302 -0
data/examples/contrib/jeans/data/sizes +3 -0
data/examples/contrib/jeans/normalize.rb +20 -0
data/examples/contrib/jeans/sizes.rb +55 -0
data/examples/foo.rb +9 -0
data/examples/word_count.rb +1 -1
data/lib/wukong.rb +1 -1
data/lib/wukong/extensions/hash.rb +52 -18
data/lib/wukong/schema.rb +10 -10
data/lib/wukong/script.rb +77 -97
data/lib/wukong/script/hadoop_command.rb +21 -19
data/lib/wukong/script/local_command.rb +9 -1
data/lib/wukong/streamer/base.rb +1 -1
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +11 -0
data/spec/wukong/encoding_spec.rb +36 -0
data/spec/wukong/script_spec.rb +80 -0
data/wukong.gemspec +23 -24
metadata +18 -5
data/lib/wukong/boot.rb +0 -47
data/spec/bin/hdp-wc_spec.rb +0 -4

data/examples/contrib/jeans/data/sizes ADDED Viewed

@@ -0,0 +1,3 @@
+australia	253	499	671	663	710	687	774	654	627	422	376	132	25
+spain	37	102	257	177	118	90	144	183	210	222	162	93	17
+sweden	32	167	306	334	314	287	330	366	415	343	266	130	51

data/examples/contrib/jeans/normalize.rb ADDED Viewed

@@ -0,0 +1,20 @@
+#!/usr/bin/env ruby
+# run like so:
+# $> ruby normalize.rb --run=local data/sizes data/normalized_sizes
+require 'rubygems'
+require 'wukong'
+require 'active_support/core_ext/enumerable' # for array#sum
+module Normalize
+  class Mapper < Wukong::Streamer::RecordStreamer
+    def process(country, *sizes)
+      sizes.map!(&:to_i)
+      sum = sizes.sum.to_f
+      normalized = sizes.map{|x| 100 * x/sum }
+      s = normalized.join(",")
+      yield [country, s]
+    end
+  end
+end
+Wukong::Script.new(Normalize::Mapper, nil).run

data/examples/contrib/jeans/sizes.rb ADDED Viewed

@@ -0,0 +1,55 @@
+#!/usr/bin/env ruby
+# run like so:
+# $> ruby sizes.rb --run=local data/orders.tsv data/sizes
+require 'rubygems'
+require 'wukong'
+module JeanSizes
+  class Mapper < Wukong::Streamer::RecordStreamer
+    def process(code,model,time,country,reg,col, n1,c1, venue,n3,n4, *sizes)
+      yield [country, *sizes]
+    end
+  end
+  #
+  # This uses a ListReducer. It's nice and simple, but requires first
+  # accumulating each key's records in memory.
+  #
+  class JeansListReducer < Wukong::Streamer::ListReducer
+    def finalize
+      return if values.empty?
+      sums = []; 13.times{ sums << 0 }
+      values.each do |country, *sizes|
+        sizes.map!(&:to_i)
+        sums = sums.zip(sizes).map{|sum, val| sum + val }
+      end
+      yield [key, *sums]
+    end
+  end
+  #
+  # This uses an AccumulatingReducer directly.
+  # It has the advantage of a minimal footprint.
+  #
+  class JeansAccumulatingReducer < Wukong::Streamer::AccumulatingReducer
+    attr_accessor :sums
+    # start the sum with 0 for each size
+    def start! *_
+      self.sums = []; 13.times{ self.sums << 0 }
+    end
+    # accumulate each size count into the running sums
+    def accumulate country, *sizes
+      sizes.map!(&:to_i)
+      self.sums = self.sums.zip(sizes).map{|sum, val| sum + val }
+    end
+    # emit [country, size_0_sum, size_1_sum, ...]
+    def finalize
+      yield [key, sums].flatten
+    end
+  end
+end
+Wukong::Script.new(JeanSizes::Mapper, JeanSizes::JeansListReducer).run

data/examples/foo.rb ADDED Viewed

@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+$: << File.expand_path("~/ics/backend/configliere/lib")
+require "wukong"
+p Wukong::Script.new(nil,nil).options
+p Wukong::Script.new(nil,nil).non_wukong_params
+Wukong::Script.new(nil,nil).run

data/examples/word_count.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../lib'
+require 'rubygems'
 require 'wukong'
 module WordCount

data/lib/wukong.rb CHANGED Viewed

@@ -1,9 +1,9 @@
-require 'wukong/boot'
 require 'wukong/extensions'
 require 'wukong/datatypes'
 require 'wukong/logger'
 require 'wukong/bad_record'
 autoload :TypedStruct, 'wukong/typed_struct'
+require 'configliere'; Configliere.use :define
 module Wukong
   autoload :Dfs,         'wukong/dfs'
   autoload :Script,      'wukong/script'

data/lib/wukong/extensions/hash.rb CHANGED Viewed

@@ -40,9 +40,9 @@ class Hash
   end
   # lambda for recursive merges
-  Hash::DEEP_MERGER = proc do |key,v1,v2|
+  ::Hash::DEEP_MERGER = proc do |key,v1,v2|
     (v1.respond_to?(:merge) && v2.respond_to?(:merge)) ? v1.merge(v2.compact, &Hash::DEEP_MERGER) : (v2.nil? ? v1 : v2)
-  end
+  end unless defined?(::Hash::DEEP_MERGER)
   #
   # Merge hashes recursively.
@@ -72,36 +72,60 @@ class Hash
     merge! hsh2, &Hash::DEEP_MERGER
   end
   #
   # Treat hash as tree of hashes:
   #
   #     x = { 1 => :val, :subhash => { 1 => :val1 } }
-  #     x.deep_set(:subhash, 3, 4)
-  #     # => { 1 => :val, :subhash => { 1 => :val1,   3 => 4 } }
+  #     x.deep_set(:subhash, :cat, :hat)
+  #     # => { 1 => :val, :subhash => { 1 => :val1,   :cat => :hat } }
   #     x.deep_set(:subhash, 1, :newval)
-  #     # => { 1 => :val, :subhash => { 1 => :newval, 3 => 4 } }
+  #     # => { 1 => :val, :subhash => { 1 => :newval, :cat => :hat } }
   #
   #
   def deep_set *args
-    hsh = self
-    head_keys = args[0..-3]
-    last_key  = args[-2]
-    val       = args[-1]
-    # grab last subtree (building out if necessary)
-    head_keys.each{|key| hsh = (hsh[key] ||= {}) }
+    val      = args.pop
+    last_key = args.pop
+    # dig down to last subtree (building out if necessary)
+    hsh = args.empty? ? self : args.inject(self){|hsh, key| hsh[key] ||= {} }
     # set leaf value
     hsh[last_key] = val
   end
-  # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
-  def reverse_merge(other_hash)
-    other_hash.merge(self)
+  #
+  # Treat hash as tree of hashes:
+  #
+  #     x = { 1 => :val, :subhash => { 1 => :val1 } }
+  #     x.deep_get(:subhash, 1)
+  #     # => :val1
+  #     x.deep_get(:subhash, 2)
+  #     # => nil
+  #     x.deep_get(:subhash, 2, 3)
+  #     # => nil
+  #     x.deep_get(:missing, 2)
+  #     # => nil
+  #
+  def deep_get *args
+    last_key = args.pop
+    # dig down to last subtree (treating any missing level as an empty hash; never modifies the receiver)
+    hsh = args.inject(self){|hsh, key| hsh[key] || {} }
+    # get leaf value
+    hsh[last_key]
   end
-  # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
-  def reverse_merge!(other_hash)
-    replace(reverse_merge(other_hash))
+  #
+  # Treat hash as tree of hashes:
+  #
+  #     x = { 1 => :val, :subhash => { 1 => :val1, 2 => :val2 } }
+  #     x.deep_delete(:subhash, 1)
+  #     #=> :val1
+  #     x
+  #     #=> { 1 => :val, :subhash => { 2 => :val2 } }
+  #
+  def deep_delete *args
+    last_key  = args.pop
+    last_hsh  = args.empty? ? self : (deep_get(*args)||{})
+    last_hsh.delete(last_key)
   end
   #
@@ -117,4 +141,14 @@ class Hash
     replace(compact)
   end
+  # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
+  def reverse_merge(other_hash)
+    other_hash.merge(self)
+  end
+  # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
+  def reverse_merge!(other_hash)
+    replace(reverse_merge(other_hash))
+  end
 end

data/lib/wukong/schema.rb CHANGED Viewed

@@ -9,16 +9,16 @@ class << Integer    ; def to_sql() 'INT'                              end ; end
 class << Bignum     ; def to_sql() 'BIGINT'                           end ; end
 class << String     ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
 class << Symbol     ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end
-class << BigDecimal ; def to_pig() 'DECIMAL'                          end ; end if defined?(BigDecimal)
-class << EpochTime  ; def to_pig() 'INT'                              end ; end if defined?(EpochTime)
-class << FilePath   ; def to_pig() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
-class << Flag       ; def to_pig() 'CHAR(1)      CHARACTER SET ASCII' end ; end if defined?(Flag)
-class << IPAddress  ; def to_pig() 'CHAR(15)     CHARACTER SET ASCII' end ; end if defined?(IPAddress)
-class << URI        ; def to_pig() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
-class << Csv        ; def to_pig() 'TEXT'                             end ; end if defined?(Csv)
-class << Yaml       ; def to_pig() 'TEXT'                             end ; end if defined?(Yaml)
-class << Json       ; def to_pig() 'TEXT'                             end ; end if defined?(Json)
-class << Regex      ; def to_pig() 'TEXT'                             end ; end if defined?(Regex)
+class << BigDecimal ; def to_sql() 'DECIMAL'                          end ; end if defined?(BigDecimal)
+class << EpochTime  ; def to_sql() 'INT'                              end ; end if defined?(EpochTime)
+class << FilePath   ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(FilePath)
+class << Flag       ; def to_sql() 'CHAR(1)      CHARACTER SET ASCII' end ; end if defined?(Flag)
+class << IPAddress  ; def to_sql() 'CHAR(15)     CHARACTER SET ASCII' end ; end if defined?(IPAddress)
+class << URI        ; def to_sql() 'VARCHAR(255) CHARACTER SET ASCII' end ; end if defined?(URI)
+class << Csv        ; def to_sql() 'TEXT'                             end ; end if defined?(Csv)
+class << Yaml       ; def to_sql() 'TEXT'                             end ; end if defined?(Yaml)
+class << Json       ; def to_sql() 'TEXT'                             end ; end if defined?(Json)
+class << Regex      ; def to_sql() 'TEXT'                             end ; end if defined?(Regex)
 class String        ; def to_sql() self             ; end ; end
 class Symbol        ; def to_sql() self.to_s.upcase ; end ; end

data/lib/wukong/script.rb CHANGED Viewed

@@ -1,7 +1,8 @@
 require 'pathname'
 require 'wukong/script/hadoop_command'
 require 'wukong/script/local_command'
-require 'rbconfig'
+require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
+require 'rbconfig' # for uncovering ruby_interpreter_path
 module Wukong
   # == How to run a Wukong script
@@ -58,6 +59,38 @@ module Wukong
     include Wukong::LocalCommand
     attr_accessor :mapper_klass, :reducer_klass, :options
+    # ---------------------------------------------------------------------------
+    #
+    # Default options for Wukong
+    #   http://github.com/infochimps/wukong
+    #
+    # If you set an environment variable WUKONG_CONFIG, *or* if the file
+    # $HOME/.wukong.rb exists, that file will be +require+'d as well.
+    #
+    # Important values to set:
+    #
+    # * hadoop_home -- Path to root of hadoop install. If your hadoop runner is
+    #     /usr/local/share/hadoop/bin/hadoop
+    #   then your hadoop_home is
+    #     /usr/local/share/hadoop.
+    #   You can also set a :hadoop_runner that gives the full path to the hadoop script
+    #
+    # * default_run_mode -- Whether to run using hadoop (and
+    #   thus, requiring a working hadoop install), or to run in local mode
+    #   (script --map | sort | script --reduce)
+    #
+    Settings.define :default_run_mode, :default => 'hadoop',    :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
+    Settings.define :default_mapper,   :default => '/bin/cat',  :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
+    Settings.define :default_reducer,  :default => '/bin/cat',  :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
+    Settings.define :hadoop_home,      :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
+    Settings.define :hadoop_runner,    :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
+    Settings.define :map,              :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
+    Settings.define :reduce,           :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
+    Settings.define :run,              :description => "run the script's main phase. In hadoop mode, invokes the hadoop script; in local mode, runs your_script.rb --map | sort | your_script.rb --reduce", :wukong => true
+    Settings.define :local,            :description => "run in local mode (invokes 'your_script.rb --map | sort | your_script.rb --reduce'", :wukong => true
+    Settings.define :hadoop,           :description => "run in hadoop mode (invokes the system hadoop runner script)", :wukong => true
+    Settings.define :dry_run,          :description => "echo the command that will be run, but don't run it", :wukong => true
     #
     # Instantiate the Script with the Mapper and the Reducer class (each a
     # Wukong::Streamer) it should call back.
@@ -86,8 +119,9 @@ module Wukong
     #   MyScript.new(MyMapper, nil).run
     #
     def initialize mapper_klass, reducer_klass, extra_options={}
-      self.options = default_options.merge(extra_options)
-      process_argv!
+      self.options = Settings.dup
+      options.resolve!
+      options.merge! extra_options
       self.mapper_klass  = mapper_klass
       self.reducer_klass = reducer_klass
       # If no reducer_klass and no reduce_command, then skip the reduce phase
@@ -97,65 +131,12 @@ module Wukong
     #
     # Gives default options.  Command line parameters take precedence
     #
-    # MAKE SURE YOU CALL SUPER: write your script according to the patter
+    # MAKE SURE YOU CALL SUPER: write your script according to the pattern
     #
     #   super.merge :my_option => :val
     #
     def default_options
-      Wukong::CONFIG[:runner_defaults] || {}
-    end
-    # Options that don't need to go in the :all_args hash
-    def std_options
-      @std_options ||= [:run, :map, :reduce, ] + HADOOP_OPTIONS_MAP.keys
-    end
-    #
-    # Parse the command-line args into the options hash.
-    #
-    # I should not reinvent the wheel.
-    # Yet: here we are.
-    #
-    # '--foo=foo_val'  produces :foo => 'foo_val' in the options hash.
-    # '--'             After seeing a non-'--' flag, or a '--' on its own, no further flags are parsed
-    #
-    # options[:all_args] contains all arguments that are not in std_options
-    # options[:rest]     contains all arguments following the first non-flag (or the '--')
-    #
-    def process_argv!
-      options[:all_args] = []
-      options[:rest]     = []
-      args      = ARGV.dup
-      while (! args.blank?) do
-        arg = args.shift
-        case
-        when arg == '--'
-          options[:rest] += args
-        when arg =~ /\A--(\w+)(?:=(.+))?\z/
-          opt, val = [$1, $2]
-          opt = opt.to_sym
-          val ||= true
-          self.options[opt] = val
-          options[:all_args] << arg unless std_options.include?(opt)
-        else
-          options[:all_args]  << arg
-          options[:rest]      << arg
-        end
-        # p [options, arg, args]
-      end
-      options[:all_args] = options[:all_args].join(" ")
-    end
-    def this_script_filename
-      Pathname.new($0).realpath
-    end
-    def ruby_interpreter_path
-      Pathname.new(
-                   File.join(Config::CONFIG["bindir"],
-                             Config::CONFIG["RUBY_INSTALL_NAME"]+
-                             Config::CONFIG["EXEEXT"])
-                   ).realpath
+      {}
     end
     #
@@ -164,8 +145,8 @@ module Wukong
     def map_command
       case
       when mapper_klass
-        "#{ruby_interpreter_path} #{this_script_filename} --map " + options[:all_args]
-      else options[:map_command] || Wukong::CONFIG[:default_mapper] end
+        "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+      else options[:map_command] || options[:default_mapper] end
     end
     #
@@ -175,8 +156,8 @@ module Wukong
     def reduce_command
       case
       when reducer_klass
-        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + options[:all_args]
-      else options[:reduce_command] || Wukong::CONFIG[:default_reducer] end
+        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+      else options[:reduce_command] || options[:default_reducer] end
     end
     #
@@ -197,14 +178,16 @@ module Wukong
     end
     def run_mode
+      return 'local'  if options[:local]
+      return 'hadoop' if options[:hadoop]
       # if only --run is given, assume default run mode
-      options[:run] = Wukong::CONFIG[:default_run_mode] if (options[:run] == true)
+      options[:run] = options[:default_run_mode] if (options[:run] == true)
       options[:run].to_s
     end
     def input_output_paths
       # input / output paths
-      input_path, output_path = options[:rest][0..1]
+      input_path, output_path = options.rest[0..1]
       raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_path.blank? || output_path.blank?)
       [input_path, output_path]
     end
@@ -216,6 +199,29 @@ module Wukong
       end
     end
+    # Reassemble all the non-internal-to-wukong options into a command line for
+    # the map/reducer phase scripts
+    def non_wukong_params
+      options.
+        reject{|param, val| options.param_definitions[param][:wukong] }.
+        map{|param,val| "--#{param}=#{val}" }.
+        join(" ")
+    end
+    # the full, real path to the script file
+    def this_script_filename
+      Pathname.new($0).realpath
+    end
+    # use the full ruby interpreter path to run slave processes
+    def ruby_interpreter_path
+      Pathname.new(
+                   File.join(Config::CONFIG["bindir"],
+                             Config::CONFIG["RUBY_INSTALL_NAME"]+
+                             Config::CONFIG["EXEEXT"])
+                   ).realpath
+    end
     #
     # Execute the runner phase
     #
@@ -243,41 +249,15 @@ module Wukong
       when options[:run]
         exec_hadoop_streaming
       else
-        self.help # Normant Vincent Peale is proud of you
+        options.dump_help %Q{Please specify a run mode: you probably want to start with
+  #{$0} --run --local input.tsv output.tsv
+although
+  cat input.tsv | #{$0} --map > mapped.tsv
+or
+  cat mapped.tsv | sort | #{$0} --reduce > reduced.tsv
+can be useful for initial testing.}
       end
     end
-    #
-    # Command line usage
-    #
-    def help
-      $stderr.puts "#{self.class} script"
-      $stderr.puts %Q{
-        #{$0} --run=hadoop input_hdfs_path output_hdfs_dir    # run the script with hadoop streaming
-        #{$0} --run=local  input_hdfs_path output_hdfs_dir    # run the script on local filesystem using unix pipes
-        #{$0} --run        input_hdfs_path output_hdfs_dir    # run the script with the mode given in config/wukong*.yaml
-        #{$0} --map
-        #{$0} --reduce                                        # dispatch to the mapper or reducer
-      All flags must precede the input and output paths.
-      Additional flags:
-        --dry_run
-      Hadoop Options (see hadoop documentation)
-        --max_node_map_tasks     => 'mapred.tasktracker.map.tasks.maximum',
-        --max_node_reduce_tasks  => 'mapred.tasktracker.reduce.tasks.maximum',
-        --map_tasks              => 'mapred.map.tasks',
-        --reduce_tasks           => 'mapred.reduce.tasks',
-        --sort_fields            => 'stream.num.map.output.key.fields',
-        --key_field_separator    => 'map.output.key.field.separator',
-        --partition_fields       => 'num.key.fields.for.partition',
-        --output_field_separator => 'stream.map.output.field.separator',
-        --map_speculative        => 'mapred.map.tasks.speculative.execution',
-        --timeout                => 'mapred.task.timeout',
-        --reuse_jvms             => 'mapred.job.reuse.jvm.num.tasks',
-        --ignore_exit_status     => 'stream.non.zero.exit.status.is.failure',
-      You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
-      }
-    end
   end
 end