RubyGems - wukong - Versions diffs - 1.5.4 → 2.0.0 - Mend

wukong 1.5.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

data/CHANGELOG.textile +32 -0
data/README.textile +58 -12
data/TODO.textile +0 -8
data/bin/hdp-bzip +12 -17
data/bin/hdp-kill-task +1 -1
data/bin/hdp-sort +7 -7
data/bin/hdp-stream +7 -7
data/bin/hdp-stream-flat +2 -3
data/bin/setcat +11 -0
data/bin/uniq-ord +59 -0
data/examples/corpus/bucket_counter.rb +47 -0
data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
data/examples/corpus/sentence_coocurrence.rb +70 -0
data/examples/emr/README.textile +110 -0
data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
data/examples/emr/elastic_mapreduce_example.rb +2 -2
data/examples/ignore_me/counting.rb +56 -0
data/examples/ignore_me/grouper.rb +71 -0
data/examples/network_graph/adjacency_list.rb +2 -2
data/examples/network_graph/breadth_first_search.rb +14 -21
data/examples/network_graph/gen_multi_edge.rb +22 -13
data/examples/pagerank/pagerank.rb +1 -1
data/examples/pagerank/pagerank_initialize.rb +6 -10
data/examples/sample_records.rb +6 -16
data/examples/server_logs/apache_log_parser.rb +7 -22
data/examples/server_logs/breadcrumbs.rb +39 -0
data/examples/server_logs/logline.rb +27 -0
data/examples/size.rb +3 -2
data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
data/examples/stupidly_simple_filter.rb +11 -14
data/examples/word_count.rb +16 -36
data/lib/wukong/and_pig.rb +2 -15
data/lib/wukong/logger.rb +7 -28
data/lib/wukong/periodic_monitor.rb +24 -9
data/lib/wukong/script/emr_command.rb +1 -0
data/lib/wukong/script/hadoop_command.rb +31 -29
data/lib/wukong/script.rb +19 -14
data/lib/wukong/store/cassandra_model.rb +2 -1
data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
data/lib/wukong/streamer/base.rb +44 -3
data/lib/wukong/streamer/counting_reducer.rb +12 -12
data/lib/wukong/streamer/filter.rb +2 -2
data/lib/wukong/streamer/list_reducer.rb +3 -3
data/lib/wukong/streamer/reducer.rb +11 -0
data/lib/wukong/streamer.rb +7 -3
data/lib/wukong.rb +7 -3
data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
data/wukong.gemspec +257 -285
metadata +45 -62
data/examples/cassandra_streaming/avromapper.rb +0 -85
data/examples/cassandra_streaming/cassandra.avpr +0 -468
data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
data/examples/cassandra_streaming/catter.sh +0 -45
data/examples/cassandra_streaming/client_schema.avpr +0 -211
data/examples/cassandra_streaming/foofile.avr +0 -0
data/examples/cassandra_streaming/pymap.sh +0 -1
data/examples/cassandra_streaming/pyreduce.sh +0 -1
data/examples/cassandra_streaming/smutation.avpr +0 -188
data/examples/cassandra_streaming/streamer.sh +0 -51
data/examples/cassandra_streaming/struct_loader.rb +0 -24
data/examples/count_keys.rb +0 -56
data/examples/count_keys_at_mapper.rb +0 -57
data/examples/emr/README-elastic_map_reduce.textile +0 -26
data/examples/keystore/cassandra_batch_test.rb +0 -41
data/examples/keystore/conditional_outputter_example.rb +0 -70
data/examples/store/chunked_store_example.rb +0 -18
data/lib/wukong/dfs.rb +0 -81
data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
data/lib/wukong/keystore/redis_db.rb +0 -24
data/lib/wukong/keystore/tyrant_db.rb +0 -137
data/lib/wukong/keystore/tyrant_notes.textile +0 -145
data/lib/wukong/models/graph.rb +0 -25
data/lib/wukong/monitor/chunked_store.rb +0 -23
data/lib/wukong/monitor/periodic_logger.rb +0 -34
data/lib/wukong/monitor/periodic_monitor.rb +0 -70
data/lib/wukong/monitor.rb +0 -7
data/lib/wukong/rdf.rb +0 -104
data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
data/lib/wukong/streamer/count_keys.rb +0 -30
data/lib/wukong/streamer/count_lines.rb +0 -26
data/lib/wukong/streamer/em_streamer.rb +0 -7
data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
data/lib/wukong/wukong_class.rb +0 -21

data/examples/word_count.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 require 'rubygems'
-require 'wukong'
+require 'wukong/script'
 module WordCount
   class Mapper < Wukong::Streamer::LineStreamer
@@ -10,22 +10,22 @@ module WordCount
     # This is pretty simpleminded:
     # * downcase the word
     # * Split at any non-alphanumeric boundary, including '_'
-    # * However, preserve the special cases of 's or 't at the end of a
+    # * However, preserve the special cases of 's, 'd or 't at the end of a
     #   word.
     #
-    #   tokenize("Jim's dawg won't hunt: dawg_hunt error #3007a4")
-    #   # => ["jim's", "dawd", "won't", "hunt", "dawg", "hunt", "error", "3007a4"]
+    #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
+    #   # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
     #
     def tokenize str
-      return [] unless str
+      return [] if str.blank?
       str = str.downcase;
       # kill off all punctuation except [stuff]'s or [stuff]'t
       # this includes hyphens (words are split)
       str = str.
         gsub(/[^a-zA-Z0-9\']+/, ' ').
-        gsub(/(\w)\'([st])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
+        gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
       # Busticate at whitespace
-      words = str.strip.split(/\s+/)
+      words = str.split(/\s+/)
       words.reject!{|w| w.blank? }
       words
     end
@@ -39,31 +39,13 @@ module WordCount
   end
   #
-  # Accumulate the sum record-by-record:
+  # You can stack up all the values in a list then sum them at once.
   #
-  class Reducer0 < Wukong::Streamer::Base
-    attr_accessor :key_count
-    def process word, count
-      @last_word ||= word
-      if (@last_word == word)
-        self.key_count += 1
-      else
-        yield [ @last_word, key_count ]
-        @last_word = word
-      end
-    end
-    def stream
-      emit @last_word, key_count
-    end
-  end
-  #
-  # You can stack up all the values in a list then sum them at once:
+  # This isn't good style, as it means the whole list is held in memory
   #
-  require 'active_support/core_ext/enumerable'
   class Reducer1 < Wukong::Streamer::ListReducer
     def finalize
-      yield [ key, values.map(&:last).map(&:to_i).sum ]
+      yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
     end
   end
@@ -71,11 +53,10 @@ module WordCount
   # A bit kinder to your memory manager: accumulate the sum record-by-record:
   #
   class Reducer2 < Wukong::Streamer::AccumulatingReducer
-    attr_accessor :key_count
-    def start!(*args)      self.key_count =  0 end
-    def accumulate(*args)  self.key_count += 1 end
+    def start!(*args)      @key_count =  0 end
+    def accumulate(*args)  @key_count += 1 end
     def finalize
-      yield [ key, key_count ]
+      yield [ key, @key_count ]
     end
   end
@@ -85,11 +66,10 @@ module WordCount
   require 'wukong/streamer/count_keys'
   class Reducer3 < Wukong::Streamer::CountKeys
   end
 end
 # Execute the script
-Wukong::Script.new(
+Wukong.run(
   WordCount::Mapper,
-  WordCount::Reducer1
-  ).run
+  WordCount::Reducer
+  )

data/lib/wukong/and_pig.rb CHANGED Viewed

@@ -2,19 +2,13 @@ module Enumerable
   #
   # Convert an array of values to a string representing it as a pig tuple
   #
-  # def to_pig_tuple
-  #   map{|*vals| '(' + vals.join(',') + ')' }
-  # end
-  #
-  # Convert an array to a pig tuple
-  #
   def to_pig_tuple
     '(' + self.join(',') + ')'
   end
   #
   # Convert an array of values to a string pig format
-  # Delegates to to_pig_tuple -- see also to_pig_bag
+  # see also to_pig_bag
   #
   def to_pig *args
     to_pig_tuple *args
@@ -23,13 +17,6 @@ module Enumerable
   #
   # Convert an array of values to a string representing it as a pig bag
   #
-  # def to_pig_bag
-  #   '{' + self.join(',') + '}'
-  # end
-  #
-  # Convert and array of values to a string representing it as a pig bag
-  #
   def to_pig_bag
     '{' + self.map{|*vals| vals.to_pig_tuple}.join(",") + '}'
   end

data/lib/wukong/logger.rb CHANGED Viewed

@@ -13,37 +13,15 @@ module Wukong
   #     I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
   #
   def self.logger
-    @logger ||= default_ruby_logger
-  end
-  #
-  # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
-  # friendly) output lines
-  #
-  def self.default_log4r_logger logger_handle='wukong'
-    require 'log4r'
-    lgr       = Log4r::Logger.new logger_handle
-    outputter = Log4r::Outputter.stderr
-    # Define timestamp formatter method
-    ::Time.class_eval do def utc_iso8601() utc.iso8601 ; end ; end
-    # 2009-07-25T00:12:05Z INFO PID\t
-    outputter.formatter  = Log4r::PatternFormatter.new(
-      :pattern     => "%d %.4l #{Process.pid}\t%.2000m",
-      :date_method => :utc_iso8601
-      )
-    lgr.outputters = outputter
-    lgr
-  end
-  def self.default_ruby_logger
+    return @logger if @logger
     require 'logger'
-    logger = Logger.new STDERR
-    logger.instance_eval do
+    @logger = Logger.new STDERR
+    @logger.instance_eval do
       def dump *args
         debug args.inspect
       end
     end
-    logger
+    @logger
   end
   def self.logger= logger
@@ -54,6 +32,7 @@ end
 #
 # A convenient logger.
 #
-# Define NO_WUKONG_LOG (or define Log yourself) to prevent its creation
+# define Log yourself to prevent its creation
 #
-Log = Wukong.logger unless (defined?(Log) || defined?(NO_WUKONG_LOG))
+Log         = Wukong.logger       unless defined?(Log)

data/lib/wukong/periodic_monitor.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-Settings.define :log_interval, :default => 1000, :type => Integer, :description => 'How many iterations between log statements'
+Settings.define :log_interval, :default => 10_000, :type => Integer, :description => 'How many iterations between log statements'
+Settings.define :log_seconds,  :default => 30,     :type => Integer, :description => 'How many seconds between log statements'
 #
 # Periodic monitor
@@ -9,40 +10,48 @@ Settings.define :log_interval, :default => 1000, :type => Integer, :description
 class PeriodicMonitor
   attr_reader   :iter, :start_time, :options
   attr_accessor :interval
+  attr_accessor :time_interval
   def initialize extra_options={}
-    @options      = {}
+    @options       = {}
     @options.deep_merge!( extra_options || {} )
-    @iter         = 0
-    @start_time   = now
-    @interval     = (options[:log_interval] || Settings[:log_interval]).to_i
-    @interval = 1000 unless @interval >= 1
+    @iter          = 0
+    @start_time    = now
+    @last_report   = @start_time
+    @interval      = (options[:log_interval] || Settings[:log_interval]).to_i
+    @interval      = 1000 unless @interval >= 1
+    @time_interval = (options[:log_seconds]  || Settings[:log_seconds]).to_i
   end
   def periodically *args, &block
     incr!
     if ready?
+      @last_report = Time.now
       if block
         block.call(iter, *args)
       else
-        $stderr.puts progress(*args)
+        self.emit progress(*args)
       end
     end
   end
+  def emit log_line
+    Log.info log_line
+  end
   def incr!
     @iter += 1
   end
   def ready?
-    iter % @interval == 0
+    (iter % @interval == 0) || (since > time_interval)
   end
   def progress *stuff
     [
       "%15d" % iter,
       "%7.1f"% elapsed_time, "sec",
-      "%7.1f"%(iter.to_f / elapsed_time), "/sec",
+      "%7.1f"% rate, "/sec",
       now.to_flat,
       *stuff
     ].flatten.join("\t")
@@ -51,7 +60,13 @@ class PeriodicMonitor
   def elapsed_time
     now - start_time
   end
+  def since
+    now - @last_report
+  end
   def now
     Time.now.utc
   end
+  def rate
+    iter.to_f / elapsed_time
+  end
 end

data/lib/wukong/script/emr_command.rb CHANGED Viewed

@@ -49,6 +49,7 @@ module Wukong
     end
     def execute_emr_runner
+      # fix_paths!
       command_args = []
       if Settings.jobflow
         command_args << Settings.dashed_flag_for(:jobflow)

data/lib/wukong/script/hadoop_command.rb CHANGED Viewed

@@ -12,27 +12,27 @@ module Wukong
     #
     # Translate simplified args to their hairy hadoop equivalents
     #
-    Settings.define :max_node_map_tasks,     :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',                   :wukong => true
-    Settings.define :max_node_reduce_tasks,  :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum',                :wukong => true
-    Settings.define :map_tasks,              :jobconf => true, :description => 'mapred.map.tasks',                                       :wukong => true
-    Settings.define :reduce_tasks,           :jobconf => true, :description => 'mapred.reduce.tasks',                                    :wukong => true
-    Settings.define :sort_fields,            :jobconf => true, :description => 'stream.num.map.output.key.fields',                       :wukong => true
-    Settings.define :key_field_separator,    :jobconf => true, :description => 'map.output.key.field.separator',                         :wukong => true
-    Settings.define :partition_fields,       :jobconf => true, :description => 'num.key.fields.for.partition',                           :wukong => true
-    Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator',                      :wukong => true
-    Settings.define :map_speculative,        :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',                 :wukong => true
-    Settings.define :timeout,                :jobconf => true, :description => 'mapred.task.timeout',                                    :wukong => true
-    Settings.define :reuse_jvms,             :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',                         :wukong => true
-    Settings.define :respect_exit_status,    :jobconf => true, :description => 'stream.non.zero.exit.is.failure',                        :wukong => true
     Settings.define :io_sort_mb,             :jobconf => true, :description => 'io.sort.mb',                                             :wukong => true
     Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent',                                 :wukong => true
     Settings.define :job_name,               :jobconf => true, :description => 'mapred.job.name',                                        :wukong => true
-    Settings.define :max_reduces_per_node,   :jobconf => true, :description => 'mapred.max.reduces.per.node',                            :wukong => true
-    Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster',                         :wukong => true
-    Settings.define :max_maps_per_node,      :jobconf => true, :description => 'mapred.max.maps.per.node',                               :wukong => true
+    Settings.define :key_field_separator,    :jobconf => true, :description => 'map.output.key.field.separator',                         :wukong => true
+    Settings.define :map_speculative,        :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',                 :wukong => true
+    Settings.define :map_tasks,              :jobconf => true, :description => 'mapred.map.tasks',                                       :wukong => true
     Settings.define :max_maps_per_cluster,   :jobconf => true, :description => 'mapred.max.maps.per.cluster',                            :wukong => true
+    Settings.define :max_maps_per_node,      :jobconf => true, :description => 'mapred.max.maps.per.node',                               :wukong => true
+    Settings.define :max_node_map_tasks,     :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',                   :wukong => true
+    Settings.define :max_node_reduce_tasks,  :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum',                :wukong => true
     Settings.define :max_record_length,      :jobconf => true, :description => 'mapred.linerecordreader.maxlength',                      :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
+    Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster',                         :wukong => true
+    Settings.define :max_reduces_per_node,   :jobconf => true, :description => 'mapred.max.reduces.per.node',                            :wukong => true
     Settings.define :min_split_size,         :jobconf => true, :description => 'mapred.min.split.size',                                  :wukong => true
+    Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator',                      :wukong => true
+    Settings.define :partition_fields,       :jobconf => true, :description => 'num.key.fields.for.partition',                           :wukong => true
+    Settings.define :reduce_tasks,           :jobconf => true, :description => 'mapred.reduce.tasks',                                    :wukong => true
+    Settings.define :respect_exit_status,    :jobconf => true, :description => 'stream.non.zero.exit.is.failure',                        :wukong => true
+    Settings.define :reuse_jvms,             :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',                         :wukong => true
+    Settings.define :sort_fields,            :jobconf => true, :description => 'stream.num.map.output.key.fields',                       :wukong => true
+    Settings.define :timeout,                :jobconf => true, :description => 'mapred.task.timeout',                                    :wukong => true
     Settings.define :noempty,                                  :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
     Settings.define :split_on_xml_tag,                         :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
@@ -60,7 +60,7 @@ module Wukong
       # Use Settings[:hadoop_home] to set the path your config install.
       hadoop_commandline = [
         hadoop_runner,
-        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+        "jar #{options[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
         hadoop_jobconf_options,
         "-D mapred.job.name='#{job_name}'",
         hadoop_other_args,
@@ -68,6 +68,7 @@ module Wukong
         "-reducer '#{reducer_commandline}'",
         "-input   '#{input_paths}'",
         "-output  '#{output_path}'",
+        "-file    '#{this_script_filename}'",
         hadoop_recycle_env,
       ].flatten.compact.join(" \t\\\n  ")
       Log.info "  Launching hadoop!"
@@ -79,8 +80,8 @@ module Wukong
       # Fixup these options
       options[:reuse_jvms] = '-1'             if (options[:reuse_jvms] == true)
       options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
-      # If no reducer_klass and no reduce_command, then skip the reduce phase
-      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
+      # If no reducer and no reduce_command, then skip the reduce phase
+      options[:reduce_tasks] = 0 if (! reducer) && (! options[:reduce_command]) && (! options[:reduce_tasks])
       # Fields hadoop should use to distribute records to reducers
       unless options[:partition_fields].blank?
         jobconf_options += [
@@ -89,23 +90,24 @@ module Wukong
         ]
       end
       jobconf_options += [
-        :key_field_separator,  :sort_fields,
-        :map_tasks,            :reduce_tasks,
-        :max_node_map_tasks,   :max_node_reduce_tasks,
-        :max_reduces_per_node, :max_reduces_per_cluster,
-        :max_maps_per_node,    :max_maps_per_cluster,
-        :min_split_size,
-        :map_speculative,
-        :timeout,
-        :reuse_jvms, :respect_exit_status
+        :io_sort_mb,               :io_sort_record_percent,
+        :map_speculative,          :map_tasks,
+        :max_maps_per_cluster,     :max_maps_per_node,
+        :max_node_map_tasks,       :max_node_reduce_tasks,
+        :max_reduces_per_cluster,  :max_reduces_per_node,
+        :max_record_length,        :min_split_size,
+        :output_field_separator,   :key_field_separator,
+        :partition_fields,         :sort_fields,
+        :reduce_tasks,             :respect_exit_status,
+        :reuse_jvms,               :timeout,
       ].map{|opt| jobconf(opt)}
       jobconf_options.flatten.compact
     end
     def hadoop_other_args
       extra_str_args  = [ options[:extra_args] ]
-      if Settings.split_on_xml_tag
-        extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
+      if options.split_on_xml_tag
+        extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{options.split_on_xml_tag}>,end=</#{options.split_on_xml_tag}>'}
       end
       extra_str_args   << ' -lazyOutput' if options[:noempty]  # don't create reduce file if no records
       extra_str_args   << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?

data/lib/wukong/script.rb CHANGED Viewed

@@ -1,8 +1,10 @@
 require 'pathname'
+require 'configliere' ; Settings.use(:commandline, :env_var, :define)
+require 'wukong'
 require 'wukong/script/hadoop_command'
 require 'wukong/script/local_command'
-require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
 require 'rbconfig' # for uncovering ruby_interpreter_path
+require 'wukong/streamer' ; include Wukong::Streamer
 module Wukong
   # == How to run a Wukong script
   #
@@ -63,7 +65,7 @@ module Wukong
   class Script
     include Wukong::HadoopCommand
     include Wukong::LocalCommand
-    attr_reader :mapper_klass, :reducer_klass, :options
+    attr_reader :mapper, :reducer, :options
     attr_reader :input_paths, :output_path
     # ---------------------------------------------------------------------------
@@ -122,12 +124,12 @@ module Wukong
     #   end
     #   MyScript.new(MyMapper, nil).run
     #
-    def initialize mapper_klass, reducer_klass=nil, extra_options={}
+    def initialize mapper, reducer=nil, extra_options={}
       Settings.resolve!
-      @options = Settings.dup
-      options.merge! extra_options
-      @mapper_klass  = mapper_klass
-      @reducer_klass = reducer_klass
+      @options = Settings
+      options.merge extra_options
+      @mapper  = (case mapper  when Class then mapper.new  when nil then nil else mapper  ; end)
+      @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
       @output_path = options.rest.pop
       @input_paths = options.rest.reject(&:blank?)
       if (input_paths.blank? || output_path.blank?) && (not options[:dry_run]) && (not ['map', 'reduce'].include?(run_mode))
@@ -142,8 +144,8 @@ module Wukong
     #
     def run
       case run_mode
-      when 'map'              then mapper_klass.new(self.options).stream
-      when 'reduce'           then reducer_klass.new(self.options).stream
+      when 'map'              then mapper.stream
+      when 'reduce'           then reducer.stream
       when 'local'            then execute_local_workflow
       when 'cassandra'        then execute_hadoop_workflow
       when 'hadoop', 'mapred' then execute_hadoop_workflow
@@ -172,8 +174,9 @@ module Wukong
     # In local mode, it's given to the system() call
     #
     def mapper_commandline
-      if mapper_klass
+      if mapper
         "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+        # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
       else
         options[:map_command]
       end
@@ -185,8 +188,9 @@ module Wukong
     # In local mode, it's given to the system() call
     #
     def reducer_commandline
-      if reducer_klass
-         "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+      if reducer
+        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+        # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
       else
         options[:reduce_command]
       end
@@ -228,8 +232,9 @@ module Wukong
     #
     def maybe_overwrite_output_paths! output_path
       if (options[:overwrite] || options[:rm]) && (run_mode == 'hadoop')
-        Log.info "Removing output file #{output_path}"
-        `hdp-rm -r '#{output_path}'`
+        cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
+        Log.info "Removing output file #{output_path}: #{cmd}"
+        puts `#{cmd}`
       end
     end

data/lib/wukong/store/cassandra_model.rb CHANGED Viewed

@@ -26,10 +26,11 @@ module Wukong
       #
       def to_db_hash
         db_hsh = {}
-        to_hash.each{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
+        each_pair{|k,v| db_hsh[k.to_s] = v.to_s unless v.nil? }
         db_hsh
       end
       module ClassMethods
         # Cassandra column family -- taken from the class name by default.
         def table_name

data/lib/wukong/streamer/accumulating_reducer.rb CHANGED Viewed

@@ -15,10 +15,6 @@
      #
      class AccumulatingReducer < Wukong::Streamer::Base
        attr_accessor :key
-       def initialize options
-         super options
-         self.key = :__first_pass__
-       end
        #
        # override for multiple-field keys, etc.
@@ -57,15 +53,12 @@
        # start! is called on the the first record of the new key
        #
        def start! *args
-         raise %Q{start! is the new reset! -- it has args now, namely the first
-         record of the new key.  It doesn\'t want #super either}
        end
        #
        # Override this to accumulate each record for the given key in turn.
        #
        def accumulate *args, &block
-         raise "override the accumulate method in your subclass"
        end
        #
@@ -73,7 +66,11 @@
        # You must override this method.
        #
        def finalize
-         raise "override the finalize method in your subclass"
+       end
+       # make a sentinel
+       def before_stream
+         self.key = :__first_pass__
        end
        # Finalize the last-seen group.
@@ -82,6 +79,5 @@
          super *args
        end
      end
    end
 end

data/lib/wukong/streamer/base.rb CHANGED Viewed

@@ -4,13 +4,17 @@ module Wukong
       # Options, initially set from the command-line args -- see
       # Script#process_argv!
-      attr_accessor :options
+      attr_reader :own_options
       #
       # Accepts option hash from script runner
       #
       def initialize options={}
-        self.options = options
+        @own_options = options
+      end
+      def options
+        Settings.deep_merge own_options
       end
       #
@@ -24,6 +28,7 @@ module Wukong
           process(*record) do |output_record|
             emit output_record
           end
+          monitor.periodically(record.to_s[0..1000])
         end
         after_stream
       end
@@ -64,7 +69,6 @@ module Wukong
       # Process each record in turn, yielding the records to emit
       #
       def process *args, &block
-        raise "override the process method in your implementation: it should process each record."
       end
       #
@@ -75,6 +79,43 @@ module Wukong
         warn "Bad record #{args.inspect[0..400]}"
         puts ["bad_record-"+key, *args].join("\t")
       end
+      # A periodic logger to track progress
+      def monitor
+        @monitor ||= PeriodicMonitor.new
+      end
+      # Defines a process method on the fly to execute the given mapper.
+      #
+      # This is still experimental.
+      # Among other limitations, you can't use ++yield++ -- you have to call
+      # emit() directly.
+      def mapper &mapper_block
+        @mapper_block = mapper_block.to_proc
+        self.instance_eval do
+          def process *args, &block
+            instance_exec(*args, &@mapper_block)
+          end
+        end
+        self
+      end
+      # Creates a new object of this class and injects the given block
+      # as the process method
+      def self.mapper *args, &block
+        self.new.mapper *args, &block
+      end
+      # Delegates back to Wukong to run this instance as a mapper
+      def run options={}
+        Wukong.run(self, nil, options)
+      end
+      # Creates a new object of this class and runs it
+      def self.run options={}
+        Wukong.run(self.new, nil, options)
+      end
     end
   end
 end

data/lib/wukong/streamer/counting_reducer.rb CHANGED Viewed

@@ -1,23 +1,23 @@
 module Wukong
   module Streamer
     #
-    # Count the number of records for each key.
+    # Emit each unique key and the count of its occurrences
     #
-    class CountingReducer < AccumulatingReducer
-      attr_accessor :count
+    class CountingReducer < Wukong::Streamer::AccumulatingReducer
-      # start the sum with 0 for each key
-      def start! *_
-        self.count = 0
+      # reset the counter to zero
+      def start! *args
+        @count = 0
       end
-      # ... and count the number of records for this key
-      def accumulate *_
-        self.count += 1
+      # record one more for this key
+      def accumulate *vals
+        @count += 1
       end
-      # emit [key, count]
+      # emit each key field and the count, tab-separated.
       def finalize
-        yield [key, count].flatten
+        yield [key, @count]
       end
     end

data/lib/wukong/streamer/filter.rb CHANGED Viewed

@@ -12,8 +12,8 @@ module Wukong
       #
       # Subclass and re-define the emit? method
       #
-      def process *record, &block
-        yield record if emit?(record)
+      def process *record
+        yield record if emit?(*record)
       end
     end
   end