RubyGems - wukong - Versions diffs - 2.0.0 → 2.0.1 - Mend

wukong 2.0.0 → 2.0.1

Files changed (38) hide show

data/README.textile +0 -1
data/TODO.textile +6 -0
data/examples/corpus/dbpedia_abstract_to_sentences.rb +1 -0
data/examples/corpus/sentence_bigrams.rb +53 -0
data/examples/corpus/sentence_coocurrence.rb +1 -5
data/examples/corpus/words_to_bigrams.rb +2 -1
data/examples/ignore_me/counting.rb +1 -2
data/examples/network_graph/adjacency_list.rb +1 -1
data/examples/network_graph/breadth_first_search.rb +1 -1
data/examples/network_graph/gen_2paths.rb +2 -2
data/examples/network_graph/gen_multi_edge.rb +0 -1
data/examples/network_graph/gen_symmetric_links.rb +1 -0
data/examples/pagerank/pagerank.rb +5 -21
data/examples/pagerank/pagerank_initialize.rb +1 -1
data/examples/server_logs/apache_log_parser.rb +8 -48
data/examples/server_logs/logline.rb +37 -13
data/examples/server_logs/nook.rb +48 -0
data/examples/server_logs/nook/faraday_dummy_adapter.rb +94 -0
data/examples/simple_word_count.rb +82 -0
data/examples/size.rb +1 -1
data/examples/stats/binning_percentile_estimator.rb +1 -1
data/examples/stats/rank_and_bin.rb +1 -1
data/examples/stupidly_simple_filter.rb +1 -1
data/lib/wukong.rb +1 -1
data/lib/wukong/extensions.rb +2 -2
data/lib/wukong/extensions/blank.rb +6 -6
data/lib/wukong/extensions/hash.rb +9 -9
data/lib/wukong/extensions/hash_like.rb +2 -2
data/lib/wukong/extensions/symbol.rb +1 -1
data/lib/wukong/logger.rb +1 -1
data/lib/wukong/periodic_monitor.rb +2 -2
data/lib/wukong/script.rb +18 -8
data/lib/wukong/script/emr_command.rb +6 -4
data/lib/wukong/script/hadoop_command.rb +9 -4
data/lib/wukong/script/local_command.rb +7 -1
data/lib/wukong/streamer/base.rb +6 -2
data/wukong.gemspec +11 -3
metadata +11 -34

data/README.textile CHANGED

@@ -19,7 +19,6 @@ The **main documentation** lives on the "Wukong Pages.":http://mrflip.github.com
 * Wukong is licensed under the "Apache License":http://mrflip.github.com/wukong/LICENSE.html (same as Hadoop)
 * "More info":http://mrflip.github.com/wukong/moreinfo.html
 h2. Help!
 Send Wukong questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code

data/TODO.textile CHANGED

@@ -3,3 +3,9 @@
 ** We should be able to specify comma *or* space separated paths; the last
    space-separated path in Settings.rest becomes the output file, the others are
    used as the input_file list.
+  at_exit do
+    if $!.nil? && $0 == Goliath::Application.app_file
+      Application.run!
+    end
+  end

data/examples/corpus/dbpedia_abstract_to_sentences.rb CHANGED

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+require 'rubygems'
 require 'wukong/script'
 #

data/examples/corpus/sentence_bigrams.rb ADDED

@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+$: << File.dirname(__FILE__)
+require 'rubygems'
+require 'wukong/script'
+require 'bucket_counter'
+#
+# Sentence bigram counts
+#
+#
+# Input is a list of document-idx-sentences, each field is tab-separated
+#   title   idx   word_a    word_b    word_c ...
+#
+# This emits each pair of adjacent words (bigram) in sentence order; in the
+# case of a three-word sentence the output would be
+#
+#   word_a  word_b
+#   word_b  word_c
+#
+class SentenceBigrams < Wukong::Streamer::RecordStreamer
+  def process title, idx, *words
+    words[0..-2].zip(words[1..-1]).each do |word_a, word_b|
+      yield [word_a, word_b]
+    end
+  end
+end
+#
+# Combine multiple bucket counts into a single one
+#
+class CombineBuckets < Wukong::Streamer::AccumulatingReducer
+  def get_key *fields
+    fields[0..1]
+  end
+  def start! *args
+    @total = 0
+  end
+  def accumulate *fields
+    @total += 1
+  end
+  def finalize
+    yield [@total, key].flatten
+  end
+end
+Wukong.run(
+  SentenceBigrams,
+  CombineBuckets,
+  :io_sort_record_percent => 0.3,
+  :io_sort_mb => 300
+  )

data/examples/corpus/sentence_coocurrence.rb CHANGED

@@ -26,11 +26,7 @@ class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
   end
   def process title, idx, *words
-    words.each_with_index do |word_a, idx|
-      words[(idx+1) .. -1].each do |word_b|
-        @bucket << [word_a, word_b]
-      end
-    end
+    @bucket << words[0..-2].zip(words[1..-1])
     dump_bucket if @bucket.full?
   end

data/examples/corpus/words_to_bigrams.rb CHANGED

@@ -1,5 +1,6 @@
 #!/usr/bin/env ruby
-require 'wukong'
+require 'rubygems'
+require 'wukong/script'
 #
 # Bigram counts

data/examples/ignore_me/counting.rb CHANGED

@@ -1,7 +1,6 @@
 #!/usr/bin/env ruby
 require 'rubygems'
-require 'wukong'
+require 'wukong/script'
 require 'bloomfilter-rb'

data/examples/network_graph/adjacency_list.rb CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../lib'
+require 'rubygems'
 require 'wukong/script'
 #

data/examples/network_graph/breadth_first_search.rb CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../lib'
+require 'rubygems'
 require 'wukong/script'
 #

data/examples/network_graph/gen_2paths.rb CHANGED

@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../../lib'
-require 'wukong'
+require 'rubygems'
+require 'wukong/script'
 class Edge < Struct.new(:src, :dest)
 end

data/examples/network_graph/gen_multi_edge.rb CHANGED

@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 require 'rubygems'
-$: << File.dirname(__FILE__)+'/../../lib'
 require 'wukong'
 #

data/examples/network_graph/gen_symmetric_links.rb CHANGED

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+require 'rubygems'
 $: << File.dirname(__FILE__)+'/../../lib'
 require 'wukong'

data/examples/pagerank/pagerank.rb CHANGED

@@ -1,11 +1,7 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../../lib'
+require 'rubygems'
 require 'wukong/script'
-#
-#
-#
 module PageRank
   #
   # Damping factor (prob. of a 'random' jump)
@@ -13,16 +9,12 @@ module PageRank
   #
   DAMPING_FACTOR = 0.85
-  #
   # Each user's line looks like
-  #
   #   user_a    pagerank        id1,id2,...,idN
-  #
   # we need to disperse this user's pagerank to each of id1..idN, and
   # rendezvous the list of outbound links at user_a's reducer as well.
-  #
   module Iterating
-    class Mapper < Wukong::Streamer::Base
+    class PagerankMapper < Wukong::Streamer::Base
       #
       # Send pagerank to each page, and send the dests list back to self
       #
@@ -34,9 +26,7 @@ module PageRank
         yield_own_dest_list   src, dests_str,       &block
       end
-      #
       # Take the source node's pagerank and distribute it among all the out-nodes
-      #
       def yield_pagerank_shares src, pagerank, dests
         pagerank_share = pagerank.to_f / dests.length
         dests.each do |dest|
@@ -44,15 +34,13 @@ module PageRank
         end
       end
-      #
       # Dispatch this user's out-node list to rendezvous with itself.
-      #
       def yield_own_dest_list src, dests_str
         yield [src, 'd', dests_str]
       end
     end
-    class Reducer < Wukong::Streamer::AccumulatingReducer
+    class PagerankReducer < Wukong::Streamer::AccumulatingReducer
       attr_accessor :node_id, :pagerank, :dests_str
       # Begin reduction with 0 accumulated pagerank and no dests as yet
       def start! node_id, *args
@@ -78,11 +66,7 @@ module PageRank
       end
     end
-    class Script < Wukong::Script
-      def default_options
-        super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
-      end
-    end
-    Script.new(Mapper, Reducer).run
+    Wukong.run(PagerankMapper, PagerankReducer,
+      :extra_args => ' -jobconf io.sort.record.percent=0.25 ')
   end
 end

data/examples/pagerank/pagerank_initialize.rb CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../../lib'
+require 'rubygems'
 require 'wukong/script'
 require 'wukong/streamer/list_reducer'

data/examples/server_logs/apache_log_parser.rb CHANGED

@@ -1,58 +1,18 @@
-#!/usr/bin/env ruby
+#!/usr/bin/env ruby -E ASCII-8BIT
 require 'rubygems'
 require 'wukong/script'
+$: << File.dirname(__FILE__)
+require 'logline'
-module ApacheLogParser
-  class Mapper < Wukong::Streamer::LineStreamer
-    #
-    # Regular expression to parse an apache log line.
-    #
-    # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
-    #
-    LOG_RE = Regexp.compile(%r{\A
-           (\S+)                        # ip                  83.240.154.3
-         \s(\S+)                        # j1                  -
-         \s(\S+)                        # j2                  -
-       \s\[(\d+)/(\w+)/(\d+)            # date part           [07/Jun/2008
-          :(\d+):(\d+):(\d+)            # time part           :20:37:11
-         \s(\+.*)\]                     # timezone            +0000]
-    \s\"(?:(\S+)                        # http_method         "GET
-         \s(\S+)                        # path                /faq
-         \s(\S+)|-)"                    # protocol            HTTP/1.1"
-         \s(\d+)                        # response_code       200
-         \s(\d+)                        # duration            569
-       \s\"([^\"]*)\"                   # referer             "http://infochimps.org/search?query=CAC"
-       \s\"([^\"]*)\"                   # ua                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
-      \z}x)
-    MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
-    # Use the regex to break line into fields
-    # Emit each record as flat line
-    def process line
-      line.chomp
-      m = LOG_RE.match(line)
-      if m
-        (ip, j1, j2,
-          ts_day, ts_mo, ts_year,
-          ts_hour, ts_min, ts_sec, tz,
-          http_method, path, protocol,
-          response_code, duration,
-          referer, ua, *cruft) = m.captures
-        date = [ts_year, MONTHS[ts_mo], ts_day].join("")
-        time = [ts_hour, ts_min, ts_sec].join("")
-        yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
-      else
-        yield [:unparseable, line]
-      end
-    end
+class ApacheLogParser < Wukong::Streamer::LineStreamer
+  # create a Logline object from each record and serialize it flat to disk
+  def process line
+    yield Logline.parse(line)
   end
 end
-Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
-# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
+Wukong.run( ApacheLogParser, nil, :sort_fields => 7 ) if $0 == __FILE__

data/examples/server_logs/logline.rb CHANGED

@@ -1,9 +1,6 @@
-#!/usr/bin/env ruby
-require 'rubygems'
-require 'wukong/script'
 class Logline < Struct.new(
-  :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
+    :ip, :dt, :tm, :http_method, :protocol, :path, :response_code, :size, :referer, :ua, :tz, :j1, :j2)
+  # 1    2    3    4              5          6      7               8         9         10    11
   def page_type
     case
@@ -14,14 +11,41 @@ class Logline < Struct.new(
     end
   end
-  def is_page?
-    page_type == :page
-  end
-end
+  #
+  # Regular expression to parse an apache log line.
+  #
+  # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
+  #
+  LOG_RE = Regexp.compile(%r{\A
+           (\S+)                        # ip                  83.240.154.3
+         \s(\S+)                        # j1                  -
+         \s(\S+)                        # j2                  -
+       \s\[(\d+)/(\w+)/(\d+)            # date part           [07/Jun/2008
+          :(\d+):(\d+):(\d+)            # time part           :20:37:11
+         \s(\+.*)\]                     # timezone            +0000]
+    \s\"(?:(\S+)                        # http_method         "GET
+         \s(\S+)                        # path                /faq
+         \s(\S+)|-)"                    # protocol            HTTP/1.1"
+         \s(\d+)                        # response_code       200
+         \s(\d+)                        # size                569
+       \s\"([^\"]*)\"                   # referer             "http://infochimps.org/search?query=CAC"
+       \s\"([^\"]*)\"                   # ua                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
+      \z}x)
+  MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
-class PageFilter < Wukong::Streamer::StructStreamer
-  def process visit, *args
-    yield visit.ua if visit.
+  # Use the regex to break line into fields.
+  # Returns a Logline built from those fields, or a BadRecord wrapping the
+  # raw line when it does not match LOG_RE.
+  def self.parse line
+    m = LOG_RE.match(line.chomp) or return BadRecord.new(line)
+    (ip, j1, j2,
+      ts_day, ts_mo, ts_year,
+      ts_hour, ts_min, ts_sec, tz,
+      http_method, path, protocol,
+      response_code, size,
+      referer, ua, *cruft) = m.captures
+    dt = [ts_year, MONTHS[ts_mo], ts_day].join("")
+    tm = [ts_hour, ts_min, ts_sec].join("")
+    self.new( ip,  dt,  tm,  http_method,  protocol,  path,  response_code,  size,  referer,  ua,  tz, j1, j2 )
   end
 end
-Wukong.run(PageFilter)

data/examples/server_logs/nook.rb ADDED

@@ -0,0 +1,48 @@
+#!/usr/bin/env ruby -E BINARY
+require 'rubygems'
+require 'faraday'
+require 'wukong/script'
+require 'json'
+$: << File.dirname(__FILE__)
+require 'apache_log_parser'
+require 'nook/faraday_dummy_adapter'
+Settings.define :target_host,   :default => 'localhost', :description => "The host name or IP address to target"
+Settings.define :target_scheme, :default => 'http',      :description => "Request scheme (http, https)"
+#
+# A Nook consumes its input stream and, for each input, generates an HTTP
+# request against a remote host. Please use it for good and never for evil.
+#
+# You can use it from your command line:
+#   zcat /var/www/app/current/log/*access*.log.gz | ./nook.rb --map --target_host=my_own_host.com
+#
+#
+class NookMapper < ApacheLogParser
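+  # parse each line into a Logline, replay its path (with the original user
+  # agent and referer) as a GET through the fetcher, and emit the finish time,
+  # elapsed seconds, status, body size, path and body of the response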
+  def process line
+    super(line) do |logline|
+      start = Time.now
+      resp = fetcher.get(logline.path, :user_agent => logline.ua, :referer => logline.referer)
+      yield [Time.now.to_flat, (Time.now - start).to_f, resp.status, resp.body.size, logline.path, resp.body]
+    end
+  end
+  def track record
+    monitor.periodically do |m|
+      m.progress
+    end
+  end
+  # a mock fetcher with a fixed 0.05s delay; use 0.2 * rand() instead for a
+  # uniformly distributed variable delay
+  def fetcher
+    @fetcher ||= Faraday::Connection.new(:url => 'http://localhost:80/') do |f|
+      f.use Faraday::Adapter::Dummy do |dummy|
+        dummy.delay = Proc.new{|env| 0.05  } # 0.2 * rand()
+        # dummy.body = Proc.new{|env| env[:url] }
+      end
+    end
+  end
+end
+Wukong.run( NookMapper, nil, :sort_fields => 7 )

data/examples/server_logs/nook/faraday_dummy_adapter.rb ADDED

@@ -0,0 +1,94 @@
+module Faraday
+  class Adapter
+    # test = Faraday::Connection.new do |f|
+    #   f.use Faraday::Adapter::Dummy do |dummy|
+    #     dummy.status 404
+    #     dummy.delay  1
+    #   end
+    # end
+    #
+    # # this will delay 0.2s, returning 404 with the request echoed as a JSON body
+    # resp = test.get("/your/mom", :dummy_delay => 0.2)
+    # resp.body # => {"method":"get","url":"/your/mom","request_headers":{"Dummy-Delay":"0.2","dummy_delay":0.2},"request":{"proxy":null},"ssl":{}}
+    #
+    # More example:
+    #
+    # test = Faraday::Connection.new do |f|
+    #   f.use Faraday::Adapter::Dummy, :status => 503
+    # end
+    #
+    # test = Faraday::Connection.new do |f|
+    #   f.use Faraday::Adapter::Dummy do |dummy|
+    #     dummy.delay = Proc.new{|env| 0.1 + 0.8 * rand() }
+    #   end
+    # end
+    #
+    class Dummy < Middleware
+      include Addressable
+      attr_reader :config
+      def self.loaded?() false end
+      # gets value from environment if set, configured instance variable otherwise
+      def value_for env, key
+        val = env[:request_headers]["Dummy-#{header_hash_key(key)}"] || config[key]
+        if val.respond_to?(:call)
+          val = val.call(env)
+        end
+        val
+      end
+      # With an optional delay, constructs a [status, headers, response] based on the first of:
+      # * request header field (Dummy-Status, Dummy-Headers, Dummy-Body, Dummy-Delay)
+      # * adapter's configuration
+      # * Unless one of the above is set, body will return a json string taken from the request hash
+      #
+      def call(env)
+        status  = value_for(env, :status)
+        headers = value_for(env, :headers)
+        headers = JSON.load(headers) if headers.is_a? String
+        body    = value_for(env, :body) ||
+          env.dup.tap{|hsh| [:response, :parallel_manager, :body].each{|k| hsh.delete k} }.to_json
+        delay   = value_for(env, :delay).to_f
+        sleep delay if delay > 0
+        headers[:dummy_delay] = delay
+        env.update(
+          :status           => status,
+          :response_headers => headers,
+          :body             => body)
+        @app.call(env)
+      end
+      class Configurator < Struct.new(:status, :headers, :delay, :body)
+        def status(val=nil)  self.status  = val if val ; super() end
+        def headers(val=nil) self.headers = val if val ; super() end
+        def body(val=nil)    self.body    = val if val ; super() end
+        def delay(val=nil)   self.delay   = val if val ; super() end
+        def self.from_hash hsh
+          new().tap{|config| hsh.each{|k,v| config.send("#{k}=", v) } }
+        end
+      end
+      def initialize(app, defaults={}, &block)
+        super(app)
+        @config = Configurator.from_hash(defaults.reverse_merge(:status => 200, :delay => 0, :headers => {}))
+        configure(&block) if block
+      end
+      def configure
+        yield config
+      end
+      # same as in Faraday::Utils -- turns :dummy_response_status into 'Dummy-Response-Status'
+      def header_hash_key(str)
+        str.to_s.split('_').each{|w| w.capitalize! }.join('-')
+      end
+      def create_multipart(env, params, boundary = nil)
+        stream = super
+        stream.read
+      end
+    end
+  end
+end

data/examples/simple_word_count.rb ADDED

@@ -0,0 +1,82 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'wukong/script'
+module WordCount
+  class Mapper < Wukong::Streamer::LineStreamer
+    #
+    # Emit each word in each line.
+    #
+    def process line
+      tokenize(line).each{|word| yield [word, 1] }
+    end
+    #
+    # Split a string into its constituent words.
+    #
+    # This is pretty simpleminded:
+    # * downcase the word
+    # * Split at any non-alphanumeric boundary, including '_'
+    # * However, preserve the special cases of 's, 'd or 't at the end of a
+    #   word.
+    #
+    #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
+    #   # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
+    #
+    def tokenize str
+      return [] if str.blank?
+      str = str.downcase;
+      # kill off all punctuation except [stuff]'s, [stuff]'t or [stuff]'d
+      # this includes hyphens (words are split)
+      str = str.
+        gsub(/[^a-zA-Z0-9\']+/, ' ').
+        gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
+      # Busticate at whitespace
+      words = str.split(/\s+/)
+      words.reject!{|w| w.blank? }
+      words
+    end
+  end
+  #
+  # A bit kinder to your memory manager: accumulate the sum record-by-record:
+  #
+  class Reducer2 < Wukong::Streamer::AccumulatingReducer
+    def start!(*args)
+      @key_count =  0
+    end
+    def accumulate(*args)
+      @key_count += 1
+    end
+    def finalize
+      yield [ key, @key_count ]
+    end
+  end
+  #
+  # You can stack up all the values in a list then sum them at once.
+  #
+  # This isn't good style, as it means the whole list is held in memory
+  #
+  class Reducer1 < Wukong::Streamer::ListReducer
+    def finalize
+      yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
+    end
+  end
+  #
+  # ... easiest of all, though: this is common enough that it's already included
+  #
+  require 'wukong/streamer/count_keys'
+  class Reducer3 < Wukong::Streamer::CountKeys
+  end
+end
+# Execute the script
+Wukong.run(
+  WordCount::Mapper,
+  WordCount::Reducer2
+  )

data/examples/size.rb CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../lib'
+require 'rubygems'
 require 'wukong/script'
 module Size

data/examples/stats/binning_percentile_estimator.rb CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../../lib'
+require 'rubygems'
 require 'wukong/script'
 require 'wukong/streamer/count_keys'

data/examples/stats/rank_and_bin.rb CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../../lib'
+require 'rubygems'
 require 'wukong/script'
 require 'wukong/streamer/rank_and_bin_reducer'

data/examples/stupidly_simple_filter.rb CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.dirname(__FILE__)+'/../../lib'
+require 'rubygems'
 require 'wukong/script'
 # Run as (local mode)

data/lib/wukong.rb CHANGED

@@ -1,5 +1,5 @@
-require 'configliere'; Settings.use :define
 require 'wukong/extensions'
+require 'configliere'; Settings.use :define
 require 'wukong/datatypes'
 require 'wukong/periodic_monitor'
 require 'wukong/logger'

data/lib/wukong/extensions.rb CHANGED

@@ -2,8 +2,8 @@
 # These pull in the minimal functionality of the extlib|activesupport family of
 # gems.
 #
-require 'wukong/extensions/blank'
-require 'wukong/extensions/class'
+require 'extlib/blank'
+require 'extlib/class'
 require 'wukong/extensions/enumerable'
 require 'wukong/extensions/symbol'
 require 'wukong/extensions/hash'

data/lib/wukong/extensions/blank.rb CHANGED

@@ -15,7 +15,7 @@ class Object
   # @api public
   def blank?
     nil? || (respond_to?(:empty?) && empty?)
-  end
+  end unless method_defined?(:blank?)
 end # class Object
 class Numeric
@@ -31,7 +31,7 @@ class Numeric
   # @api public
   def blank?
     false
-  end
+  end unless method_defined?(:blank?)
 end # class Numeric
 class NilClass
@@ -45,7 +45,7 @@ class NilClass
   # @api public
   def blank?
     true
-  end
+  end unless method_defined?(:blank?)
 end # class NilClass
 class TrueClass
@@ -59,7 +59,7 @@ class TrueClass
   # @api public
   def blank?
     false
-  end
+  end unless method_defined?(:blank?)
 end # class TrueClass
 class FalseClass
@@ -73,7 +73,7 @@ class FalseClass
   # @api public
   def blank?
     true
-  end
+  end unless method_defined?(:blank?)
 end # class FalseClass
 class String
@@ -89,5 +89,5 @@ class String
   # @api public
   def blank?
     strip.empty?
-  end
+  end unless method_defined?(:blank?)
 end # class String

data/lib/wukong/extensions/hash.rb CHANGED

@@ -66,11 +66,11 @@ class Hash
   #
   def deep_merge hsh2
     merge hsh2, &Hash::DEEP_MERGER
-  end
+  end unless method_defined?(:deep_merge)
   def deep_merge! hsh2
     merge! hsh2, &Hash::DEEP_MERGER
-  end
+  end unless method_defined?(:deep_merge!)
   #
   # Treat hash as tree of hashes:
@@ -86,10 +86,10 @@ class Hash
     val      = args.pop
     last_key = args.pop
     # dig down to last subtree (building out if necessary)
-    hsh = args.empty? ? self : args.inject(self){|hsh, key| hsh[key] ||= {} }
+    hsh = args.empty? ? self : args.inject(self){|h, k| h[k] ||= {} }
     # set leaf value
     hsh[last_key] = val
-  end
+  end unless method_defined?(:deep_set)
   #
   # Treat hash as tree of hashes:
@@ -107,10 +107,10 @@ class Hash
   def deep_get *args
     last_key = args.pop
     # dig down to last subtree (building out if necessary)
-    hsh = args.inject(self){|hsh, key| hsh[key] || {} }
+    hsh = args.inject(self){|h, k| h[k] || {} }
     # get leaf value
     hsh[last_key]
-  end
+  end unless method_defined?(:deep_get)
   #
@@ -126,20 +126,20 @@ class Hash
     last_key  = args.pop
     last_hsh  = args.empty? ? self : (deep_get(*args)||{})
     last_hsh.delete(last_key)
-  end
+  end unless method_defined?(:deep_delete)
   #
   # remove all key-value pairs where the value is nil
   #
   def compact
     reject{|key,val| val.nil? }
-  end
+  end unless method_defined?(:compact)
   #
   # Replace the hash with its compacted self
   #
   def compact!
     replace(compact)
-  end
+  end unless method_defined?(:compact!)
   #
   # remove all key-value pairs where the value is blank

data/lib/wukong/extensions/hash_like.rb CHANGED

@@ -64,7 +64,7 @@ module Wukong
     # Analogous to Hash#merge
     #
     def merge *args
-      self.dup.merge! *args
+      self.dup.merge!(*args)
     end
     def merge! hsh, &block
       raise "can't handle block arg yet" if block
@@ -104,7 +104,7 @@ module Wukong
       #
       def from_hash(hsh, has_symbol_keys=false)
         extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
-        self.new *hsh.values_of(*extract_keys)
+        self.new(*hsh.values_of(*extract_keys))
       end
       #
       # The last portion of the class in underscored form

data/lib/wukong/extensions/symbol.rb CHANGED

@@ -7,5 +7,5 @@ class Symbol
   # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
   def to_proc
     Proc.new { |*args| args.shift.__send__(self, *args) }
-  end
+  end unless method_defined?(:to_proc)
 end

data/lib/wukong/logger.rb CHANGED

@@ -13,7 +13,7 @@ module Wukong
   #     I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
   #
   def self.logger
-    return @logger if @logger
+    return @logger if defined?(@logger)
     require 'logger'
     @logger = Logger.new STDERR
     @logger.instance_eval do

data/lib/wukong/periodic_monitor.rb CHANGED

@@ -28,9 +28,9 @@ class PeriodicMonitor
     if ready?
       @last_report = Time.now
       if block
-        block.call(iter, *args)
+        emit block.call(self, *args)
       else
-        self.emit progress(*args)
+        emit progress(*args)
       end
     end
   end

data/lib/wukong/script.rb CHANGED

@@ -1,4 +1,5 @@
 require 'pathname'
+require 'wukong/extensions'
 require 'configliere' ; Settings.use(:commandline, :env_var, :define)
 require 'wukong'
 require 'wukong/script/hadoop_command'
@@ -127,7 +128,7 @@ module Wukong
     def initialize mapper, reducer=nil, extra_options={}
       Settings.resolve!
       @options = Settings
-      options.merge extra_options
+      options.merge! extra_options
       @mapper  = (case mapper  when Class then mapper.new  when nil then nil else mapper  ; end)
       @reducer = (case reducer when Class then reducer.new when nil then nil else reducer ; end)
       @output_path = options.rest.pop
@@ -173,10 +174,14 @@ module Wukong
     # In hadoop mode, this is given to the hadoop streaming command.
     # In local mode, it's given to the system() call
     #
-    def mapper_commandline
+    def mapper_commandline(run_option=:local)
       if mapper
-        "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
-        # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
+        case run_option
+        when :local then
+          "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+        when :hadoop then
+          "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --map " + non_wukong_params
+        end
       else
         options[:map_command]
       end
@@ -187,10 +192,14 @@ module Wukong
     # In hadoop mode, this is given to the hadoop streaming command.
     # In local mode, it's given to the system() call
     #
-    def reducer_commandline
+    def reducer_commandline(run_option=:local)
       if reducer
-        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
-        # "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
+        case run_option
+        when :local then
+          "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+        when :hadoop then
+          "#{ruby_interpreter_path} #{File.basename(this_script_filename)} --reduce " + non_wukong_params
+        end
       else
         options[:reduce_command]
       end
@@ -222,6 +231,7 @@ module Wukong
       else
         maybe_overwrite_output_paths! output_path
         $stdout.puts `#{command}`
+        raise "Streaming command failed!" unless $?.success?
       end
     end
@@ -242,7 +252,7 @@ module Wukong
     # the map/reducer phase scripts
     def non_wukong_params
       options.
-        reject{|param, val| options.param_definitions[param][:wukong] }.
+        reject{|param, val| options.definition_of(param, :wukong) }.
         map{|param,val| "--#{param}=#{val}" }.
         join(" ")
     end

data/lib/wukong/script/emr_command.rb CHANGED

@@ -13,8 +13,8 @@ Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for El
 Settings.define :emr_extra_args,       :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
 Settings.define :alive,                :description => 'Whether to keep machine running after job invocation', :type => :boolean
 #
-Settings.define :keypair_file,        :description => 'AWS Key pair file',                               :type => :filename
-Settings.define :keypair,             :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
+Settings.define :key_pair_file,        :description => 'AWS Key pair file',                               :type => :filename
+Settings.define :key_pair,             :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
 Settings.define :instance_type,        :description => 'AWS instance type to use',                        :default => 'm1.small'
 Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
 Settings.define :jobflow,              :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
@@ -45,7 +45,9 @@ module Wukong
     end
     def hadoop_options_for_emr_runner
-      [hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
+      [hadoop_jobconf_options, hadoop_other_args].flatten.compact.uniq.map do |hdp_opt|
+        hdp_opt.split(' ').map {|part| "--arg '#{part}'"}
+      end.flatten
     end
     def execute_emr_runner
@@ -57,7 +59,7 @@ module Wukong
         command_args << "--create --name=#{job_name}"
         command_args << Settings.dashed_flag_for(:alive)
         command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
-        command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
+        command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
         command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
       end
       command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')

data/lib/wukong/script/hadoop_command.rb CHANGED

@@ -25,6 +25,9 @@ module Wukong
     Settings.define :max_record_length,      :jobconf => true, :description => 'mapred.linerecordreader.maxlength',                      :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
     Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster',                         :wukong => true
     Settings.define :max_reduces_per_node,   :jobconf => true, :description => 'mapred.max.reduces.per.node',                            :wukong => true
+    Settings.define :max_tracker_failures,   :jobconf => true, :description => 'mapred.max.tracker.failures',                            :wukong => true
+    Settings.define :max_map_attempts,       :jobconf => true, :description => 'mapred.map.max.attempts',                                :wukong => true
+    Settings.define :max_reduce_attempts,    :jobconf => true, :description => 'mapred.reduce.max.attempts',                             :wukong => true
     Settings.define :min_split_size,         :jobconf => true, :description => 'mapred.min.split.size',                                  :wukong => true
     Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator',                      :wukong => true
     Settings.define :partition_fields,       :jobconf => true, :description => 'num.key.fields.for.partition',                           :wukong => true
@@ -40,8 +43,8 @@ module Wukong
     # if not, the resulting nil will be elided later
     def jobconf option
       if options[option]
-        # "-jobconf %s=%s" % [options.description_for(option), options[option]]
-        "-D %s=%s" % [options.description_for(option), options[option]]
+        # "-jobconf %s=%s" % [options.definition_of(option, :description), options[option]]
+        "-D %s=%s" % [options.definition_of(option, :description), options[option]]
       end
     end
@@ -64,8 +67,8 @@ module Wukong
         hadoop_jobconf_options,
         "-D mapred.job.name='#{job_name}'",
         hadoop_other_args,
-        "-mapper  '#{mapper_commandline}'",
-        "-reducer '#{reducer_commandline}'",
+        "-mapper  '#{mapper_commandline(:hadoop)}'",
+        "-reducer '#{reducer_commandline(:hadoop)}'",
         "-input   '#{input_paths}'",
         "-output  '#{output_path}'",
         "-file    '#{this_script_filename}'",
@@ -100,6 +103,8 @@ module Wukong
         :partition_fields,         :sort_fields,
         :reduce_tasks,             :respect_exit_status,
         :reuse_jvms,               :timeout,
+        :max_tracker_failures,     :max_map_attempts,
+        :max_reduce_attempts
       ].map{|opt| jobconf(opt)}
       jobconf_options.flatten.compact
     end

data/lib/wukong/script/local_command.rb CHANGED

@@ -25,7 +25,13 @@ module Wukong
       @input_paths = input_paths.map(&:strip).join(' ')
       cmd_input_str  = (input_paths == '-') ? "" : "cat '#{input_paths}' | "
       cmd_output_str = (output_path == '-') ? "" : "> '#{output_path}'"
-      %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
+      if (reducer || options[:reduce_command])
+        %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} | #{reducer_commandline} #{cmd_output_str} }
+      else
+        %Q{ #{cmd_input_str} #{mapper_commandline} | #{local_mode_sort_commandline} #{cmd_output_str} }
+      end
     end
   end

data/lib/wukong/streamer/base.rb CHANGED

@@ -28,11 +28,15 @@ module Wukong
           process(*record) do |output_record|
             emit output_record
           end
-          monitor.periodically(record.to_s[0..1000])
+          track(record)
         end
         after_stream
       end
+      def track record
+        monitor.periodically(record.to_s[0..1000])
+      end
       def each_record &block
         $stdin.each(&block)
       end
@@ -103,7 +107,7 @@ module Wukong
       # Creates a new object of this class and injects the given block
       # as the process method
       def self.mapper *args, &block
-        self.new.mapper *args, &block
+        self.new.mapper(*args, &block)
       end
       # Delegates back to Wukong to run this instance as a mapper

data/wukong.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{wukong}
-  s.version = "2.0.0"
+  s.version = "2.0.1"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Philip (flip) Kromer"]
-  s.date = %q{2011-01-29}
+  s.date = %q{2011-07-01}
   s.description = %q{  Treat your dataset like a:
       * stream of lines when it's efficient to process by lines
@@ -139,6 +139,7 @@ Gem::Specification.new do |s|
     "examples/contrib/jeans/sizes.rb",
     "examples/corpus/bucket_counter.rb",
     "examples/corpus/dbpedia_abstract_to_sentences.rb",
+    "examples/corpus/sentence_bigrams.rb",
     "examples/corpus/sentence_coocurrence.rb",
     "examples/corpus/words_to_bigrams.rb",
     "examples/emr/README.textile",
@@ -162,7 +163,10 @@ Gem::Specification.new do |s|
     "examples/server_logs/apache_log_parser.rb",
     "examples/server_logs/breadcrumbs.rb",
     "examples/server_logs/logline.rb",
+    "examples/server_logs/nook.rb",
+    "examples/server_logs/nook/faraday_dummy_adapter.rb",
     "examples/server_logs/user_agent.rb",
+    "examples/simple_word_count.rb",
     "examples/size.rb",
     "examples/stats/avg_value_frequency.rb",
     "examples/stats/binning_percentile_estimator.rb",
@@ -252,13 +256,14 @@ Gem::Specification.new do |s|
   ]
   s.homepage = %q{http://mrflip.github.com/wukong}
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.4.2}
+  s.rubygems_version = %q{1.5.0}
   s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
   s.test_files = [
     "examples/contrib/jeans/normalize.rb",
     "examples/contrib/jeans/sizes.rb",
     "examples/corpus/bucket_counter.rb",
     "examples/corpus/dbpedia_abstract_to_sentences.rb",
+    "examples/corpus/sentence_bigrams.rb",
     "examples/corpus/sentence_coocurrence.rb",
     "examples/corpus/words_to_bigrams.rb",
     "examples/emr/elastic_mapreduce_example.rb",
@@ -275,7 +280,10 @@ Gem::Specification.new do |s|
     "examples/server_logs/apache_log_parser.rb",
     "examples/server_logs/breadcrumbs.rb",
     "examples/server_logs/logline.rb",
+    "examples/server_logs/nook.rb",
+    "examples/server_logs/nook/faraday_dummy_adapter.rb",
     "examples/server_logs/user_agent.rb",
+    "examples/simple_word_count.rb",
     "examples/size.rb",
     "examples/stats/avg_value_frequency.rb",
     "examples/stats/binning_percentile_estimator.rb",

metadata CHANGED

@@ -1,13 +1,8 @@
 --- !ruby/object:Gem::Specification
 name: wukong
 version: !ruby/object:Gem::Version
-  hash: 15
   prerelease:
-  segments:
-  - 2
-  - 0
-  - 0
-  version: 2.0.0
+  version: 2.0.1
 platform: ruby
 authors:
 - Philip (flip) Kromer
@@ -15,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-01-29 00:00:00 -06:00
+date: 2011-07-01 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -26,11 +21,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 13
-        segments:
-        - 1
-        - 2
-        - 9
         version: 1.2.9
   type: :development
   version_requirements: *id001
@@ -42,9 +32,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
         version: "0"
   type: :development
   version_requirements: *id002
@@ -56,9 +43,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
         version: "0"
   type: :runtime
   version_requirements: *id003
@@ -70,9 +54,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
         version: "0"
   type: :runtime
   version_requirements: *id004
@@ -84,9 +65,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
         version: "0"
   type: :runtime
   version_requirements: *id005
@@ -98,9 +76,6 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
         version: "0"
   type: :runtime
   version_requirements: *id006
@@ -233,6 +208,7 @@ files:
 - examples/contrib/jeans/sizes.rb
 - examples/corpus/bucket_counter.rb
 - examples/corpus/dbpedia_abstract_to_sentences.rb
+- examples/corpus/sentence_bigrams.rb
 - examples/corpus/sentence_coocurrence.rb
 - examples/corpus/words_to_bigrams.rb
 - examples/emr/README.textile
@@ -256,7 +232,10 @@ files:
 - examples/server_logs/apache_log_parser.rb
 - examples/server_logs/breadcrumbs.rb
 - examples/server_logs/logline.rb
+- examples/server_logs/nook.rb
+- examples/server_logs/nook/faraday_dummy_adapter.rb
 - examples/server_logs/user_agent.rb
+- examples/simple_word_count.rb
 - examples/size.rb
 - examples/stats/avg_value_frequency.rb
 - examples/stats/binning_percentile_estimator.rb
@@ -357,23 +336,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
       version: "0"
 requirements: []
 rubyforge_project:
-rubygems_version: 1.4.2
+rubygems_version: 1.5.0
 signing_key:
 specification_version: 3
 summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
@@ -382,6 +355,7 @@ test_files:
 - examples/contrib/jeans/sizes.rb
 - examples/corpus/bucket_counter.rb
 - examples/corpus/dbpedia_abstract_to_sentences.rb
+- examples/corpus/sentence_bigrams.rb
 - examples/corpus/sentence_coocurrence.rb
 - examples/corpus/words_to_bigrams.rb
 - examples/emr/elastic_mapreduce_example.rb
@@ -398,7 +372,10 @@ test_files:
 - examples/server_logs/apache_log_parser.rb
 - examples/server_logs/breadcrumbs.rb
 - examples/server_logs/logline.rb
+- examples/server_logs/nook.rb
+- examples/server_logs/nook/faraday_dummy_adapter.rb
 - examples/server_logs/user_agent.rb
+- examples/simple_word_count.rb
 - examples/size.rb
 - examples/stats/avg_value_frequency.rb
 - examples/stats/binning_percentile_estimator.rb