RubyGems - wukong - Versions diffs - 1.5.2 → 1.5.3 - Mend

wukong 1.5.2 → 1.5.3

Files changed (20) hide show

data/CHANGELOG.textile +10 -0
data/README.textile +1 -3
data/bin/hdp-cp +3 -0
data/bin/wu-lign +31 -36
data/docpages/README-wulign.textile +6 -6
data/examples/emr/elastic_mapreduce_example.rb +9 -0
data/examples/emr/emr.yaml +52 -0
data/lib/wukong.rb +5 -3
data/lib/wukong/filename_pattern.rb +73 -0
data/lib/wukong/keystore/tyrant_db.rb +11 -11
data/lib/wukong/monitor/periodic_monitor.rb +62 -64
data/lib/wukong/script/hadoop_command.rb +7 -6
data/lib/wukong/store.rb +12 -11
data/lib/wukong/store/base.rb +5 -7
data/lib/wukong/store/chh_chunked_flat_file_store.rb +37 -0
data/lib/wukong/store/chunked_flat_file_store.rb +23 -16
data/lib/wukong/store/flat_file_store.rb +9 -10
data/lib/wukong/streamer/em_streamer.rb +7 -0
data/wukong.gemspec +7 -2
metadata +9 -4

@@ -1,3 +1,8 @@
+h2. Wukong v1.5.3
+* A couple of bugfixes. Sorry about that.
+* Documentation fixes
 h2. Wukong v1.5.0
 h4. Elastic Map-Reduce
@@ -16,6 +21,11 @@ Incompatible changes to option handling and script launching:
 * Script doesn't use extra_options any more. You should relocate them to the initializer or to configliere.
 * there is no more default_mapper or default_reducer
+h2. Wukong v1.4.12 2010-08-31
+* Improvements to the pig conversion methods
+* @hdp-rm@ respects the -skipTrash option
 h2. Wukong v1.4.11 2010-07-30

data/README.textile CHANGED

@@ -30,9 +30,7 @@ I'm pushing to release "Wukong 3.0 the actual 1.0 release".
     * Standardize the notion that wukong classes have a "key"; by default, it will be to_a.first for Structs/TypedStructs. This shouldn't break anything.
     * May make some things that are derived classes into mixin'ed modules
     * Will probably change the name of AccumulatingReducer into just Accumulator, and have all Accumulator-derived classes include Accumulator; I'll make sure the old names continue to work though.
-*
 h2. Help!

data/bin/hdp-cp ADDED

@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+exec hadoop dfs -cp "$@"

data/bin/wu-lign CHANGED

@@ -7,7 +7,7 @@ USAGE= %Q{
 # space aligned file that is still suitable for further processing. For example,
 # given the log-file input
 #
-#     <pre><code>
+#     # cat tag_usage.tsv
 #     2009-07-21T21:39:40 day     65536   3.15479 68750   1171316
 #     2009-07-21T21:39:45 doing   65536   1.04533 26230   1053956
 #     2009-07-21T21:41:53 hapaxlegomenon  65536   0.87574e-05     23707   10051141
@@ -15,11 +15,10 @@ USAGE= %Q{
 #     2009-07-21T21:44:29 world   65536   1.09110 32850   200916
 #     2009-07-21T21:44:39 world+series    65536   0.49380 9929    7972025
 #     2009-07-21T21:44:54 iranelection    65536   2.91775 14592   136342
-#     </code></pre>
 #
 # wulign will reformat it to read
 #
-#     <pre><code>
+#     # cat tag_usage.tsv | wu-lign
 #     2009-07-21T21:39:40 day                   65536   3.154791234 68750    1171316
 #     2009-07-21T21:39:45 doing                 65536   1.045330000 26230    1053956
 #     2009-07-21T21:41:53 hapaxlegomenon        65536   0.000008757 23707   10051141
@@ -27,65 +26,61 @@ USAGE= %Q{
 #     2009-07-21T21:44:29 world                 65536   1.091100000 32850     200916
 #     2009-07-21T21:44:39 world+series          65536   0.493800000  9929    7972025
 #     2009-07-21T21:44:54 iranelection          65536   2.917750000 14592     136342
-#     </code></pre>
 #
 # The fields are still tab-delimited by exactly one tab -- only spaces are used to
 # pad out fields. You can still use cuttab and friends to manipulate columns.
 #
-# wulign isn't intended to be smart, or correct, or reliable -- only to be
-# useful for previewing and organizing tab-formatted files. In general
-# @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
-# equivalent to its input. (That is, the only changes should be insertion of
-# spaces and re-formatting of numerics.) But still -- reserve its use for human
-# inspection only.
-#
-# (Note: tab characters in this source code file have been converted to spaces;
-# replace whitespace with tab in the first example if you'd like to play along at
-# home.)
-#
-# h2. How it works
-#
-# Wulign takes the first 1000 lines, splits by TAB characters into fields, and
-# tries to guess the format -- int, float, or string -- for each. It builds a
-# consensus of the width and type for corresponding columns in the chunk.  If a
-# column has mixed numeric and string formats it degrades to :mixed, which is
-# basically treated as :string. If a column has mixed :float and :int elements all
-# of them are formatted as float.
-#
 # h2. Command-line arguments
 #
 # You can give sprintf-style positional arguments on the command line that will be
 # applied to the corresponding columns. (Blank args are used for placeholding and
 # auto-formatting is still applied).  So with the example above,
 #
-#     @cat foo | wulign  '' '' '' '%8.4e'@
+#     cat foo | wulign  '' '' '' '%8.4e'
 #
 # will format the fourth column with "%8.4e", while the first three columns and
 # fifth-and-higher columns are formatted as usual.
 #
-#     <pre><code>
 #     ...
 #     2009-07-21T21:39:45 doing           65536   1.0453e+00      26230    1053956
 #     2009-07-21T21:41:53 hapaxlegomenon  65536   8.7574e-06      23707   10051141
 #     2009-07-21T21:44:00 concert           500   2.9290e-01      13367    9733414
 #     ....
-#     </code></pre>
+#
+# h2. How it works
+#
+# Wu-lign takes the first 500ish lines, splits into fields on TAB characters,
+# and tries to guess the format (int, float, or string) for each. It builds a
+# consensus of the width and type for corresponding columns in the chunk.  If a
+# column has mixed numeric and string formats it degrades to :mixed, which is
+# basically treated as :string. If a column has mixed :float and :int elements all
+# of them are formatted as float.
 #
 # h2. Notes
 #
-# * It has no knowledge of header rows. An all-text first line will screw everything up.
+# * Header rows: the first line is used for width alignment but not for type detection.
+#   This means that an initial row of text headers will inform column spacing
+#   but still allow a column of floats (say) to be properly aligned as floats.
 #
-# * It also requires a unanimous vote. One screwy line can coerce the whole mess
-#   to :mixed; width formatting will still be applied, though.
+# * It requires a unanimous vote. One screwy line can coerce the whole mess to
+#   :mixed; width formatting will still be applied, though.
 #
-# * It won't set columns wider than 70 chars -- this allows for the occasional
+# * It won't set columns wider than 100 chars -- this allows for the occasional
 #   super-wide column without completely breaking your screen.
 #
 # * For :float values, wulign tries to guess at the right number of significant
 #   digits to the left and right of the decimal point.
 #
-# * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
-#   or escaping; every tab delimits a field, every newline a record.
+# * wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab
+#   delimits a field, every newline a record.
+#
+# wulign isn't intended to be smart, or correct, or reliable -- only to be
+# useful for previewing and organizing tab-formatted files. In general
+# wulign(foo).split("\t").map(&:strip) *should* give output semantically
+# equivalent to its input. (That is, the only changes should be insertion of
+# spaces and re-formatting of numerics.) But still -- reserve its use for human
+# inspection only.
+#
 }
 if ARGV[0] == '--help'
@@ -111,7 +106,7 @@ def get_type val
   when val == ''       then type = nil
   when val =~ INT_RE   then type = :int
   when val =~ FLOAT_RE then type = :float
-  else                      type = :str end
+  else                      type = :str end
 end
 def consensus_type val, alltype, is_first
@@ -149,7 +144,7 @@ FORMAT_GUESSING_LINES.times do
   row.each_with_index{|col,i|
     next if skip_col[i]
     # Let the first row be text (headers)
-    col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
+    col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
     if col_types[i] == :float
       mantissa, radix = f_width(col)
       col_minmag[i] = [radix,    col_minmag[i], 1].compact.max
@@ -175,7 +170,7 @@ def dump_row row, format
   puts row.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
 end
 def dump_header row, maxw
-  puts row.zip(maxw).map{|col, width| "%-#{width}s" % col.to_s }.join("\t")
+  puts row.zip(maxw).map{|col, width| "%-#{width}s" % col.to_s }.join("\t")
 end
 pad = [''] * maxw.length

data/docpages/README-wulign.textile CHANGED

@@ -38,7 +38,7 @@ wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful
 h2. How it works
-Wu-Lign takes the first 1000 lines, splits by TAB characters into fields, and tries to guess the format -- int, float, or string -- for each. It builds a consensus of the width and type for corresponding columns in the chunk.  If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
+Wu-Lign takes the first 500ish lines, splits into fields on TAB characters, and tries to guess the format (int, float, or string) for each. It builds a consensus of the width and type for corresponding columns in the chunk.  If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
 h2. Command-line arguments
@@ -58,8 +58,8 @@ will format the fourth column with "%8.4e", while the first three columns and fi
 h2. Notes
-* It has no knowledge of header rows. An all-text first line will screw everything up.
-* It also requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
-* It won't set columns wider than 70 chars -- this allows for the occasional super-wide column without completely breaking your screen.
-* For :float values, wu-lign tries to guess at the right number of significant digits to the left and right of the decimal point.
-* wu-lign does not parse 'TSV files' in their strict sense -- there is no quoting or escaping; every tab delimits a field, every newline a record.
+* Header rows: the first line is used for width alignment but not for type detection.  This means that an initial row of text headers will inform column spacing but still allow a column of floats (say) to be properly aligned as floats.
+* It requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
+* It won't set columns wider than 100 chars -- this allows for the occasional super-wide column without completely breaking your screen.
+* For :float values, wulign tries to guess at the right number of significant digits to the left and right of the decimal point.
+* wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab delimits a field, every newline a record.

data/examples/emr/elastic_mapreduce_example.rb CHANGED

@@ -3,6 +3,15 @@ Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
 require 'rubygems'
 require 'wukong'
+#
+# * Copy the emr.yaml from here into ~/.wukong/emr.yaml
+#   and edit it to suit.
+# * Download the Amazon elastic-mapreduce runner. Get a copy from
+#   http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
+# * Find out what breaks, fix it or ask us for help (coders@infochimps.org) and
+#   submit a patch
+#
 class FooStreamer < Wukong::Streamer::LineStreamer
   def initialize *args
     super *args

data/examples/emr/emr.yaml ADDED

@@ -0,0 +1,52 @@
+#
+# Elastic MapReduce config in wukong
+#
+#
+# Infrastructure options
+#
+# == Fill all your information into yet another file with your amazon key. Sorry
+#    that it needs to be in so many stupid places, nobody can agree on a
+#    filename or format.
+:emr_credentials_file:          ~/.wukong/credentials.json
+#
+# == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here:
+# :access_key:                  ASDFAHKHASDF
+# :secret_access_key:           ADSGHASDFJASDFASDF
+#
+# == Path to your keypair file.
+:key_pair_file:                 ~/.wukong/keypairs/gibbon.pem
+# == Keypair will be named after your file, or force the name:
+# :key_pair:                    ~
+# == Path to the Amazon elastic-mapreduce runner. Get a copy from
+#    http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
+:emr_runner:                    ~/ics/hadoop/elastic-mapreduce/elastic-mapreduce
+#
+# Cluster Config
+#
+:num_instances:                 1
+:instance_type:                 m2.xlarge
+:master_instance_type:          ~
+:hadoop_version:                '0.20'
+:availability_zone:             us-east-1b
+#
+# Running and reporting options
+#
+:alive:                         false
+:enable_debugging:              true
+:emr_runner_verbose:            true
+:emr_runner_debug:              ~
+:step_action:                   CANCEL_AND_WAIT         # CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
+#
+# Remote Paths
+#
+# Wukong is opinionated about the paths and locations of scripts and
+# everything. Make an S3 bucket and let the wookiee win -- or hack
+# lib/wukong/script/emr_command.rb to be more flexible and send us back a patch.
+#
+:emr_root:                      s3n://emr.infinitemonkeys.info

data/lib/wukong.rb CHANGED

@@ -5,7 +5,9 @@ require 'wukong/bad_record'
 autoload :TypedStruct, 'wukong/typed_struct'
 require 'configliere'; Configliere.use :define
 module Wukong
-  autoload :Dfs,         'wukong/dfs'
-  autoload :Script,      'wukong/script'
-  autoload :Streamer,    'wukong/streamer'
+  autoload :Dfs,             'wukong/dfs'
+  autoload :Script,          'wukong/script'
+  autoload :Streamer,        'wukong/streamer'
+  autoload :Store,           'wukong/store'
+  autoload :FilenamePattern, 'wukong/filename_pattern'
 end

data/lib/wukong/filename_pattern.rb ADDED

@@ -0,0 +1,73 @@
+module Wukong
+    class FilenamePattern
+      # the filename pattern, e.g. 'ripd/:handle/:date/:handle+:timestamp-:pid-:hostname.tsv'
+      attr_accessor :pattern
+      # custom token replacements
+      attr_accessor :token_val_defaults
+      DEFAULT_PATTERN_STR = ":dest_dir/:handle_prefix/:handle/:date/:handle:timestamp-:pid-:hostname.tsv"
+      def initialize pattern, token_val_defaults={}
+        self.pattern = pattern
+        self.token_val_defaults    = token_val_defaults
+      end
+      #
+      # walk through pattern, replacing tokens (eg :time or :pid) with the
+      # corresponding value.
+      #
+      def make token_vals={}
+        token_vals = token_val_defaults.merge token_vals
+        token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
+        # CHH_NOTE: The following is broken for patterns that need a ":" or
+        # patterns that need text following a token with no special chars in
+        # between.
+        val = pattern.gsub(/:(\w+)/){ replace($1, token_vals)  }
+        val
+      end
+      def to_s token_vals={}
+        make token_vals
+      end
+      #
+      # substitute for token
+      #
+      def replace token, token_vals
+        token = token.to_sym
+        return token_vals[token] if token_vals.include? token
+        case token
+        when :pid           then pid
+        when :hostname      then hostname
+        when :handle        then token_vals[:handle]
+        when :handle_prefix then token_vals[:handle].to_s[0..5]
+        when :timestamp     then token_vals[:timestamp]
+        when :date          then token_vals[:timestamp][ 0..7]
+        when :time          then token_vals[:timestamp][ 8..13]
+        when :hour          then token_vals[:timestamp][ 8..9]
+        when :h4            then "%0.2d" % (( token_vals[:timestamp][8..9].to_i / 4 ) * 4)
+        when :min           then token_vals[:timestamp][10..11]
+        when :sec           then token_vals[:timestamp][12..13]
+        when :s10           then "%0.2d" % (( token_vals[:timestamp][12..13].to_i / 10 ) * 10)
+        else
+          raise "Don't know how to encode token #{token} #{token_vals[token]}"
+        end
+      end
+      # Memoized: the hostname for the machine running this script.
+      def hostname
+        @hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
+      end
+      # Memoized: the Process ID for this invocation.
+      def pid
+        @pid      ||= Process.pid
+      end
+      # Characters deemed safe in a filename;
+      SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/\;'
+      def self.sanitize str
+        str.gsub(%r{[^#{SAFE_CHARS}]+}, '-')
+      end
+    end
+end

data/lib/wukong/keystore/tyrant_db.rb CHANGED

@@ -66,17 +66,17 @@ module TokyoDbConnection
     ].freeze unless defined?(TokyoDbConnection::TyrantDb::DB_SERVERS)
     DB_PORTS = {
-      :screen_names    => 12002,
-      :search_ids      => 12003,
+      :tw_screen_names  => 12002,
+      :tw_search_ids    => 12003,
       #
-      :tw_user_info    => 14000,
-      :tw_wordbag      => 14101,
-      :tw_influence    => 14102,
-      :tw_trstrank     => 14103,
-      :tw_conversation => 14104,
+      :tw_user_info     => 14000,
+      :tw_wordbag       => 14101,
+      :tw_influence     => 14102,
+      :tw_trstrank      => 14103,
+      :tw_conversation  => 14104,
       #
-      :screen_names2   => 12004,
-      :search_ids2     => 12005,
+      :tw_screen_names2 => 12004,
+      :tw_search_ids2   => 12005,
       #
       :tw_user_info2    => 14200,
       :tw_wordbag2      => 14201,
@@ -84,7 +84,7 @@ module TokyoDbConnection
       :tw_trstrank2     => 14203,
       :tw_conversation2 => 14204,
       :tw_strong_links2 => 14205,
-      :tw_word_stats2   => 14206,
+      :tw_word_stats2   => 14210,
       #
       :ip_geo_census    => 14400,
     } unless defined?(TokyoDbConnection::TyrantDb::DB_PORTS)
@@ -123,7 +123,7 @@ module TokyoDbConnection
     end
     def handle_error action, e
-      warn "#{action} failed: #{e} #{e.backtrace.join("\t")}" ;
+      Log.warn "#{action} failed: #{e} #{e.backtrace.join("\t")}" ;
       invalidate!
     end

data/lib/wukong/monitor/periodic_monitor.rb CHANGED

@@ -1,72 +1,70 @@
-module Monkeyshines
-  module Monitor
-    #
-    # Accepts a lightweight call every iteration.
-    #
-    # Once either a time or an iteration criterion is met, executes the block
-    # and resets the timer until next execution.
-    #
-    # Note that the +time_interval+ is measured *excution to execution* and not
-    # in multiples of iter_interval. Say I set a time_interval of 300s, and
-    # happen to iterate at 297s and 310s after start.  Then the monitor will
-    # execute at 310s, and the next execution will happen on or after 610s.
-    #
-    # Also note that when *either* criterion is met, *both* criteria are
-    # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
-    # and that at 250s I reach iteration 10_000.  Then the monitor will execute
-    # on or after 20_000 iteration or 550s, whichever happens first.
-    #
-    class PeriodicMonitor
-      attr_accessor :time_interval, :iter_interval
-      attr_accessor :last_time, :current_iter, :iter, :started_at
+module Wukong::Monitor
+  #
+  # Accepts a lightweight call every iteration.
+  #
+  # Once either a time or an iteration criterion is met, executes the block
+  # and resets the timer until next execution.
+  #
+  # Note that the +time_interval+ is measured *execution to execution* and not
+  # in multiples of iter_interval. Say I set a time_interval of 300s, and
+  # happen to iterate at 297s and 310s after start.  Then the monitor will
+  # execute at 310s, and the next execution will happen on or after 610s.
+  #
+  # Also note that when *either* criterion is met, *both* criteria are
+  # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
+  # and that at 250s I reach iteration 10_000.  Then the monitor will execute
+  # on or after 20_000 iteration or 550s, whichever happens first.
+  #
+class PeriodicMonitor
+  attr_accessor :time_interval, :iter_interval
+  attr_accessor :last_time, :current_iter, :iter, :started_at
-      def initialize options={}
-        self.started_at    = Time.now.utc.to_f
-        self.last_time     = started_at
-        self.iter          = 0
-        self.current_iter  = 0
-        self.time_interval = options[:time]
-        self.iter_interval = options[:iters]
-      end
+  def initialize options={}
+    self.started_at    = Time.now.utc.to_f
+    self.last_time     = started_at
+    self.iter          = 0
+    self.current_iter  = 0
+    self.time_interval = options[:time]
+    self.iter_interval = options[:iters]
+  end
-      # True if more than +iter_interval+ has elapsed since last execution.
-      def enough_iterations?
-        iter % iter_interval == 0 if iter_interval
-      end
+  # True if more than +iter_interval+ has elapsed since last execution.
+  def enough_iterations?
+    iter % iter_interval == 0 if iter_interval
+  end
-      # True if more than +time_interval+ has elapsed since last execution.
-      def enough_time? now
-        (now - last_time) > time_interval if time_interval
-      end
+  # True if more than +time_interval+ has elapsed since last execution.
+  def enough_time? now
+    (now - last_time) > time_interval if time_interval
+  end
-      # Time since monitor was created
-      def since
-        Time.now.utc.to_f - started_at
-      end
-      # Overall iterations per second
-      def rate
-        iter.to_f / since.to_f
-      end
-      # "Instantaneous" iterations per second
-      def inst_rate now
-        current_iter.to_f / (now-last_time).to_f
-      end
+  # Time since monitor was created
+  def since
+    Time.now.utc.to_f - started_at
+  end
+  # Overall iterations per second
+  def rate
+    iter.to_f / since.to_f
+  end
+  # "Instantaneous" iterations per second
+  def inst_rate now
+    current_iter.to_f / (now-last_time).to_f
+  end
-      #
-      # if the interval conditions are met, executes block; otherwise just does
-      # bookkeeping and returns.
-      #
-      def periodically &block
-        self.iter += 1
-        self.current_iter += 1
-        now       = Time.now.utc.to_f
-        if enough_iterations? || enough_time?(now)
-          block.call(iter, (now-last_time))
-          self.last_time = now
-          self.current_iter = 0
-        end
-      end
+  #
+  # if the interval conditions are met, executes block; otherwise just does
+  # bookkeeping and returns.
+  #
+  def periodically &block
+    self.iter += 1
+    self.current_iter += 1
+    now       = Time.now.utc.to_f
+    if enough_iterations? || enough_time?(now)
+      block.call(iter, (now-last_time))
+      self.last_time = now
+      self.current_iter = 0
     end
   end
 end
+end

data/lib/wukong/script/hadoop_command.rb CHANGED

@@ -32,6 +32,7 @@ module Wukong
     Settings.define :max_maps_per_node,      :jobconf => true, :description => 'mapred.max.maps.per.node',                               :wukong => true
     Settings.define :max_maps_per_cluster,   :jobconf => true, :description => 'mapred.max.maps.per.cluster',                            :wukong => true
     Settings.define :max_record_length,      :jobconf => true, :description => 'mapred.linerecordreader.maxlength',                      :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
+    Settings.define :min_input_split_size,   :jobconf => true, :description => 'mapred.min.split.size',                                  :wukong => true
     Settings.define :noempty,                                  :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
     #
@@ -48,14 +49,14 @@ module Wukong
       hadoop_commandline = [
         hadoop_runner,
         "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
-        hadoop_jobconf_options,
-        "-D mapred.job.name '#{job_name}",
-        "-mapper  '#{map_commandline}'",
-        "-reducer '#{reduce_commandline}'",
+        "-mapper  '#{mapper_commandline}'",
+        "-reducer '#{reducer_commandline}'",
         "-input   '#{input_paths}'",
         "-output  '#{output_path}'",
+        hadoop_jobconf_options,
+        "-jobconf mapred.job.name='#{job_name}'",
         hadoop_recycle_env,
-        hadoop_other_args(input_paths, output_path),
+        hadoop_other_args,
       ].flatten.compact.join(" \t\\\n  ")
       Log.info "  Launching hadoop!"
       execute_command!(hadoop_commandline)
@@ -94,7 +95,7 @@ module Wukong
     # if not, the resulting nil will be elided later
     def jobconf option
       if options[option]
-        "-D %s=%s" % [options.description_for(option), options[option]]
+        "-jobconf %s=%s" % [options.description_for(option), options[option]]
       end
     end

data/lib/wukong/store.rb CHANGED

@@ -1,14 +1,15 @@
-module Monkeyshines
+module Wukong
   module Store
-    extend FactoryModule
-    autoload :Base,                 'monkeyshines/store/base'
-    autoload :FlatFileStore,        'monkeyshines/store/flat_file_store'
-    autoload :ConditionalStore,     'monkeyshines/store/conditional_store'
-    autoload :ChunkedFlatFileStore, 'monkeyshines/store/chunked_flat_file_store'
-    autoload :KeyStore,             'monkeyshines/store/key_store'
-    autoload :TokyoTdbKeyStore,     'monkeyshines/store/tokyo_tdb_key_store'
-    autoload :TyrantTdbKeyStore,    'monkeyshines/store/tyrant_tdb_key_store'
-    autoload :TyrantRdbKeyStore,    'monkeyshines/store/tyrant_rdb_key_store'
-    autoload :ReadThruStore,        'monkeyshines/store/read_thru_store'
+#    extend FactoryModule
+    autoload :Base,                    'wukong/store/base'
+    autoload :FlatFileStore,           'wukong/store/flat_file_store'
+#    autoload :ConditionalStore,       'monkeyshines/store/conditional_store'
+    autoload :ChunkedFlatFileStore,    'wukong/store/chunked_flat_file_store'
+    autoload :ChhChunkedFlatFileStore, 'wukong/store/chh_chunked_flat_file_store'
+#    autoload :KeyStore,               'monkeyshines/store/key_store'
+#    autoload :TokyoTdbKeyStore,       'monkeyshines/store/tokyo_tdb_key_store'
+#    autoload :TyrantTdbKeyStore,      'monkeyshines/store/tyrant_tdb_key_store'
+#    autoload :TyrantRdbKeyStore,      'monkeyshines/store/tyrant_rdb_key_store'
+#    autoload :ReadThruStore,          'monkeyshines/store/read_thru_store'
   end
 end

data/lib/wukong/store/base.rb CHANGED

@@ -1,18 +1,16 @@
-module Monkeyshines
+module Wukong
   module Store
     class Base
-      attr_accessor :options
-      def initialize _options={}
-        self.options = _options
-        Log.info "Creating #{self.class}"
+      def initialize options={}
+        Log.info "Creating #{self.class} with #{options.inspect}"
       end
-      #
+      #Iterate through each object casting it as a new object of klass.
       def each_as klass, &block
         self.each do |*args|
           begin
             item = klass.new *args[1..-1]
-          rescue Exception => e
+          rescue StandardError => e
             Log.info [args, e.to_s, self].join("\t")
             raise e
           end

data/lib/wukong/store/chh_chunked_flat_file_store.rb ADDED

@@ -0,0 +1,37 @@
+module Wukong
+  module Store
+    class ChhChunkedFlatFileStore < Wukong::Store::FlatFileStore
+      attr_accessor :filename_pattern, :handle, :rootdir
+      # Move to configliere
+      Settings.define :chunk_file_pattern,   :default => ":rootdir/:date/:handle:timestamp-:pid.tsv",:description => "The pattern for chunked files."
+      Settings.define :chunk_file_rootdir,   :default => nil, :description => "The root directory for the chunked files."
+      #Note that filemode is inherited from flat_file
+      def initialize options={}
+        # super wants a :filename in the options or it will fail. We need to get the initial filename
+        # set up before we call super, so we need all of the parts of the pattern set up.
+        self.rootdir          = options[:rootdir]   || Settings[:chunk_file_rootdir]
+        self.handle           = options[:handle]
+        pattern               = options[:pattern] || Settings[:chunk_file_pattern]
+        self.filename_pattern = FilenamePattern.new(pattern, :handle => handle, :rootdir => self.rootdir)
+        options[:filename]    = filename_pattern.make()
+        super options
+        self.mkdir!
+      end
+      def new_chunk
+        new_filename = filename_pattern.make()
+        Log.info "Rotating chunked file #{filename} into #{new_filename}"
+        self.flush
+        self.close
+        @filename = new_filename
+        self.mkdir!
+      end
+    end
+  end
+end

data/lib/wukong/store/chunked_flat_file_store.rb CHANGED

@@ -1,22 +1,29 @@
-module Monkeyshines
+module Wukong
   module Store
-    class ChunkedFlatFileStore < Monkeyshines::Store::FlatFileStore
-      attr_accessor :filename_pattern, :chunk_monitor, :handle
+    class ChunkedFlatFileStore < Wukong::Store::FlatFileStore
+      attr_accessor :filename_pattern, :chunk_monitor, :handle, :chunktime, :rootdir
-      DEFAULT_OPTIONS = {
-        :chunktime    => 4*60*60, # default 4 hours
-        :pattern   => ":rootdir/:date/:handle+:timestamp-:pid.tsv",
-        :rootdir   => nil,
-        :filemode  => 'w',
-      }
+      # Move to configliere
+      Settings.define :chunk_file_pattern,   :default => ":rootdir/:date/:handle:timestamp-:pid.tsv",:description => "The pattern for chunked files."
+      Settings.define :chunk_file_chunktime, :default => 4*60*60,:description => "The time interval to keep a chunk file open."
+      Settings.define :chunk_file_rootdir,   :default => nil, :description => "The root directory for the chunked files."
+      #Note that filemode is inherited from flat_file
-      def initialize _options
-        self.options = DEFAULT_OPTIONS.deep_merge(_options)
-        raise "You don't really want a chunk time this small: #{options[:chunktime]}" unless options[:chunktime] > 600
-        self.chunk_monitor    = Monkeyshines::Monitor::PeriodicMonitor.new( :time => options[:chunktime] )
-        self.handle           = options[:handle] || Monkeyshines::CONFIG[:handle]
-        self.filename_pattern = Monkeyshines::Utils::FilenamePattern.new(options[:pattern], :handle => handle, :rootdir => options[:rootdir])
-        super options.merge(:filename => filename_pattern.make())
+      def initialize options={}
+        # super wants a :filename in the options or it will fail. We need to get the initial filename
+        # set up before we call super, so we need all of the parts of the pattern set up.
+        self.chunktime        = options[:chunktime] || Settings[:chunk_file_chunktime]
+        self.rootdir          = options[:rootdir]   || Settings[:chunk_file_rootdir]
+        self.handle           = options[:handle]
+        pattern               = options[:pattern] || Settings[:chunk_file_pattern]
+        self.filename_pattern = FilenamePattern.new(pattern, :handle => handle, :rootdir => self.rootdir)
+        options[:filename]    = filename_pattern.make()
+        super options
+        Log.warn "You don't really want a chunk time this small: #{self.chunktime}" unless self.chunktime > 600
+        self.chunk_monitor    = Wukong::PeriodicMonitor.new( :time => self.chunktime )
         self.mkdir!
       end

data/lib/wukong/store/flat_file_store.rb CHANGED

@@ -1,6 +1,6 @@
 require 'fileutils'; include FileUtils
-module Monkeyshines
+module Wukong
   module Store
     #
     class FlatFileStore < Store::Base
@@ -10,7 +10,7 @@ module Monkeyshines
       # +filename_root+  : first part of name for files
       #
       def initialize options={}
-        Log.debug "New #{self.class} as #{options.inspect}"
+        super options
         self.filename = options[:filename] or raise "Missing filename in #{self.class}"
         self.filemode = options[:filemode] || 'r'
         skip!(options[:skip]) if options[:skip]
@@ -21,7 +21,6 @@ module Monkeyshines
       #
       def each &block
         file.each do |line|
-          next if line[0..0] == '#'
           attrs = line.chomp.split("\t")
           next if attrs.blank?
           yield *attrs
@@ -54,6 +53,10 @@ module Monkeyshines
         @file = nil
       end
+      def flush
+        @file.flush if @file
+      end
       # Ensure the file's directory exists
       def mkdir!
         dir = File.dirname(filename)
@@ -64,7 +67,7 @@ module Monkeyshines
       # write to the file
       def save obj
-        file << obj.to_flat.join("\t")+"\n"
+        file.puts obj
         obj
       end
@@ -74,14 +77,10 @@ module Monkeyshines
         File.size(filename)
       end
-      def set key, *args, &block
-        tok, obj = block.call
-        save obj
-      end
-      # delegates to +#save+ -- writes the object to the file
+      # delegates to +#save+ -- writes the object to the file. Returns self for chaining on the stream.
       def <<(obj)
         save obj
+	self
       end
     end

data/lib/wukong/streamer/em_streamer.rb ADDED

@@ -0,0 +1,7 @@
+module Wukong
+  module Streamer
+    class EmStreamer
+    end
+  end
+end

data/wukong.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{wukong}
-  s.version = "1.5.2"
+  s.version = "1.5.3"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Philip (flip) Kromer"]
-  s.date = %q{2010-08-11}
+  s.date = %q{2010-08-19}
   s.description = %q{  Treat your dataset like a:
       * stream of lines when it's efficient to process by lines
@@ -37,6 +37,7 @@ Gem::Specification.new do |s|
      "bin/hdp-bzip",
      "bin/hdp-cat",
      "bin/hdp-catd",
+     "bin/hdp-cp",
      "bin/hdp-du",
      "bin/hdp-get",
      "bin/hdp-kill",
@@ -138,6 +139,7 @@ Gem::Specification.new do |s|
      "examples/count_keys.rb",
      "examples/count_keys_at_mapper.rb",
      "examples/emr/elastic_mapreduce_example.rb",
+     "examples/emr/emr.yaml",
      "examples/keystore/cassandra_batch_test.rb",
      "examples/keystore/conditional_outputter_example.rb",
      "examples/network_graph/adjacency_list.rb",
@@ -185,6 +187,7 @@ Gem::Specification.new do |s|
      "lib/wukong/extensions/string.rb",
      "lib/wukong/extensions/struct.rb",
      "lib/wukong/extensions/symbol.rb",
+     "lib/wukong/filename_pattern.rb",
      "lib/wukong/keystore/cassandra_conditional_outputter.rb",
      "lib/wukong/keystore/redis_db.rb",
      "lib/wukong/keystore/tyrant_db.rb",
@@ -205,6 +208,7 @@ Gem::Specification.new do |s|
      "lib/wukong/script/local_command.rb",
      "lib/wukong/store.rb",
      "lib/wukong/store/base.rb",
+     "lib/wukong/store/chh_chunked_flat_file_store.rb",
      "lib/wukong/store/chunked_flat_file_store.rb",
      "lib/wukong/store/conditional_store.rb",
      "lib/wukong/store/factory.rb",
@@ -222,6 +226,7 @@ Gem::Specification.new do |s|
      "lib/wukong/streamer/count_keys.rb",
      "lib/wukong/streamer/count_lines.rb",
      "lib/wukong/streamer/counting_reducer.rb",
+     "lib/wukong/streamer/em_streamer.rb",
      "lib/wukong/streamer/filter.rb",
      "lib/wukong/streamer/line_streamer.rb",
      "lib/wukong/streamer/list_reducer.rb",

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wukong
 version: !ruby/object:Gem::Version
-  hash: 7
+  hash: 5
   prerelease: false
   segments:
   - 1
   - 5
-  - 2
-  version: 1.5.2
+  - 3
+  version: 1.5.3
 platform: ruby
 authors:
 - Philip (flip) Kromer
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-11 00:00:00 -05:00
+date: 2010-08-19 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -131,6 +131,7 @@ files:
 - bin/hdp-bzip
 - bin/hdp-cat
 - bin/hdp-catd
+- bin/hdp-cp
 - bin/hdp-du
 - bin/hdp-get
 - bin/hdp-kill
@@ -232,6 +233,7 @@ files:
 - examples/count_keys.rb
 - examples/count_keys_at_mapper.rb
 - examples/emr/elastic_mapreduce_example.rb
+- examples/emr/emr.yaml
 - examples/keystore/cassandra_batch_test.rb
 - examples/keystore/conditional_outputter_example.rb
 - examples/network_graph/adjacency_list.rb
@@ -279,6 +281,7 @@ files:
 - lib/wukong/extensions/string.rb
 - lib/wukong/extensions/struct.rb
 - lib/wukong/extensions/symbol.rb
+- lib/wukong/filename_pattern.rb
 - lib/wukong/keystore/cassandra_conditional_outputter.rb
 - lib/wukong/keystore/redis_db.rb
 - lib/wukong/keystore/tyrant_db.rb
@@ -299,6 +302,7 @@ files:
 - lib/wukong/script/local_command.rb
 - lib/wukong/store.rb
 - lib/wukong/store/base.rb
+- lib/wukong/store/chh_chunked_flat_file_store.rb
 - lib/wukong/store/chunked_flat_file_store.rb
 - lib/wukong/store/conditional_store.rb
 - lib/wukong/store/factory.rb
@@ -316,6 +320,7 @@ files:
 - lib/wukong/streamer/count_keys.rb
 - lib/wukong/streamer/count_lines.rb
 - lib/wukong/streamer/counting_reducer.rb
+- lib/wukong/streamer/em_streamer.rb
 - lib/wukong/streamer/filter.rb
 - lib/wukong/streamer/line_streamer.rb
 - lib/wukong/streamer/list_reducer.rb