RubyGems - wukong - Versions diffs - 1.4.7 → 1.4.9 - Mend

wukong 1.4.7 → 1.4.9

Files changed (62) hide show

data/CHANGELOG.textile +9 -0
data/README.textile +1 -1
data/bin/hdp-bzip +28 -0
data/bin/hdp-mkdir +1 -1
data/bin/hdp-stream-flat +3 -2
data/bin/wu-lign +32 -18
data/docpages/pig/cookbook.html +481 -0
data/docpages/pig/images/hadoop-logo.jpg +0 -0
data/docpages/pig/images/instruction_arrow.png +0 -0
data/docpages/pig/images/pig-logo.gif +0 -0
data/docpages/pig/piglatin_ref1.html +1103 -0
data/docpages/pig/piglatin_ref2.html +14340 -0
data/docpages/pig/setup.html +505 -0
data/docpages/pig/skin/basic.css +166 -0
data/docpages/pig/skin/breadcrumbs.js +237 -0
data/docpages/pig/skin/fontsize.js +166 -0
data/docpages/pig/skin/getBlank.js +40 -0
data/docpages/pig/skin/getMenu.js +45 -0
data/docpages/pig/skin/images/chapter.gif +0 -0
data/docpages/pig/skin/images/chapter_open.gif +0 -0
data/docpages/pig/skin/images/current.gif +0 -0
data/docpages/pig/skin/images/external-link.gif +0 -0
data/docpages/pig/skin/images/header_white_line.gif +0 -0
data/docpages/pig/skin/images/page.gif +0 -0
data/docpages/pig/skin/images/pdfdoc.gif +0 -0
data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
data/docpages/pig/skin/print.css +54 -0
data/docpages/pig/skin/profile.css +181 -0
data/docpages/pig/skin/screen.css +587 -0
data/docpages/pig/tutorial.html +1059 -0
data/docpages/pig/udf.html +1509 -0
data/examples/keystore/conditional_outputter_example.rb +70 -0
data/examples/{graph → network_graph}/adjacency_list.rb +0 -0
data/examples/{graph → network_graph}/breadth_first_search.rb +0 -0
data/examples/{graph → network_graph}/gen_2paths.rb +0 -0
data/examples/{graph → network_graph}/gen_multi_edge.rb +0 -0
data/examples/{graph → network_graph}/gen_symmetric_links.rb +0 -0
data/examples/pagerank/run_pagerank.sh +10 -8
data/examples/{apache_log_parser.rb → server_logs/apache_log_parser.rb} +0 -0
data/examples/stupidly_simple_filter.rb +43 -0
data/lib/wukong/extensions/hash.rb +13 -0
data/lib/wukong/extensions/hash_like.rb +7 -0
data/lib/wukong/keystore/cassandra_conditional_outputter.rb +122 -0
data/lib/wukong/script.rb +27 -22
data/lib/wukong/script/hadoop_command.rb +5 -3
data/lib/wukong/streamer/accumulating_reducer.rb +2 -1
data/wukong.gemspec +64 -26
metadata +89 -31
data/docpages/pig/PigLatinReferenceManual.html +0 -19134
data/examples/foo.rb +0 -9
data/examples/package-local.rb +0 -100
data/examples/package.rb +0 -96
data/examples/run_all.sh +0 -47

data/examples/keystore/conditional_outputter_example.rb ADDED Viewed

@@ -0,0 +1,70 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'cassandra'
+require 'wukong'
+require 'wukong/encoding'
+require 'wukong/keystore/cassandra_conditional_outputter'
+#
+# Usage:
+#   echo -e "bob has boobs ha ha ha" | ./examples/keystore/conditional_outputter_example.rb  --map
+#
+CASSANDRA_KEYSPACE = 'CorpusAnalysis'
+#
+# This demonstrates the CassandraConditionalOutputter module.
+#
+# CassandraConditionalOutputter uses a cassandra key-value store to
+# track unique IDs and prevent output of any record already present in the
+# database.
+#
+# For this example, it takes an input stream, generates all letter pairs for
+# each line, and emits each distinct letter pair only once.
+#
+#
+class LetterPairMapper <  Wukong::Streamer::LineStreamer
+  include CassandraConditionalOutputter
+  #
+  # A unique key for the given record. If an object with
+  # that key has been seen, it won't be re-emitted.
+  #
+  # In this example, we'll just encode the letter pair
+  #
+  def conditional_output_key record
+    record.to_s.wukong_encode(:url)
+  end
+  #
+  # Yield each pair of adjacent letters in the line;
+  # the CassandraConditionalOutputter will swallow all duplicate pairs.
+  #
+  def process line, &block
+    letter_pairs(line).each do |pair|
+      yield(pair)
+    end
+  end
+  # Turn a string into its pairs of adjacent letters. Strings shorter than
+  # two characters (blank lines included) give an empty array.
+  # @example
+  #   letter_pairs('abra')
+  #   # => ['ab', 'br', 'ra']
+  def letter_pairs str
+    chars = str.chars.to_a
+    chars.each_cons(2).map(&:join)
+  end
+  # Clear the entire cached keys column at the end of the run.
+  # Goes through key_cache (not @key_cache), so it also works when no record
+  # was emitted and the client was never created.
+  # You almost certainly don't want to do this in a real script.
+  def after_stream
+    $stderr.puts 'Clearing conditional_output_key cache...'
+    key_cache.clear_column_family!(conditional_output_key_column)
+  end
+end
+# Execute the script: LetterPairMapper is the mapper class, nil is the reducer class
+Wukong::Script.new( LetterPairMapper, nil ).run

data/examples/{graph → network_graph}/adjacency_list.rb RENAMED Viewed

File without changes

data/examples/{graph → network_graph}/breadth_first_search.rb RENAMED Viewed

File without changes

data/examples/{graph → network_graph}/gen_2paths.rb RENAMED Viewed

File without changes

data/examples/{graph → network_graph}/gen_multi_edge.rb RENAMED Viewed

File without changes

data/examples/{graph → network_graph}/gen_symmetric_links.rb RENAMED Viewed

File without changes

data/examples/pagerank/run_pagerank.sh CHANGED Viewed

@@ -1,19 +1,21 @@
 #!/usr/bin/env bash
 # Directory to pagerank on.
-work_dir=$1 ; shift
-if [ "$work_dir" == '' ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank" ; exit ; fi
+work_dir=$1     ; shift
+if [ "$work_dir" == '' ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank: $0 initial_dir [number_of_iterations] [start_iteration]" ; exit ; fi
+# How many rounds to run: default 10
+n_iters=${1-10} ; shift
+# the iteration to start with: default 0
+start_i=${1-0}  ; shift
-# How many rounds to run
-max_iter=10
 # this directory
 script_dir="`dirname $0`"
-for (( curr=0 , next=1 ; "$curr" < "$max_iter" ; curr++ , next++ )) ; do
-  curr_str=`printf "%03d" ${curr}`
-  next_str=`printf "%03d" ${next}`
+for (( iter=0 ; "$iter" < "$n_iters" ; iter++ )) ; do
+  curr_str=`printf "%03d" $(( $start_i + $iter     ))`
+  next_str=`printf "%03d" $(( $start_i + $iter + 1 ))`
   curr_dir=$work_dir/pagerank_graph_${curr_str}
   next_dir=$work_dir/pagerank_graph_${next_str}
+  echo -e "Iteration $(( $iter + 1 )) / $n_iters:\t `basename $curr_dir` => `basename $next_dir`"
   $script_dir/pagerank.rb --rm --run $curr_dir $next_dir
 done

data/examples/{apache_log_parser.rb → server_logs/apache_log_parser.rb} RENAMED Viewed

File without changes

data/examples/stupidly_simple_filter.rb ADDED Viewed

@@ -0,0 +1,43 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'wukong'
+# Run as (local mode)
+#
+#   ./examples/stupidly_simple_filter.rb --run=local input.tsv output.tsv
+#
+# for hadoop mode,
+#
+#   ./examples/stupidly_simple_filter.rb --run=hadoop input.tsv output.tsv
+#
+# For debugging, run
+#
+#   cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
+#
+#
+# A very simple mapper -- looks for a regex match in one field,
+# and emits the whole record if the field matches
+#
+class GrepMapper < Wukong::Streamer::RecordStreamer
+  MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
+  #
+  # Given a series of records like:
+  #
+  #    tweet  123456789   20100102030405     @frank: I'm having a bacon sandwich
+  #    tweet  123456789   20100102030405     @jerry, I'm having your baby
+  #
+  # emits only the records whose text field matches MATCHER
+  #
+  def process rsrc, id, timestamp, text, *rest
+    yield [rsrc, id, timestamp, text, *rest] if text =~ MATCHER
+  end
+end
+# Execute the script: GrepMapper is the mapper class, nil is the reducer class
+Wukong::Script.new(
+  GrepMapper,
+  nil
+  ).run

data/lib/wukong/extensions/hash.rb CHANGED Viewed

@@ -141,6 +141,19 @@ class Hash
     replace(compact)
   end
+  #
+  # Copy of this hash without the entries whose value is blank?
+  #
+  def compact_blank
+    reject { |_key, value| value.blank? }
+  end
+  # In-place compact_blank: swaps this hash's contents for the filtered copy
+  #
+  def compact_blank!
+    filtered = compact_blank
+    replace(filtered)
+  end
   # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
   def reverse_merge(other_hash)
     other_hash.merge(self)

data/lib/wukong/extensions/hash_like.rb CHANGED Viewed

@@ -88,6 +88,13 @@ module Wukong
       merge hsh2, &Hash::DEEP_MERGER
     end
+    #
+    # Hash of this object's pairs (via to_hash) minus those whose value is blank.
+    # Non-destructive, so a to_hash that returns shared state is never mutated.
+    def compact_blank
+      to_hash.compact_blank
+    end
     module ClassMethods
       #
       # Instantiate an instance of the struct from a hash

data/lib/wukong/keystore/cassandra_conditional_outputter.rb ADDED Viewed

@@ -0,0 +1,122 @@
+#
+# For a stream process that sees a significant number of duplicated heavyweight
+# objects, it may be better to deduplicate them midflight (rather than, say,
+# using a reducer to effectively `cat | sort | uniq` the data).
+#
+# This uses a cassandra key-value store to track unique IDs and prevent output
+# of any record already present in the database.  (Why cassandra? Because we use
+# it in production.  Might be nice to rewrite this example against redis or
+# TokyoTyrant or something less demanding.)
+#
+# Things you have to do:
+#
+# * Override the conditional_output_key method to distinguish identical records
+# * Define a constant CASSANDRA_KEYSPACE giving the Cassandra keyspace you're working in
+# * (Optionally) override conditional_output_key_column
+#
+# * In your cassandra storage-conf.xml, add a column family to your keyspace:
+#
+#     <Keyspace Name="CorpusAnalysis">
+#         <KeysCachedFraction>0.01</KeysCachedFraction>
+#
+#         <!-- Added for CassandraConditionalOutputter -->
+#         <ColumnFamily CompareWith="UTF8Type" Name="LetterPairMapperKeys" />
+#
+#         <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
+#         <ReplicationFactor>1</ReplicationFactor>
+#         <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
+#     </Keyspace>
+#
+#  In this example, the CASSANDRA_KEYSPACE is 'CorpusAnalysis' and the
+#  conditional_output_key_column is 'LetterPairMapperKeys'
+#
+# @example
+#    Given
+#      tweet  123456789   20100102030405     @frank: I'm having a bacon sandwich
+#      tweet      24601   20100104136526     @jerry, I'm having your baby
+#      tweet    8675309   20100102030405     I find pastrami to be the most sensual of the salted, cured meats.
+#      tweet      24601   20100104136526     @jerry, I'm having your baby
+#      tweet       1137   20100119234532     These pretzels are making me thirsty
+#      ....
+#    will emit:
+#      tweet  123456789   20100102030405     @frank: I'm having a bacon sandwich
+#      tweet      24601   20100104136526     @jerry, I'm having your baby
+#      tweet    8675309   20100102030405     I find pastrami to be the most sensual of the salted, cured meats.
+#      (the repeated tweet 24601 is swallowed: its key is already in the key cache)
+#      tweet       1137   20100119234532     These pretzels are making me thirsty
+#      ....
+#
+module CassandraConditionalOutputter
+  #
+  # A unique key for the given record. If an object with
+  # that key has been seen, it won't be re-emitted.
+  #
+  # You will almost certainly want to override this method in your subclass.  Be
+  # sure that the key is a string, and is encoded properly (Cassandra likes to
+  # strip whitespace from keys, for instance).
+  #
+  def conditional_output_key record
+    record.to_s
+  end
+  #
+  # Checks each record against the key cache. Swallows records already there
+  # (returning nil); otherwise registers the record's key and hands the
+  # record to super. The stored data is {'t' => record.timestamp} when the
+  # record has a timestamp method, else {'t' => '0'}.
+  def emit record, &block
+    return unless should_emit?(record)
+    # plain strings (e.g. the letter pairs in the example) have no #timestamp
+    data = record.respond_to?(:timestamp) ? {'t' => record.timestamp} : {'t' => '0'}
+    set_key(conditional_output_key(record), data)
+    super record
+  end
+  # Default. Emit record if its key is not already contained
+  # in the key-value store. Overwrite this as necessary
+  def should_emit? record
+    key = conditional_output_key(record)
+    !has_key?(key)
+  end
+  # True unless the key cache's entry for key is blank
+  def has_key? key
+    not key_cache.get(conditional_output_key_column, key).blank?
+  end
+  # Register key in the key_cache, inserting data (default {'t' => '0'}) for it
+  def set_key key, data={'t' => '0'}
+    key_cache.insert(conditional_output_key_column, key, data)
+  end
+  # Remove key from the key_cache
+  def remove_key key
+    key_cache.remove(conditional_output_key_column, key)
+  end
+  #
+  # Key cache implementation in Cassandra
+  #
+  # The cache: a Cassandra client for CASSANDRA_KEYSPACE, created on first use
+  def key_cache
+    @key_cache ||= Cassandra.new(CASSANDRA_KEYSPACE)
+  end
+  # The column to use for the key cache. By default, the class name plus 'Keys',
+  # but feel free to override.
+  #
+  # @example
+  #
+  #    class FooMapper < Wukong::Streamer::RecordStreamer
+  #      include CassandraConditionalOutputter
+  #    end
+  #    FooMapper.new.conditional_output_key_column
+  #    # => 'FooMapperKeys'
+  #
+  def conditional_output_key_column
+    self.class.to_s+'Keys'
+  end
+end

data/lib/wukong/script.rb CHANGED Viewed

@@ -82,6 +82,7 @@ module Wukong
     Settings.define :default_run_mode, :default => 'hadoop',    :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
     Settings.define :default_mapper,   :default => '/bin/cat',  :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
     Settings.define :default_reducer,  :default => '/bin/cat',  :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
+    Settings.define :map_command,      :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
     Settings.define :hadoop_home,      :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
     Settings.define :hadoop_runner,    :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
     Settings.define :map,              :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
@@ -118,11 +119,11 @@ module Wukong
     #   end
     #   MyScript.new(MyMapper, nil).run
     #
-    def initialize mapper_klass, reducer_klass, extra_options={}
+    def initialize mapper_klass, reducer_klass=nil, extra_options={}
       self.options = Settings.dup
-      options.resolve!
-      options.merge! self.default_options
-      options.merge! extra_options
+      self.options.resolve!
+      self.options.merge! self.default_options
+      self.options.merge! extra_options
       self.mapper_klass  = mapper_klass
       self.reducer_klass = reducer_klass
       # If no reducer_klass and no reduce_command, then skip the reduce phase
@@ -141,24 +142,29 @@ module Wukong
     end
     #
-    # by default, call this script in --map mode
+    # Shell command for map phase. By default, calls the script in --map mode
+    # In hadoop mode, this is given to the hadoop streaming command.
+    # In local mode, it's given to the system() call
     #
     def map_command
-      case
-      when mapper_klass
-        "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
-      else options[:map_command] || options[:default_mapper] end
+      if mapper_klass
+         "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
+      else
+        options[:map_command] || options[:default_mapper]
+      end
     end
     #
-    # Shell command for reduce phase
-    # by default, call this script in --reduce mode
+    # Shell command for reduce phase. By default, calls the script in --reduce mode
+    # In hadoop mode, this is given to the hadoop streaming command.
+    # In local mode, it's given to the system() call
     #
     def reduce_command
-      case
-      when reducer_klass
-        "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
-      else options[:reduce_command] || options[:default_reducer] end
+      if reducer_klass
+         "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
+      else
+        options[:reduce_command]
+      end
     end
     #
@@ -187,10 +193,10 @@ module Wukong
     end
     def input_output_paths
-      # input / output paths
-      input_path, output_path = options.rest[0..1]
-      raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_path.blank? || output_path.blank?)
-      [input_path, output_path]
+      output_path = options.rest.pop
+      input_paths = options.rest.reject(&:blank?)
+      raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
+      [input_paths, output_path]
     end
     def maybe_overwrite_output_paths! output_path
@@ -218,8 +224,7 @@ module Wukong
     def ruby_interpreter_path
       Pathname.new(
                    File.join(Config::CONFIG["bindir"],
-                             Config::CONFIG["RUBY_INSTALL_NAME"]+
-                             Config::CONFIG["EXEEXT"])
+                             Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])
                    ).realpath
     end
@@ -229,10 +234,10 @@ module Wukong
     def exec_hadoop_streaming
       $stderr.puts "Streaming on self"
       input_path, output_path = input_output_paths
-      maybe_overwrite_output_paths! output_path
       command = runner_command(input_path, output_path)
       $stderr.puts command
       unless options[:dry_run]
+        maybe_overwrite_output_paths! output_path
         $stdout.puts `#{command}`
       end
     end

data/lib/wukong/script/hadoop_command.rb CHANGED Viewed

@@ -28,6 +28,7 @@ module Wukong
     Settings.define :reuse_jvms,             :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
     Settings.define :respect_exit_status,    :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
     Settings.define :noempty,                                  :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
+    Settings.define :job_name,               :jobconf => true, :description => 'mapred.job.name', :wukong => true
     # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
     # emit a -jobconf hadoop option if the simplified command line arg is present
@@ -67,12 +68,13 @@ module Wukong
       ]
     end
-    def hadoop_other_args
+    def hadoop_other_args input_path, output_path
       extra_str_args  = [ options[:extra_args] ]
       extra_str_args               += ' -lazyOutput' if options[:noempty]  # don't create reduce file if no records
       options[:reuse_jvms]          = '-1'     if (options[:reuse_jvms] == true)
       options[:respect_exit_status] = 'false'  if (options[:ignore_exit_status] == true)
-      extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt)  }
+      options[:job_name] ||= "#{File.basename(this_script_filename)}---#{input_path}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
+      extra_hsh_args = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt)  }
       extra_str_args + extra_hsh_args
     end
@@ -105,7 +107,7 @@ module Wukong
         "-input   '#{input_path}'",
         "-output  '#{output_path}'",
         hadoop_recycle_env,
-        hadoop_other_args,
+        hadoop_other_args(input_path, output_path),
       ].flatten.compact.join(" \t\\\n  ")
     end