RubyGems - wukong - Versions diffs - 0.1.1 - Mend

wukong 0.1.1

Files changed (143) hide show

data/LICENSE.textile +107 -0
data/README.textile +166 -0
data/bin/cutc +30 -0
data/bin/cuttab +5 -0
data/bin/greptrue +8 -0
data/bin/hdp-cat +3 -0
data/bin/hdp-catd +3 -0
data/bin/hdp-du +81 -0
data/bin/hdp-get +3 -0
data/bin/hdp-kill +3 -0
data/bin/hdp-ls +10 -0
data/bin/hdp-mkdir +3 -0
data/bin/hdp-mv +3 -0
data/bin/hdp-parts_to_keys.rb +77 -0
data/bin/hdp-ps +3 -0
data/bin/hdp-put +3 -0
data/bin/hdp-rm +11 -0
data/bin/hdp-sort +29 -0
data/bin/hdp-stream +29 -0
data/bin/hdp-stream-flat +18 -0
data/bin/hdp-sync +17 -0
data/bin/hdp-wc +67 -0
data/bin/md5sort +20 -0
data/bin/tabchar +5 -0
data/bin/uniqc +3 -0
data/bin/wu-hist +3 -0
data/bin/wu-lign +177 -0
data/bin/wu-sum +30 -0
data/doc/INSTALL.textile +41 -0
data/doc/LICENSE.textile +107 -0
data/doc/README-tutorial.textile +163 -0
data/doc/README-wulign.textile +59 -0
data/doc/README-wutils.textile +128 -0
data/doc/TODO.textile +61 -0
data/doc/UsingWukong-part1-setup.textile +2 -0
data/doc/UsingWukong-part2-scraping.textile +2 -0
data/doc/UsingWukong-part3-parsing.textile +132 -0
data/doc/code/api_response_example.txt +20 -0
data/doc/code/parser_skeleton.rb +38 -0
data/doc/hadoop-nfs.textile +51 -0
data/doc/hadoop-setup.textile +29 -0
data/doc/index.textile +124 -0
data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
data/doc/links.textile +42 -0
data/doc/overview.textile +91 -0
data/doc/pig/PigLatinExpressionsList.txt +122 -0
data/doc/pig/PigLatinReferenceManual.html +19134 -0
data/doc/pig/PigLatinReferenceManual.txt +1640 -0
data/doc/tips.textile +116 -0
data/doc/usage.textile +102 -0
data/doc/utils.textile +48 -0
data/examples/README.txt +17 -0
data/examples/and_pig/sample_queries.rb +128 -0
data/examples/apache_log_parser.rb +53 -0
data/examples/count_keys.rb +56 -0
data/examples/count_keys_at_mapper.rb +57 -0
data/examples/graph/adjacency_list.rb +74 -0
data/examples/graph/breadth_first_search.rb +79 -0
data/examples/graph/gen_2paths.rb +68 -0
data/examples/graph/gen_multi_edge.rb +103 -0
data/examples/graph/gen_symmetric_links.rb +53 -0
data/examples/package-local.rb +100 -0
data/examples/package.rb +96 -0
data/examples/pagerank/README.textile +6 -0
data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
data/examples/pagerank/pagerank.rb +88 -0
data/examples/pagerank/pagerank_initialize.rb +46 -0
data/examples/pagerank/run_pagerank.sh +19 -0
data/examples/rank_and_bin.rb +173 -0
data/examples/run_all.sh +47 -0
data/examples/sample_records.rb +44 -0
data/examples/size.rb +60 -0
data/examples/word_count.rb +95 -0
data/lib/wukong.rb +11 -0
data/lib/wukong/and_pig.rb +62 -0
data/lib/wukong/and_pig/README.textile +12 -0
data/lib/wukong/and_pig/as.rb +37 -0
data/lib/wukong/and_pig/data_types.rb +30 -0
data/lib/wukong/and_pig/functions.rb +50 -0
data/lib/wukong/and_pig/generate.rb +85 -0
data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
data/lib/wukong/and_pig/junk.rb +51 -0
data/lib/wukong/and_pig/operators.rb +8 -0
data/lib/wukong/and_pig/operators/compound.rb +29 -0
data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
data/lib/wukong/and_pig/operators/execution.rb +15 -0
data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
data/lib/wukong/and_pig/operators/foreach.rb +98 -0
data/lib/wukong/and_pig/operators/groupies.rb +212 -0
data/lib/wukong/and_pig/operators/load_store.rb +65 -0
data/lib/wukong/and_pig/operators/meta.rb +42 -0
data/lib/wukong/and_pig/operators/relational.rb +129 -0
data/lib/wukong/and_pig/pig_struct.rb +48 -0
data/lib/wukong/and_pig/pig_var.rb +95 -0
data/lib/wukong/and_pig/symbol.rb +29 -0
data/lib/wukong/and_pig/utils.rb +0 -0
data/lib/wukong/bad_record.rb +18 -0
data/lib/wukong/boot.rb +47 -0
data/lib/wukong/datatypes.rb +24 -0
data/lib/wukong/datatypes/enum.rb +123 -0
data/lib/wukong/dfs.rb +80 -0
data/lib/wukong/encoding.rb +111 -0
data/lib/wukong/extensions.rb +15 -0
data/lib/wukong/extensions/array.rb +18 -0
data/lib/wukong/extensions/blank.rb +93 -0
data/lib/wukong/extensions/class.rb +189 -0
data/lib/wukong/extensions/date_time.rb +24 -0
data/lib/wukong/extensions/emittable.rb +82 -0
data/lib/wukong/extensions/hash.rb +120 -0
data/lib/wukong/extensions/hash_like.rb +119 -0
data/lib/wukong/extensions/hashlike_class.rb +47 -0
data/lib/wukong/extensions/module.rb +2 -0
data/lib/wukong/extensions/pathname.rb +27 -0
data/lib/wukong/extensions/string.rb +65 -0
data/lib/wukong/extensions/struct.rb +17 -0
data/lib/wukong/extensions/symbol.rb +11 -0
data/lib/wukong/logger.rb +53 -0
data/lib/wukong/models/graph.rb +27 -0
data/lib/wukong/rdf.rb +104 -0
data/lib/wukong/schema.rb +37 -0
data/lib/wukong/script.rb +265 -0
data/lib/wukong/script/hadoop_command.rb +111 -0
data/lib/wukong/script/local_command.rb +14 -0
data/lib/wukong/streamer.rb +13 -0
data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
data/lib/wukong/streamer/base.rb +76 -0
data/lib/wukong/streamer/count_keys.rb +30 -0
data/lib/wukong/streamer/count_lines.rb +26 -0
data/lib/wukong/streamer/filter.rb +20 -0
data/lib/wukong/streamer/line_streamer.rb +12 -0
data/lib/wukong/streamer/list_reducer.rb +20 -0
data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
data/lib/wukong/streamer/set_reducer.rb +14 -0
data/lib/wukong/streamer/struct_streamer.rb +48 -0
data/lib/wukong/streamer/summing_reducer.rb +29 -0
data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
data/lib/wukong/typed_struct.rb +12 -0
data/lib/wukong/wukong_class.rb +21 -0
data/spec/bin/hdp-wc_spec.rb +4 -0
data/spec/spec_helper.rb +0 -0
data/wukong.gemspec +179 -0
metadata +214 -0

data/examples/run_all.sh ADDED Viewed

@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+src_path="tmp/README.textile"
+out_root="tmp/test"
+hdp_opts="--map_tasks=1 --reduce_tasks=1"
+# ---------------------------------------------------------------------------
+#
+# Set up directories and copy over sample input
+#
+# hdp-rm ${src_path}
+# hdp-put `dirname $0`/../README.textile tmp/
+# hdp-mkdir $out_root
+# ---------------------------------------------------------------------------
+#
+# Run scripts
+#
+cmd="word_count"
+    # hdp-rm -r ${out_root}/${cmd}
+    # ./examples/${cmd}.rb  	--run $hdp_opts $src_path ${out_root}/${cmd}
+    # hdp-catd ${out_root}/${cmd} | head -n 20
+    word_count=${out_root}/${cmd}
+cmd="sample_records"
+    # hdp-rm -r ${out_root}/${cmd}
+    # ./examples/${cmd}.rb  	--sampling_fraction=0.8 \
+    #     			--run $hdp_opts $src_path ${out_root}/${cmd}
+    # hdp-catd ${out_root}/${cmd} | head -n 200 | tail -n 20
+    sample_records=${out_root}/${cmd}
+# cmd="size"
+#     hdp-rm -r ${out_root}/${cmd}
+#     ./examples/${cmd}.rb  	--run $hdp_opts $src_path ${out_root}/${cmd}
+#     hdp-catd ${out_root}/${cmd}
+#     size=${out_root}/${cmd}
+cmd="count_keys"
+    hdp-rm -r ${out_root}/${cmd}
+    ./examples/${cmd}.rb  	--run $hdp_opts $word_count ${out_root}/${cmd}
+    hdp-catd ${out_root}/${cmd}  | head -n 200 | tail -n 20
+    count_keys=${out_root}/${cmd}

data/examples/sample_records.rb ADDED Viewed

@@ -0,0 +1,44 @@
+#!/usr/bin/env ruby
+$: << File.dirname(__FILE__)+'/../lib'
+require 'wukong'
+#
+# Probabilistically emit some fraction of records/lines
+#
+# Set the sampling fraction at the command line using the
+#   --sampling_fraction=
+# option: for example, to take a random 1/1000th of the lines in huge_files,
+#  ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
+#
+class Mapper < Wukong::Streamer::LineStreamer
+  include Wukong::Streamer::Filter
+  #
+  # floating-point number between 0 and 1 giving the fraction of lines to emit:
+  # at sampling_fraction=1 all records are emitted, at 0 none are.
+  #
+  # Takes its value from a mandatory command-line option
+  #
+  def sampling_fraction
+    @sampling_fraction ||= ( options[:sampling_fraction] && options[:sampling_fraction].to_f ) or
+      raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1"
+  end
+  #
+  # randomly decide to emit +sampling_fraction+ fraction of lines
+  #
+  def emit? line
+    rand < self.sampling_fraction
+  end
+end
+class Script < Wukong::Script
+  def default_options
+    super.merge :reduce_tasks => 0
+  end
+end
+#
+# Executes the script
+#
+Script.new( Mapper, nil ).run

data/examples/size.rb ADDED Viewed

@@ -0,0 +1,60 @@
+#!/usr/bin/env ruby
+$: << File.dirname(__FILE__)+'/../lib'
+require 'wukong'
+module Size
+  #
+  # Feed the entire dataset through wc and sum the results
+  #
+  class Script < Wukong::Script
+    #
+    # Don't implement a wukong script to do something if there's a unix command
+    # that does it faster: just override map_command or reduce_command in your
+    # subclass of Wukong::Script to return the complete command line
+    #
+    def map_command
+      '/usr/bin/wc'
+    end
+    # Make all records go to one reducer
+    def default_options
+      super.merge :reduce_tasks => 1
+    end
+  end
+  #
+  # Sums the numeric value of each column in its input
+  #
+  class Reducer < Wukong::Streamer::Base
+    attr_accessor :sums
+    #
+    # The unix +wc+ command separates its columns with runs of spaces, not
+    # tabs, so we'll recordize accordingly.
+    #
+    def recordize line
+      line.strip.split(/\s+/)
+    end
+    #
+    # add each corresponding column in the input
+    #
+    def process *vals
+      self.sums = vals.zip( sums || [] ).map{|val,sum| val.to_i + sum.to_i }
+    end
+    #
+    # run through the whole reduction input and then output the total
+    #
+    def stream *args
+      super *args
+      emit sums
+    end
+  end
+end
+# Execute the script
+Size::Script.new(
+  nil,
+  Size::Reducer
+  ).run

data/examples/word_count.rb ADDED Viewed

@@ -0,0 +1,95 @@
+#!/usr/bin/env ruby
+$: << File.dirname(__FILE__)+'/../lib'
+require 'wukong'
+module WordCount
+  class Mapper < Wukong::Streamer::LineStreamer
+    #
+    # Split a string into its constituent words.
+    #
+    # This is pretty simpleminded:
+    # * downcase the word
+    # * Split at any non-alphanumeric boundary, including '_'
+    # * However, preserve the special cases of 's or 't at the end of a
+    #   word.
+    #
+    #   tokenize("Jim's dawg won't hunt: dawg_hunt error #3007a4")
+    #   # => ["jim's", "dawg", "won't", "hunt", "dawg", "hunt", "error", "3007a4"]
+    #
+    def tokenize str
+      return [] unless str
+      str = str.downcase;
+      # kill off all punctuation except [stuff]'s or [stuff]'t
+      # this includes hyphens (words are split)
+      str = str.
+        gsub(/[^a-zA-Z0-9\']+/, ' ').
+        gsub(/(\w)\'([st])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
+      # Busticate at whitespace
+      words = str.strip.split(/\s+/)
+      words.reject!{|w| w.blank? }
+      words
+    end
+    #
+    # Emit each word in each line.
+    #
+    def process line
+      tokenize(line).each{|word| yield [word, 1] }
+    end
+  end
+  #
+  # Accumulate the sum record-by-record:
+  #
+  class Reducer0 < Wukong::Streamer::Base
+    attr_accessor :key_count
+    def process word, count
+      @last_word ||= word
+      if (@last_word == word)
+        self.key_count += 1
+      else
+        yield [ @last_word, key_count ]
+        @last_word = word
+      end
+    end
+    def stream
+      emit @last_word, key_count
+    end
+  end
+  #
+  # You can stack up all the values in a list then sum them at once:
+  #
+  require 'active_support/core_ext/enumerable'
+  class Reducer1 < Wukong::Streamer::ListReducer
+    def finalize
+      yield [ key, values.map(&:last).map(&:to_i).sum ]
+    end
+  end
+  #
+  # A bit kinder to your memory manager: accumulate the sum record-by-record:
+  #
+  class Reducer2 < Wukong::Streamer::AccumulatingReducer
+    attr_accessor :key_count
+    def start!(*args)      self.key_count =  0 end
+    def accumulate(*args)  self.key_count += 1 end
+    def finalize
+      yield [ key, key_count ]
+    end
+  end
+  #
+  # ... easiest of all, though: this is common enough that it's already included
+  #
+  require 'wukong/streamer/count_keys'
+  class Reducer3 < Wukong::Streamer::CountKeys
+  end
+end
+# Execute the script
+Wukong::Script.new(
+  WordCount::Mapper,
+  WordCount::Reducer1
+  ).run

data/lib/wukong.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require 'wukong/boot'
+require 'wukong/extensions'
+require 'wukong/datatypes'
+require 'wukong/logger'
+require 'wukong/bad_record'
+autoload :TypedStruct, 'wukong/typed_struct'
+module Wukong
+  autoload :Dfs,         'wukong/dfs'
+  autoload :Script,      'wukong/script'
+  autoload :Streamer,    'wukong/streamer'
+end

data/lib/wukong/and_pig.rb ADDED Viewed

@@ -0,0 +1,62 @@
+require 'wukong/and_pig/pig_var'
+require 'wukong/and_pig/as'
+require 'wukong/and_pig/functions'
+require 'wukong/and_pig/operators'
+require 'wukong/and_pig/data_types'
+require 'wukong/and_pig/pig_struct'
+require 'wukong/and_pig/generate'
+require 'wukong/and_pig/symbol'
+require 'wukong/and_pig/utils'
+module Wukong
+  #
+  # Wukong::AndPig lets you generate and run pig[http://hadoop.apache.org/pig]
+  # code from within ruby (and interactively, from the +irb+ console).
+  #
+  # It uses the same typed structures you've defined for Wukong to create
+  # type-aware pig commands. For example, the Wukong class
+  #
+  #   class Customer < TypedStruct.new( [:id, Integer],
+  #     [:name, String], [:postal_code, Integer], [:balance, Float] )
+  #   end
+  #
+  # will generate a LOAD command for pig as
+  #
+  #   Customer1.pig_load('q4_reports/customers.tsv').set!
+  #   # => Q4ReportsCustomers2 = LOAD 'q4_reports/customers.tsv'
+  #        AS (id: int, name: chararray, postal_code: int, balance: float) ;
+  #
+  # You can write anonymous chains
+  #
+  #   q1 = Customer1.
+  #     pig_load('q4_reports/customers.tsv').set!.
+  #     distinct.set! ;
+  #   q1.
+  #     group(:by => :postal_code).set!.
+  #     generate([:group, :postal_code], ["COUNT(#{q1.relation})", :customers_per_zip]).set!.
+  #     store!
+  #
+  #   Q4ReportsCustomers35    = LOAD    'q4_reports/customers.tsv' AS (id: int,name: chararray,postal_code: int,balance: float) ;
+  #   Q4ReportsCustomers36    = DISTINCT Q4ReportsCustomers35 ;
+  #   Q4ReportsCustomers37    = GROUP    Q4ReportsCustomers36 BY postal_code ;
+  #   Q4ReportsCustomers38    = FOREACH  Q4ReportsCustomers37 GENERATE
+  #       group AS postal_code,
+  #       COUNT(Q4ReportsCustomers36) AS customers_per_zip ;
+  #
+  # ---------------------------------------------------------------------------
+  #
+  # Note on pig:
+  #
+  # 1) Reverse the order of your tables in your join statement. Pig always
+  #    streams the keys of the last input (materializing in memory the keys of
+  #    the first), so if one of your inputs has fewer instances of a given key
+  #    this may help.
+  #
+  # 2) Reduce the number of maps and reducers per machine and give it all the
+  #    memory you can.
+  #
+  #
+  module AndPig
+  end
+end

data/lib/wukong/and_pig/README.textile ADDED Viewed

@@ -0,0 +1,12 @@
+Wukong::AndPig is a small library to more easily generate code for the
+"Pig":http://hadoop.apache.org/pig data analysis language.
+Wukong::AndPig lets you use the structs from your Wukong scripts to
+generate Pig instructions that know their types and structure -- even through
+multiple pig commands. For example, if you use @FOREACH ... GENERATE@ to select
+only a few of those fields, Wukong::AndPig will know that the result has only
+those fields.
+We're still trying to figure out if this is a stupid and crazy idea, or just a
+crazy idea: Yeah, we're using a functional/OO scripting language to generate code for an
+imperative query language that generates Java code for ad-hoc map-reduce operations.

data/lib/wukong/and_pig/as.rb ADDED Viewed

@@ -0,0 +1,37 @@
+class AS
+  attr_accessor :expr, :name, :type, :ref, :options
+  def initialize expr, name=nil, type=nil, ref=nil, *option_flags
+    case expr
+    when AS
+      self.expr = expr.expr
+      self.name = expr.name
+      self.type = expr.type
+      self.ref  = expr.ref
+      self.options = expr.options
+    end
+    self.expr ||= expr
+    self.name = name if name
+    self.type = type if type
+    self.ref  = ref  if ref
+    self.options ||= { }
+    option_flags.each{|option| self.options[option] = true }
+  end
+  def to_s
+    clause  = "%-30s \t" % [ref, expr].compact.join('::')
+    if name
+      clause << "AS #{name}"      unless options[:skip_name]
+      clause << ":#{type.typify}" unless ((!type) || options[:skip_type])
+    end
+    clause
+  end
+  def self.[] *args
+    self.new *args
+  end
+  # Useful for feeding back into TypedStruct
+  def name_type
+    [name, type]
+  end
+end

data/lib/wukong/and_pig/data_types.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# == SimpleDataTypes ==
+# int
+# long
+# double
+# arrays
+# chararray
+# bytearray
+#
+# == ComplexDataTypes ==
+# tuple
+# bag
+# map
+module Wukong
+  module AndPig
+    class PigVar
+    end
+  end
+end
+# class ScalarInteger  < TypedStruct.new [
+#     [:count,    Integer  ],
+#   ]
+#   include Wukong::AndPig::PigEmitter
+#   def self.load_scalar path
+#     var = super path
+#     var.to_i
+#   end
+# end

data/lib/wukong/and_pig/functions.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# == Built-in Functions
+# EvalFunctions
+# AVG
+# CONCAT
+# COUNT
+# DIFF
+# MIN
+# MAX
+# SIZE
+# SUM
+# TOKENIZE
+# == NullOperators
+# is null
+# is not null
+#
+# == BooleanOperators
+# and
+# or
+# not
+#
+# == DereferenceOperators
+# tuple dereference  .
+# map dereference    #
+#
+# == SignOperators
+# positive  +
+# negative  -
+#
+# == CastOperators
+# (type)$0
+# (type)alias
+#
+# == ArithmeticOperators
+# addition        +
+# subtraction     -
+# multiplication  *
+# division        /
+# modulo          %
+# bincond         ? :
+#
+# == ComparisonOperators
+# equal                     ==
+# not equal                 !=
+# less than                 <
+# greater than              >
+# less than or equal to     <=
+# greater than or equal to  >=
+# pattern matching          matches