wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
data/bin/hdp-bzip ADDED
@@ -0,0 +1,23 @@
+ #!/bin/bash
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
+   -Dmapred.output.compress=true \
+   -Dmapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
+   -Dmapred.reduce.tasks=1 \
+   -mapper \"/bin/cat\" \
+   -reducer \"/bin/cat\" \
+   -input \"$input_file\" \
+   -output \"$output_file\" \
+   "
+ echo $cmd
+ $cmd
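
For context, a hypothetical invocation of hdp-bzip (the HDFS paths are illustrative). The script assumes a streaming jar under $HADOOP_HOME/contrib/streaming; it runs /bin/cat as both mapper and reducer with a single reduce task, so the only effect is that mapred.output.compress and the BZip2Codec rewrite the input as one bzip2-compressed part file:

    # recompress an HDFS directory into a single bzip2-compressed part file (illustrative paths)
    export HADOOP_HOME=/usr/lib/hadoop
    hdp-bzip /user/flip/raw-logs /user/flip/raw-logs-bz2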
data/bin/hdp-cat ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -cat "$@"
data/bin/hdp-catd ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+ args=`echo "$@" | ruby -ne 'a = $_.split(/\s+/); puts a.map{|arg| arg+"/[^_]*" }.join(" ")'`
+ exec hadoop dfs -cat $args
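
For context, a hypothetical invocation of hdp-catd (the path is illustrative). The embedded Ruby one-liner appends /[^_]* to each argument, so cat-ing a job output directory skips bookkeeping files such as _logs and _SUCCESS:

    hdp-catd /user/flip/wordcount-output
    # runs roughly: hadoop dfs -cat /user/flip/wordcount-output/[^_]*
    # which cats part-00000, part-00001, ... but not _logs or _SUCCESS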
data/bin/hdp-cp ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -cp "$@"
data/bin/hdp-du ADDED
@@ -0,0 +1,86 @@
+ #!/usr/bin/env ruby
+
+ OPTIONS={}
+
+ #
+ # grok options
+ #
+ if ARGV[0] =~ /\A-[sh]+\z/
+   flags = ARGV.shift
+   OPTIONS[:summary] = flags.include?('s')
+   OPTIONS[:humanize] = flags.include?('h')
+ end
+
+ #
+ # Prepare command
+ #
+ def prepare_command
+   dfs_cmd = OPTIONS[:summary] ? 'dus' : 'du'
+   dfs_args = ((!ARGV[0]) || ARGV[0]=='') ? '.' : "'#{ARGV.join("' '")}'"
+   %Q{ hadoop dfs -#{dfs_cmd} #{dfs_args} }
+ end
+
+ Numeric.class_eval do
+   def bytes() self ; end
+   alias :byte :bytes
+   def kilobytes() self * 1024 ; end
+   alias :kilobyte :kilobytes
+   def megabytes() self * 1024.kilobytes ; end
+   alias :megabyte :megabytes
+   def gigabytes() self * 1024.megabytes ; end
+   alias :gigabyte :gigabytes
+   def terabytes() self * 1024.gigabytes ; end
+   alias :terabyte :terabytes
+   def petabytes() self * 1024.terabytes ; end
+   alias :petabyte :petabytes
+   def exabytes() self * 1024.petabytes ; end
+   alias :exabyte :exabytes
+ end
+
+ # Formats the bytes in +size+ into a more understandable representation
+ # (e.g., giving it 1500 yields 1.5 KB). This method is useful for
+ # reporting file sizes to users. This method returns nil if
+ # +size+ cannot be converted into a number. You can change the default
+ # precision of 1 using the precision parameter +precision+.
+ #
+ # ==== Examples
+ #  number_to_human_size(123)           # => 123 Bytes
+ #  number_to_human_size(1234)          # => 1.2 KB
+ #  number_to_human_size(12345)         # => 12.1 KB
+ #  number_to_human_size(1234567)       # => 1.2 MB
+ #  number_to_human_size(1234567890)    # => 1.1 GB
+ #  number_to_human_size(1234567890123) # => 1.1 TB
+ #  number_to_human_size(1234567, 2)    # => 1.18 MB
+ #  number_to_human_size(483989, 0)     # => 4 MB
+ def number_to_human_size(size, precision=1)
+   size = Kernel.Float(size)
+   case
+   when size.to_i == 1; "1 Byte"
+   when size < 1.kilobyte; "%d Bytes" % size
+   when size < 1.megabyte; "%.#{precision}f KB" % (size / 1.0.kilobyte)
+   when size < 1.gigabyte; "%.#{precision}f MB" % (size / 1.0.megabyte)
+   when size < 1.terabyte; "%.#{precision}f GB" % (size / 1.0.gigabyte)
+   else "%.#{precision}f TB" % (size / 1.0.terabyte)
+   end #.sub(/([0-9]\.\d*?)0+ /, '\1 ' ).sub(/\. /,' ')
+ rescue
+   nil
+ end
+
+ OUTPUT_LINE_FMT = "%-71s\t%15d\t%15s"
+ def format_output file, size
+   human_size = number_to_human_size(size) || ""
+   file = file.gsub(%r{hdfs://[^/]+/}, '/') # kill off hdfs paths, otherwise leave it alone
+   OUTPUT_LINE_FMT % [file, size.to_i, human_size]
+ end
+
+ entries_count = 0
+ total_size = 0
+ %x{ #{prepare_command} }.split("\n").each do |line|
+   if line =~ /^Found \d+ items$/ then puts line ; next end
+   info = line.split(/\s+/)
+   if OPTIONS[:summary] then file, size = info else size, file = info end
+   puts format_output(file, size)
+   total_size += size.to_i
+   entries_count += 1
+ end
+ $stderr.puts OUTPUT_LINE_FMT%[" #{"%55d"%entries_count} entries", total_size, number_to_human_size(total_size)]
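
For context, a hypothetical hdp-du invocation with illustrative paths and sizes. A leading flag cluster such as -sh is recognized; s switches to dfs -dus so each argument is summarized as a single line. Each output line holds the path (with any hdfs://host prefix stripped), the raw byte count, and the human-readable size, and a final entries/total line goes to stderr, roughly:

    hdp-du -sh /user/flip/data /user/flip/logs
    # /user/flip/data        1234567890      1.1 GB
    # /user/flip/logs          12345678     11.8 MB
    # (stderr)   2 entries    1246913568      1.2 GB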
data/bin/hdp-get ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -copyToLocal "$1" "$2"
data/bin/hdp-kill ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -kill "$@"
data/bin/hdp-kill-task ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -kill-task "$1"
data/bin/hdp-ls ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/env bash
+
+ if [ "$1" == "-r" ] || [ "$1" == "-R" ] ; then
+   shift
+   action=lsr
+ else
+   action=ls
+ fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+ exec $HADOOP_HOME/bin/hadoop dfs -$action "$@"
data/bin/hdp-mkdir ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env bash
+ exec hadoop fs -mkdir "$@"
data/bin/hdp-mkdirp ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env bash
+
+ #
+ # Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
+ # use it, will fail if (it seems) ANY of its spawned subprocesses fails
+ #
+
+ hadoop fs -test -e "$@"
+ if [ "$?" != "0" ] ; then
+   # echo "File does not exist, making..."
+   exec hadoop fs -mkdir "$@"
+ fi
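
For context, the failure mode the comment above describes, with an illustrative path. A plain hadoop fs -mkdir exits non-zero when the directory already exists, which would abort a workflow runner like Azkaban; hdp-mkdirp only issues the mkdir when hadoop fs -test -e reports the path missing, so it exits 0 either way:

    hadoop fs -mkdir /user/flip/output   # fails (non-zero exit) if the directory already exists
    hdp-mkdirp /user/flip/output         # creates it only when absent; harmless when it exists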
data/bin/hdp-mv ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -mv "$@"
data/bin/hdp-parts_to_keys.rb ADDED
@@ -0,0 +1,77 @@
+ #!/usr/bin/env ruby
+
+ dir_to_rename = ARGV[0]
+ dest_ext = '.tsv'
+
+ unless dir_to_rename && (! dir_to_rename.empty?)
+   warn "Need a directory or file spec to rename."
+   exit
+ end
+
+ #
+ # Setup
+ #
+ warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
+
+ #
+ # Examine the files
+ #
+ file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
+ command_lists = { }
+ file_listings[1..-1].each do |file_listing|
+   m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
+   if !m then warn "Couldn't grok #{file_listing}" ; next ; end
+   size, filename = m.captures
+   case
+   when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
+   else
+     firstline = `hdp-cat #{filename} | head -qn1 `
+     file_key, _ = firstline.split("\t", 2)
+     unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
+       warn "Don't want to rename to '#{file_key}'... skipping"
+       next
+     end
+     dirname = File.dirname(filename)
+     destfile = File.join(dirname, file_key)+dest_ext
+     (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
+   end
+ end
+
+ #
+ # Execute the command_lists
+ #
+ command_lists.each do |type, command_list|
+   case type
+   when :deletes
+     command = "hdp-rm #{command_list.join(" ")}"
+     puts command
+     `#{command}`
+   when :moves
+     command_list.each do |command|
+       puts command
+       `#{command}`
+     end
+   end
+ end
+
+
+ # -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
+
+ # # Killing empty files
+ # find . -size 0 -print -exec rm {} \;
+ #
+ # for foo in part-0* ; do
+ #   newname=`
+ #     head -n1 $foo |
+ #     cut -d' ' -f1 |
+ #     ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
+ #   `.tsv ;
+ #   echo "moving $foo to $newname"
+ #   mv "$foo" "$newname"
+ # done
+ #
+ # # dir=`basename $PWD`
+ # # for foo in *.tsv ; do
+ # #   echo "Compressing $dir"
+ # #   bzip2 -c $foo > ../$dir-bz2/$foo.bz2
+ # # done
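
In outline (with an illustrative directory and keys), hdp-parts_to_keys.rb lists the target with hdp-ls, queues zero-length part files for deletion, reads the first line of each remaining file with hdp-cat, and renames the file after its first tab-separated key plus the .tsv extension:

    hdp-parts_to_keys.rb /user/flip/out/sorted-tweets-20081220
    # part-00010 is empty                  -> hdp-rm .../part-00010
    # part-00011 begins "atlanta<TAB>..."  -> hdp-mv .../part-00011 .../atlanta.tsv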
data/bin/hdp-ps ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -list all
data/bin/hdp-put ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -put "$@"
data/bin/hdp-rm ADDED
@@ -0,0 +1,32 @@
+ #!/usr/bin/env bash
+
+ #
+ # Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
+ # is true then we need to ignore directories that don't exist and still return 0.
+ #
+
+ #
+ # All the dirty conditional logic here does is test whether a directory exists. If so, remove it
+ #
+ if [ "$1" == "-r" ] ; then
+   shift
+   if [ "$1" == "-skipTrash" ] ; then
+     shift
+     hadoop fs -test -e "$@"
+     if [ "$?" == "0" ] ; then
+       # echo "File exists, skipping trash, removing it..."
+       echo hadoop dfs -rmr -skipTrash "$@"
+       exec hadoop dfs -rmr -skipTrash "$@"
+     fi
+   else
+     hadoop fs -test -e "$@"
+     if [ "$?" == "0" ] ; then
+       # echo "File exists, removing it..."
+       echo hadoop dfs -rmr "$@"
+       exec hadoop dfs -rmr "$@"
+     fi
+   fi
+ else
+   echo hadoop dfs -rm "$@"
+   exec hadoop dfs -rm "$@"
+ fi
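
For context, hypothetical invocations with illustrative paths. With -r the wrapper first runs hadoop fs -test -e, so removing a directory that does not exist still exits 0 (the behavior the comment above is after); without -r it passes straight through to hadoop dfs -rm:

    hdp-rm /user/flip/tmp/scratch.tsv           # hadoop dfs -rm ...
    hdp-rm -r /user/flip/old-output             # hadoop dfs -rmr ..., only if the path exists
    hdp-rm -r -skipTrash /user/flip/old-output  # hadoop dfs -rmr -skipTrash ..., bypassing the trash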
data/bin/hdp-sort ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/env bash
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ partfields=${1-2} ; shift
+ sortfields=${1-2} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
+   $@
+   -D num.key.fields.for.partition=\"$partfields\"
+   -D stream.num.map.output.key.fields=\"$sortfields\"
+   -D stream.map.output.field.separator=\"'/t'\"
+   -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+   -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
+   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
+   -mapper \"$map_script\"
+   -reducer \"$reduce_script\"
+   -input \"$input_file\"
+   -output \"$output_file\"
+   "
+
+ echo "$cmd"
+
+ $cmd
+
+ # For a map-side-only job specify
+ # -jobconf mapred.reduce.tasks=0 \
+
+ # Maybe?
+ #
+ # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
+ # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+ #
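
For context, a hypothetical hdp-sort invocation (paths, scripts, and field counts are illustrative). The six positional arguments are input, output, mapper, reducer, partition fields, and sort fields; anything after them is handed to the streaming jar, and the KeyFieldBasedPartitioner groups reducer input on the first partfields key fields while sorting on the first sortfields:

    hdp-sort /user/flip/tweets /user/flip/tweets-sorted \
        /bin/cat /usr/bin/uniq 1 2 \
        -jobconf mapred.reduce.tasks=23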
data/bin/hdp-stream ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/env bash
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ partfields=${1-2} ; shift
+ sortfields=${1-2} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
+   $@
+   -D num.key.fields.for.partition=\"$partfields\"
+   -D stream.num.map.output.key.fields=\"$sortfields\"
+   -D stream.map.output.field.separator=\"'/t'\"
+   -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+   -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
+   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
+   -mapper \"$map_script\"
+   -reducer \"$reduce_script\"
+   -input \"$input_file\"
+   -output \"$output_file\"
+   "
+
+ echo "$cmd"
+
+ $cmd
+
+ # For a map-side-only job specify
+ # -jobconf mapred.reduce.tasks=0 \
+
+ # Maybe?
+ #
+ # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
+ # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+ #
data/bin/hdp-stream-flat ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env bash
+
+ input_file="${1}" ; shift
+ output_file="${1}" ; shift
+ map_script="${1-/bin/cat}" ; shift
+ reduce_script="${1-/usr/bin/uniq}" ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ # Can add fun stuff like
+ # -Dmapred.reduce.tasks=0 \
+
+ exec ${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
+   "$@" \
+   -Dmapred.job.name=`basename $0`-$map_script-$input_file-$output_file \
+   -mapper "$map_script" \
+   -reducer "$reduce_script" \
+   -input "$input_file" \
+   -output "$output_file"
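
For context, hypothetical hdp-stream-flat invocations (the paths and mapper script are illustrative). Unlike hdp-sort and hdp-stream it sets no partitioner or key-field options, so it suits simple pass-through and map-only jobs; any arguments after the four positional ones are forwarded to the streaming jar, e.g. -Dmapred.reduce.tasks=0 as suggested in the script's comment:

    hdp-stream-flat /user/flip/sonnets /user/flip/tokens ./my_tokenizer.rb /usr/bin/uniq
    hdp-stream-flat /user/flip/sonnets /user/flip/tokens ./my_tokenizer.rb /bin/cat -Dmapred.reduce.tasks=0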
data/bin/hdp-stream2 ADDED
@@ -0,0 +1,39 @@
+ #!/usr/bin/env ruby
+ require 'wukong'
+
+ # Example usage:
+ #
+ # ~/ics/wukong/bin/hdp-stream2 input_path1,input_path2 output_path \
+ #   "`which cuttab` 2,3,7" "`which uniq` -c" 1 3 -jobconf mapred.reduce.tasks=23
+
+
+ # options = Settings[:runner_defaults].dup
+
+ # cmdline_opts = Hash.zip(
+ #   [ :input_file, :output_file,
+ #     :map_command, :reduce_command,
+ #     :partition_fields, :sort_fields],
+ #   ARGV.map{|s| s.blank? ? nil : s }
+ # )
+ # argvs = ARGV.slice!(0..5) ;
+ # ARGV.unshift cmdline_opts[:input_file];
+ # ARGV.unshift cmdline_opts[:output_file]
+ # p [argvs, ARGV]
+ #
+ # # cmdline_opts[:map_command] = `which cat`.chomp if cmdline_opts[:map_command].blank?
+ # # cmdline_opts[:reduce_command] = nil if cmdline_opts[:reduce_command].blank?
+ # cmdline_opts[:dry_run] = true
+ # cmdline_opts[:run] = true
+
+ #p cmdline_opts, Settings[:runner_defaults]
+
+ # Go script go!
+ runner = Wukong::Script.new(
+   nil, # use mapper_command
+   nil, # use reducer_command
+   :run => true
+ )
+ # runner.options.merge cmdline_opts
+ runner.options[:reuse_jvms] = true if runner.options[:reuse_jvms].blank?
+
+ runner.run