wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
data/bin/hdp-bzip ADDED
@@ -0,0 +1,23 @@
+ #!/bin/bash
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
+   -Dmapred.output.compress=true \
+   -Dmapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
+   -Dmapred.reduce.tasks=1 \
+   -mapper \"/bin/cat\" \
+   -reducer \"/bin/cat\" \
+   -input \"$input_file\" \
+   -output \"$output_file\" \
+   "
+ echo $cmd
+ $cmd
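
For context, a hypothetical invocation of hdp-bzip (the HDFS paths are illustrative). The script assumes a streaming jar under $HADOOP_HOME/contrib/streaming; it runs /bin/cat as both mapper and reducer with a single reduce task, so the only effect is that mapred.output.compress and the BZip2Codec rewrite the input as one bzip2-compressed part file:

    # recompress an HDFS directory into a single bzip2-compressed part file (illustrative paths)
    export HADOOP_HOME=/usr/lib/hadoop
    hdp-bzip /user/flip/raw-logs /user/flip/raw-logs-bz2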
data/bin/hdp-cat ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -cat "$@"
data/bin/hdp-catd ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+ args=`echo "$@" | ruby -ne 'a = $_.split(/\s+/); puts a.map{|arg| arg+"/[^_]*" }.join(" ")'`
+ exec hadoop dfs -cat $args
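
For context, a hypothetical invocation of hdp-catd (the path is illustrative). The embedded Ruby one-liner appends /[^_]* to each argument, so cat-ing a job output directory skips bookkeeping files such as _logs and _SUCCESS:

    hdp-catd /user/flip/wordcount-output
    # runs roughly: hadoop dfs -cat /user/flip/wordcount-output/[^_]*
    # which cats part-00000, part-00001, ... but not _logs or _SUCCESS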
data/bin/hdp-cp ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -cp "$@"
data/bin/hdp-du ADDED
@@ -0,0 +1,86 @@
+ #!/usr/bin/env ruby
+
+ OPTIONS={}
+
+ #
+ # grok options
+ #
+ if ARGV[0] =~ /\A-[sh]+\z/
+   flags = ARGV.shift
+   OPTIONS[:summary] = flags.include?('s')
+   OPTIONS[:humanize] = flags.include?('h')
+ end
+
+ #
+ # Prepare command
+ #
+ def prepare_command
+   dfs_cmd = OPTIONS[:summary] ? 'dus' : 'du'
+   dfs_args = ((!ARGV[0]) || ARGV[0]=='') ? '.' : "'#{ARGV.join("' '")}'"
+   %Q{ hadoop dfs -#{dfs_cmd} #{dfs_args} }
+ end
+
+ Numeric.class_eval do
+   def bytes() self ; end
+   alias :byte :bytes
+   def kilobytes() self * 1024 ; end
+   alias :kilobyte :kilobytes
+   def megabytes() self * 1024.kilobytes ; end
+   alias :megabyte :megabytes
+   def gigabytes() self * 1024.megabytes ; end
+   alias :gigabyte :gigabytes
+   def terabytes() self * 1024.gigabytes ; end
+   alias :terabyte :terabytes
+   def petabytes() self * 1024.terabytes ; end
+   alias :petabyte :petabytes
+   def exabytes() self * 1024.petabytes ; end
+   alias :exabyte :exabytes
+ end
+
+ # Formats the bytes in +size+ into a more understandable representation
+ # (e.g., giving it 1500 yields 1.5 KB). This method is useful for
+ # reporting file sizes to users. This method returns nil if
+ # +size+ cannot be converted into a number. You can change the default
+ # precision of 1 using the precision parameter +precision+.
+ #
+ # ==== Examples
+ #  number_to_human_size(123)           # => 123 Bytes
+ #  number_to_human_size(1234)          # => 1.2 KB
+ #  number_to_human_size(12345)         # => 12.1 KB
+ #  number_to_human_size(1234567)       # => 1.2 MB
+ #  number_to_human_size(1234567890)    # => 1.1 GB
+ #  number_to_human_size(1234567890123) # => 1.1 TB
+ #  number_to_human_size(1234567, 2)    # => 1.18 MB
+ #  number_to_human_size(483989, 0)     # => 4 MB
+ def number_to_human_size(size, precision=1)
+   size = Kernel.Float(size)
+   case
+   when size.to_i == 1; "1 Byte"
+   when size < 1.kilobyte; "%d Bytes" % size
+   when size < 1.megabyte; "%.#{precision}f KB" % (size / 1.0.kilobyte)
+   when size < 1.gigabyte; "%.#{precision}f MB" % (size / 1.0.megabyte)
+   when size < 1.terabyte; "%.#{precision}f GB" % (size / 1.0.gigabyte)
+   else "%.#{precision}f TB" % (size / 1.0.terabyte)
+   end #.sub(/([0-9]\.\d*?)0+ /, '\1 ' ).sub(/\. /,' ')
+ rescue
+   nil
+ end
+
+ OUTPUT_LINE_FMT = "%-71s\t%15d\t%15s"
+ def format_output file, size
+   human_size = number_to_human_size(size) || ""
+   file = file.gsub(%r{hdfs://[^/]+/}, '/') # kill off hdfs paths, otherwise leave it alone
+   OUTPUT_LINE_FMT % [file, size.to_i, human_size]
+ end
+
+ entries_count = 0
+ total_size = 0
+ %x{ #{prepare_command} }.split("\n").each do |line|
+   if line =~ /^Found \d+ items$/ then puts line ; next end
+   info = line.split(/\s+/)
+   if OPTIONS[:summary] then file, size = info else size, file = info end
+   puts format_output(file, size)
+   total_size += size.to_i
+   entries_count += 1
+ end
+ $stderr.puts OUTPUT_LINE_FMT%[" #{"%55d"%entries_count} entries", total_size, number_to_human_size(total_size)]
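
For context, a hypothetical hdp-du invocation with illustrative paths and sizes. A leading flag cluster such as -sh is recognized; s switches to dfs -dus so each argument is summarized as a single line. Each output line holds the path (with any hdfs://host prefix stripped), the raw byte count, and the human-readable size, and a final entries/total line goes to stderr, roughly:

    hdp-du -sh /user/flip/data /user/flip/logs
    # /user/flip/data        1234567890      1.1 GB
    # /user/flip/logs          12345678     11.8 MB
    # (stderr)   2 entries    1246913568      1.2 GB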
data/bin/hdp-get ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -copyToLocal "$1" "$2"
data/bin/hdp-kill ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -kill "$@"
data/bin/hdp-kill-task ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -kill-task "$1"
data/bin/hdp-ls ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/env bash
+
+ if [ "$1" == "-r" ] || [ "$1" == "-R" ] ; then
+   shift
+   action=lsr
+ else
+   action=ls
+ fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+ exec $HADOOP_HOME/bin/hadoop dfs -$action "$@"
data/bin/hdp-mkdir ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env bash
+ exec hadoop fs -mkdir "$@"
data/bin/hdp-mkdirp ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env bash
+
+ #
+ # Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
+ # use it, will fail if (it seems) ANY of its spawned subprocesses fails
+ #
+
+ hadoop fs -test -e "$@"
+ if [ "$?" != "0" ] ; then
+   # echo "File does not exist, making..."
+   exec hadoop fs -mkdir "$@"
+ fi
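
For context, the failure mode the comment above describes, with an illustrative path. A plain hadoop fs -mkdir exits non-zero when the directory already exists, which would abort a workflow runner like Azkaban; hdp-mkdirp only issues the mkdir when hadoop fs -test -e reports the path missing, so it exits 0 either way:

    hadoop fs -mkdir /user/flip/output   # fails (non-zero exit) if the directory already exists
    hdp-mkdirp /user/flip/output         # creates it only when absent; harmless when it exists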
data/bin/hdp-mv ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -mv "$@"
data/bin/hdp-parts_to_keys.rb ADDED
@@ -0,0 +1,77 @@
+ #!/usr/bin/env ruby
+
+ dir_to_rename = ARGV[0]
+ dest_ext = '.tsv'
+
+ unless dir_to_rename && (! dir_to_rename.empty?)
+   warn "Need a directory or file spec to rename."
+   exit
+ end
+
+ #
+ # Setup
+ #
+ warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
+
+ #
+ # Examine the files
+ #
+ file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
+ command_lists = { }
+ file_listings[1..-1].each do |file_listing|
+   m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
+   if !m then warn "Couldn't grok #{file_listing}" ; next ; end
+   size, filename = m.captures
+   case
+   when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
+   else
+     firstline = `hdp-cat #{filename} | head -qn1 `
+     file_key, _ = firstline.split("\t", 2)
+     unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
+       warn "Don't want to rename to '#{file_key}'... skipping"
+       next
+     end
+     dirname = File.dirname(filename)
+     destfile = File.join(dirname, file_key)+dest_ext
+     (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
+   end
+ end
+
+ #
+ # Execute the command_lists
+ #
+ command_lists.each do |type, command_list|
+   case type
+   when :deletes
+     command = "hdp-rm #{command_list.join(" ")}"
+     puts command
+     `#{command}`
+   when :moves
+     command_list.each do |command|
+       puts command
+       `#{command}`
+     end
+   end
+ end
+
+
+ # -rw-r--r-- 3 flip supergroup 0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
+
+ # # Killing empty files
+ # find . -size 0 -print -exec rm {} \;
+ #
+ # for foo in part-0* ; do
+ #   newname=`
+ #     head -n1 $foo |
+ #     cut -d' ' -f1 |
+ #     ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
+ #   `.tsv ;
+ #   echo "moving $foo to $newname"
+ #   mv "$foo" "$newname"
+ # done
+ #
+ # # dir=`basename $PWD`
+ # # for foo in *.tsv ; do
+ # #   echo "Compressing $dir"
+ # #   bzip2 -c $foo > ../$dir-bz2/$foo.bz2
+ # # done
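
In outline (with an illustrative directory and keys), hdp-parts_to_keys.rb lists the target with hdp-ls, queues zero-length part files for deletion, reads the first line of each remaining file with hdp-cat, and renames the file after its first tab-separated key plus the .tsv extension:

    hdp-parts_to_keys.rb /user/flip/out/sorted-tweets-20081220
    # part-00010 is empty                  -> hdp-rm .../part-00010
    # part-00011 begins "atlanta<TAB>..."  -> hdp-mv .../part-00011 .../atlanta.tsv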
data/bin/hdp-ps ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -list all
data/bin/hdp-put ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -put "$@"
data/bin/hdp-rm ADDED
@@ -0,0 +1,32 @@
+ #!/usr/bin/env bash
+
+ #
+ # Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
+ # is true then we need to ignore directories that don't exist and still return 0.
+ #
+
+ #
+ # All the dirty conditional logic here does is test whether a directory exists. If so, remove it
+ #
+ if [ "$1" == "-r" ] ; then
+   shift
+   if [ "$1" == "-skipTrash" ] ; then
+     shift
+     hadoop fs -test -e "$@"
+     if [ "$?" == "0" ] ; then
+       # echo "File exists, skipping trash, removing it..."
+       echo hadoop dfs -rmr -skipTrash "$@"
+       exec hadoop dfs -rmr -skipTrash "$@"
+     fi
+   else
+     hadoop fs -test -e "$@"
+     if [ "$?" == "0" ] ; then
+       # echo "File exists, removing it..."
+       echo hadoop dfs -rmr "$@"
+       exec hadoop dfs -rmr "$@"
+     fi
+   fi
+ else
+   echo hadoop dfs -rm "$@"
+   exec hadoop dfs -rm "$@"
+ fi
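
For context, hypothetical invocations with illustrative paths. With -r the wrapper first runs hadoop fs -test -e, so removing a directory that does not exist still exits 0 (the behavior the comment above is after); without -r it passes straight through to hadoop dfs -rm:

    hdp-rm /user/flip/tmp/scratch.tsv           # hadoop dfs -rm ...
    hdp-rm -r /user/flip/old-output             # hadoop dfs -rmr ..., only if the path exists
    hdp-rm -r -skipTrash /user/flip/old-output  # hadoop dfs -rmr -skipTrash ..., bypassing the trash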
data/bin/hdp-sort ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/env bash
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ partfields=${1-2} ; shift
+ sortfields=${1-2} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
+   $@
+   -D num.key.fields.for.partition=\"$partfields\"
+   -D stream.num.map.output.key.fields=\"$sortfields\"
+   -D stream.map.output.field.separator=\"'/t'\"
+   -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+   -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
+   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
+   -mapper \"$map_script\"
+   -reducer \"$reduce_script\"
+   -input \"$input_file\"
+   -output \"$output_file\"
+   "
+
+ echo "$cmd"
+
+ $cmd
+
+ # For a map-side-only job specify
+ # -jobconf mapred.reduce.tasks=0 \
+
+ # Maybe?
+ #
+ # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
+ # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+ #
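
For context, a hypothetical hdp-sort invocation (paths, scripts, and field counts are illustrative). The six positional arguments are input, output, mapper, reducer, partition fields, and sort fields; anything after them is handed to the streaming jar, and the KeyFieldBasedPartitioner groups reducer input on the first partfields key fields while sorting on the first sortfields:

    hdp-sort /user/flip/tweets /user/flip/tweets-sorted \
        /bin/cat /usr/bin/uniq 1 2 \
        -jobconf mapred.reduce.tasks=23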
data/bin/hdp-stream ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/env bash
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ partfields=${1-2} ; shift
+ sortfields=${1-2} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
+   $@
+   -D num.key.fields.for.partition=\"$partfields\"
+   -D stream.num.map.output.key.fields=\"$sortfields\"
+   -D stream.map.output.field.separator=\"'/t'\"
+   -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+   -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
+   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
+   -mapper \"$map_script\"
+   -reducer \"$reduce_script\"
+   -input \"$input_file\"
+   -output \"$output_file\"
+   "
+
+ echo "$cmd"
+
+ $cmd
+
+ # For a map-side-only job specify
+ # -jobconf mapred.reduce.tasks=0 \
+
+ # Maybe?
+ #
+ # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
+ # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
+ #
data/bin/hdp-stream-flat ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env bash
+
+ input_file="${1}" ; shift
+ output_file="${1}" ; shift
+ map_script="${1-/bin/cat}" ; shift
+ reduce_script="${1-/usr/bin/uniq}" ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ # Can add fun stuff like
+ # -Dmapred.reduce.tasks=0 \
+
+ exec ${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
+   "$@" \
+   -Dmapred.job.name=`basename $0`-$map_script-$input_file-$output_file \
+   -mapper "$map_script" \
+   -reducer "$reduce_script" \
+   -input "$input_file" \
+   -output "$output_file"
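
For context, hypothetical hdp-stream-flat invocations (the paths and mapper script are illustrative). Unlike hdp-sort and hdp-stream it sets no partitioner or key-field options, so it suits simple pass-through and map-only jobs; any arguments after the four positional ones are forwarded to the streaming jar, e.g. -Dmapred.reduce.tasks=0 as suggested in the script's comment:

    hdp-stream-flat /user/flip/sonnets /user/flip/tokens ./my_tokenizer.rb /usr/bin/uniq
    hdp-stream-flat /user/flip/sonnets /user/flip/tokens ./my_tokenizer.rb /bin/cat -Dmapred.reduce.tasks=0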
data/bin/hdp-stream2 ADDED
@@ -0,0 +1,39 @@
+ #!/usr/bin/env ruby
+ require 'wukong'
+
+ # Example usage:
+ #
+ # ~/ics/wukong/bin/hdp-stream2 input_path1,input_path2 output_path \
+ #   "`which cuttab` 2,3,7" "`which uniq` -c" 1 3 -jobconf mapred.reduce.tasks=23
+
+
+ # options = Settings[:runner_defaults].dup
+
+ # cmdline_opts = Hash.zip(
+ #   [ :input_file, :output_file,
+ #     :map_command, :reduce_command,
+ #     :partition_fields, :sort_fields],
+ #   ARGV.map{|s| s.blank? ? nil : s }
+ # )
+ # argvs = ARGV.slice!(0..5) ;
+ # ARGV.unshift cmdline_opts[:input_file];
+ # ARGV.unshift cmdline_opts[:output_file]
+ # p [argvs, ARGV]
+ #
+ # # cmdline_opts[:map_command] = `which cat`.chomp if cmdline_opts[:map_command].blank?
+ # # cmdline_opts[:reduce_command] = nil if cmdline_opts[:reduce_command].blank?
+ # cmdline_opts[:dry_run] = true
+ # cmdline_opts[:run] = true
+
+ #p cmdline_opts, Settings[:runner_defaults]
+
+ # Go script go!
+ runner = Wukong::Script.new(
+   nil, # use mapper_command
+   nil, # use reducer_command
+   :run => true
+ )
+ # runner.options.merge cmdline_opts
+ runner.options[:reuse_jvms] = true if runner.options[:reuse_jvms].blank?
+
+ runner.run