wukong-hadoop 0.0.1

Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
data/bin/hdp-bzip ADDED
@@ -0,0 +1,23 @@
+ #!/bin/bash
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
+   -Dmapred.output.compress=true \
+   -Dmapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
+   -Dmapred.reduce.tasks=1 \
+   -mapper  \"/bin/cat\" \
+   -reducer \"/bin/cat\" \
+   -input   \"$input_file\" \
+   -output  \"$output_file\" \
+   "
+ echo $cmd
+ $cmd
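
A usage sketch for hdp-bzip (paths below are hypothetical): it streams the input through identity map and reduce commands so the single reducer rewrites the data as one BZip2-compressed part file.

  # recompress an HDFS path into a single bzip2 part file (example paths)
  hdp-bzip /data/raw/tweets /data/compressed/tweets-bz2
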
data/bin/hdp-cat ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -cat "$@"
data/bin/hdp-catd ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+ args=`echo "$@" | ruby -ne 'a = $_.split(/\s+/); puts a.map{|arg| arg+"/[^_]*" }.join(" ")'`
+ exec hadoop dfs -cat $args
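
hdp-catd differs from hdp-cat by expanding each argument to `arg/[^_]*`, so it cats the part files under a job output directory while skipping bookkeeping entries such as _logs. A hypothetical call:

  # concatenate all part files under an output directory, ignoring _logs and _SUCCESS
  hdp-catd /user/flip/out/word_count | head
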
data/bin/hdp-cp ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -cp "$@"
data/bin/hdp-du ADDED
@@ -0,0 +1,86 @@
+ #!/usr/bin/env ruby
+
+ OPTIONS={}
+
+ #
+ # grok options
+ #
+ if ARGV[0] =~ /\A-[sh]+\z/
+   flags = ARGV.shift
+   OPTIONS[:summary]  = flags.include?('s')
+   OPTIONS[:humanize] = flags.include?('h')
+ end
+
+ #
+ # Prepare command
+ #
+ def prepare_command
+   dfs_cmd  = OPTIONS[:summary] ? 'dus' : 'du'
+   dfs_args = ((!ARGV[0]) || ARGV[0]=='') ? '.' : "'#{ARGV.join("' '")}'"
+   %Q{ hadoop dfs -#{dfs_cmd} #{dfs_args} }
+ end
+
+ Numeric.class_eval do
+   def bytes() self ; end
+   alias :byte :bytes
+   def kilobytes() self * 1024 ; end
+   alias :kilobyte :kilobytes
+   def megabytes() self * 1024.kilobytes ; end
+   alias :megabyte :megabytes
+   def gigabytes() self * 1024.megabytes ; end
+   alias :gigabyte :gigabytes
+   def terabytes() self * 1024.gigabytes ; end
+   alias :terabyte :terabytes
+   def petabytes() self * 1024.terabytes ; end
+   alias :petabyte :petabytes
+   def exabytes() self * 1024.petabytes ; end
+   alias :exabyte :exabytes
+ end
+
+ # Formats the bytes in +size+ into a more understandable representation
+ # (e.g., giving it 1500 yields 1.5 KB). This method is useful for
+ # reporting file sizes to users. This method returns nil if
+ # +size+ cannot be converted into a number. You can change the default
+ # precision of 1 using the precision parameter +precision+.
+ #
+ # ==== Examples
+ #   number_to_human_size(123)            # => 123 Bytes
+ #   number_to_human_size(1234)           # => 1.2 KB
+ #   number_to_human_size(12345)          # => 12.1 KB
+ #   number_to_human_size(1234567)        # => 1.2 MB
+ #   number_to_human_size(1234567890)     # => 1.1 GB
+ #   number_to_human_size(1234567890123)  # => 1.1 TB
+ #   number_to_human_size(1234567, 2)     # => 1.18 MB
+ #   number_to_human_size(483989, 0)      # => 4 MB
+ def number_to_human_size(size, precision=1)
+   size = Kernel.Float(size)
+   case
+   when size.to_i == 1;    "1 Byte"
+   when size < 1.kilobyte; "%d Bytes" % size
+   when size < 1.megabyte; "%.#{precision}f KB" % (size / 1.0.kilobyte)
+   when size < 1.gigabyte; "%.#{precision}f MB" % (size / 1.0.megabyte)
+   when size < 1.terabyte; "%.#{precision}f GB" % (size / 1.0.gigabyte)
+   else                    "%.#{precision}f TB" % (size / 1.0.terabyte)
+   end #.sub(/([0-9]\.\d*?)0+ /, '\1 ' ).sub(/\. /,' ')
+ rescue
+   nil
+ end
+
+ OUTPUT_LINE_FMT = "%-71s\t%15d\t%15s"
+ def format_output file, size
+   human_size = number_to_human_size(size) || ""
+   file = file.gsub(%r{hdfs://[^/]+/}, '/') # kill off hdfs paths, otherwise leave it alone
+   OUTPUT_LINE_FMT % [file, size.to_i, human_size]
+ end
+
+ entries_count = 0
+ total_size    = 0
+ %x{ #{prepare_command} }.split("\n").each do |line|
+   if line =~ /^Found \d+ items$/ then puts line ; next end
+   info = line.split(/\s+/)
+   if OPTIONS[:summary] then file, size = info else size, file = info end
+   puts format_output(file, size)
+   total_size    += size.to_i
+   entries_count += 1
+ end
+ $stderr.puts OUTPUT_LINE_FMT%[" #{"%55d"%entries_count} entries", total_size, number_to_human_size(total_size)]
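
A sketch of typical hdp-du calls (paths are examples only); -s switches to the summary listing via `hadoop dfs -dus`, and each output line shows path, size in bytes, and the humanized size:

  hdp-du /user/flip/data       # one line per entry: path, bytes, humanized size
  hdp-du -s /user/flip/data    # summary totals via 'hadoop dfs -dus'
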
data/bin/hdp-get ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -copyToLocal "$1" "$2"
data/bin/hdp-kill ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -kill "$@"
data/bin/hdp-kill-task ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -kill-task "$1"
data/bin/hdp-ls ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/env bash
+
+ if [ "$1" == "-r" ] || [ "$1" == "-R" ] ; then
+   shift
+   action=lsr
+ else
+   action=ls
+ fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+ exec $HADOOP_HOME/bin/hadoop dfs -$action "$@"
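
Hypothetical hdp-ls calls; -r or -R swaps the plain `dfs -ls` for the recursive `dfs -lsr`:

  hdp-ls /user/flip
  hdp-ls -R /user/flip/out
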
data/bin/hdp-mkdir ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env bash
+ exec hadoop fs -mkdir "$@"
data/bin/hdp-mkdirp ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env bash
+
+ #
+ # Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
+ # use it, will fail if (it seems) ANY of its spawned subprocesses fails
+ #
+
+ hadoop fs -test -e "$@"
+ if [ "$?" != "0" ] ; then
+   # echo "File does not exist, making..."
+   exec hadoop fs -mkdir "$@"
+ fi
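
The existence test makes this a rough `mkdir -p` for HDFS: rerunning a workflow step won't fail on a directory that already exists. The path below is illustrative:

  # idempotent: creates the directory only if 'hadoop fs -test -e' reports it missing
  hdp-mkdirp /user/flip/out/word_count
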
data/bin/hdp-mv ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -mv "$@"
data/bin/hdp-parts_to_keys.rb ADDED
@@ -0,0 +1,77 @@
+ #!/usr/bin/env ruby
+
+ dir_to_rename = ARGV[0]
+ dest_ext      = '.tsv'
+
+ unless dir_to_rename && (! dir_to_rename.empty?)
+   warn "Need a directory or file spec to rename."
+   exit
+ end
+
+ #
+ # Setup
+ #
+ warn "\nPlease IGNORE the 'cat: Unable to write to output stream.' errors\n"
+
+ #
+ # Examine the files
+ #
+ file_listings = `hdp-ls #{dir_to_rename}`.split("\n")
+ command_lists = { }
+ file_listings[1..-1].each do |file_listing|
+   m = %r{[-drwx]+\s+[\-\d]+\s+\w+\s+\w+\s+(\d+)\s+[\d\-]+\s+[\d\:]+\s+(.+)$}.match(file_listing)
+   if !m then warn "Couldn't grok #{file_listing}" ; next ; end
+   size, filename = m.captures
+   case
+   when size.to_i == 0 then (command_lists[:deletes]||=[]) << filename
+   else
+     firstline = `hdp-cat #{filename} | head -qn1 `
+     file_key, _ = firstline.split("\t", 2)
+     unless file_key && (file_key =~ /\A[\w\-\.]+\z/)
+       warn "Don't want to rename to '#{file_key}'... skipping"
+       next
+     end
+     dirname  = File.dirname(filename)
+     destfile = File.join(dirname, file_key)+dest_ext
+     (command_lists[:moves]||=[]) << "hdp-mv #{filename} #{destfile}"
+   end
+ end
+
+ #
+ # Execute the command_lists
+ #
+ command_lists.each do |type, command_list|
+   case type
+   when :deletes
+     command = "hdp-rm #{command_list.join(" ")}"
+     puts command
+     `#{command}`
+   when :moves
+     command_list.each do |command|
+       puts command
+       `#{command}`
+     end
+   end
+ end
+
+
+ # -rw-r--r--   3 flip supergroup          0 2008-12-20 05:51 /user/flip/out/sorted-tweets-20081220/part-00010
+
+ # # Killing empty files
+ # find . -size 0 -print -exec rm {} \;
+ #
+ # for foo in part-0* ; do
+ #   newname=`
+ #     head -n1 $foo |
+ #     cut -d' ' -f1 |
+ #     ruby -ne 'puts $_.chomp.gsub(/[^\-\w]/){|s| s.bytes.map{|c| "%%%02X" % c }}'
+ #   `.tsv ;
+ #   echo "moving $foo to $newname"
+ #   mv "$foo" "$newname"
+ # done
+ #
+ # # dir=`basename $PWD`
+ # # for foo in *.tsv ; do
+ # #   echo "Compressing $dir"
+ # #   bzip2 -c $foo > ../$dir-bz2/$foo.bz2
+ # # done
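
A hedged sketch of how this script is driven (the directory name is an example): it lists the part files in a job's output directory, queues empty ones for hdp-rm, and renames each remaining part via hdp-mv to the key in its first tab-separated field plus a .tsv extension.

  # rename part-00000, part-00001, ... to <first-key>.tsv within the output directory
  hdp-parts_to_keys.rb /user/flip/out/sorted-tweets-20081220
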
data/bin/hdp-ps ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop job -list all
data/bin/hdp-put ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop dfs -put "$@"
data/bin/hdp-rm ADDED
@@ -0,0 +1,32 @@
+ #!/usr/bin/env bash
+
+ #
+ # Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
+ # is true then we need to ignore directories that don't exist and still return 0.
+ #
+
+ #
+ # All the dirty conditional logic here does is test whether a directory exists. If so, remove it
+ #
+ if [ "$1" == "-r" ] ; then
+   shift
+   if [ "$1" == "-skipTrash" ] ; then
+     shift
+     hadoop fs -test -e "$@"
+     if [ "$?" == "0" ] ; then
+       # echo "File exists, skipping trash, removing it..."
+       echo hadoop dfs -rmr -skipTrash "$@"
+       exec hadoop dfs -rmr -skipTrash "$@"
+     fi
+   else
+     hadoop fs -test -e "$@"
+     if [ "$?" == "0" ] ; then
+       # echo "File exists, removing it..."
+       echo hadoop dfs -rmr "$@"
+       exec hadoop dfs -rmr "$@"
+     fi
+   fi
+ else
+   echo hadoop dfs -rm "$@"
+   exec hadoop dfs -rm "$@"
+ fi
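
Illustrative hdp-rm calls (paths hypothetical); the existence check keeps a missing directory from producing a non-zero exit, which matters to workflow runners such as Azkaban:

  hdp-rm old_data.tsv                        # plain 'hadoop dfs -rm'
  hdp-rm -r /user/flip/out/tmp               # recursive remove, only if the path exists
  hdp-rm -r -skipTrash /user/flip/out/tmp    # recursive remove, bypassing the trash
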
data/bin/hdp-sort ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/env bash
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ partfields=${1-2} ; shift
+ sortfields=${1-2} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
+   $@
+   -D num.key.fields.for.partition=\"$partfields\"
+   -D stream.num.map.output.key.fields=\"$sortfields\"
+   -D stream.map.output.field.separator=\"'\t'\"
+   -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+   -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
+   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
+   -mapper  \"$map_script\"
+   -reducer \"$reduce_script\"
+   -input   \"$input_file\"
+   -output  \"$output_file\"
+   "
+
+ echo "$cmd"
+
+ $cmd
+
+ # For a map-side-only job specify
+ #   -jobconf mapred.reduce.tasks=0 \
+
+ # Maybe?
+ #
+ #   -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
+ #   -mapper      org.apache.hadoop.mapred.lib.IdentityMapper \
+ #
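
A hypothetical hdp-sort run, assuming the streaming jar lives under $HADOOP_HOME/contrib/streaming: partition on the first field, treat the first two fields as the sort key, and supply the default identity mapper and uniq reducer explicitly:

  hdp-sort /data/tokens /data/tokens_sorted /bin/cat /usr/bin/uniq 1 2
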
data/bin/hdp-stream ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/env bash
+
+ input_file=${1} ; shift
+ output_file=${1} ; shift
+ map_script=${1-/bin/cat} ; shift
+ reduce_script=${1-/usr/bin/uniq} ; shift
+ partfields=${1-2} ; shift
+ sortfields=${1-2} ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ cmd="${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
+   $@
+   -D num.key.fields.for.partition=\"$partfields\"
+   -D stream.num.map.output.key.fields=\"$sortfields\"
+   -D stream.map.output.field.separator=\"'\t'\"
+   -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
+   -D mapred.job.name=\"`basename $0`-$map_script-$input_file-$output_file\"
+   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
+   -mapper  \"$map_script\"
+   -reducer \"$reduce_script\"
+   -input   \"$input_file\"
+   -output  \"$output_file\"
+   "
+
+ echo "$cmd"
+
+ $cmd
+
+ # For a map-side-only job specify
+ #   -jobconf mapred.reduce.tasks=0 \
+
+ # Maybe?
+ #
+ #   -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
+ #   -mapper      org.apache.hadoop.mapred.lib.IdentityMapper \
+ #
data/bin/hdp-stream-flat ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env bash
+
+ input_file="${1}" ; shift
+ output_file="${1}" ; shift
+ map_script="${1-/bin/cat}" ; shift
+ reduce_script="${1-/usr/bin/uniq}" ; shift
+
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
+
+ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
+
+ # Can add fun stuff like
+ #   -Dmapred.reduce.tasks=0 \
+
+ exec ${HADOOP_HOME}/bin/hadoop \
+   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
+   "$@" \
+   -Dmapred.job.name=`basename $0`-$map_script-$input_file-$output_file \
+   -mapper  "$map_script" \
+   -reducer "$reduce_script" \
+   -input   "$input_file" \
+   -output  "$output_file"
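
hdp-stream-flat is the minimal variant: no partitioner or key-field settings, just mapper, reducer, input, and output, with any extra streaming flags passed through after them. A hedged example (paths illustrative):

  # count identical lines; the trailing -D flag is an optional pass-through
  hdp-stream-flat /data/tokens /data/token_counts /bin/cat "/usr/bin/uniq -c" -Dmapred.reduce.tasks=1
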
data/bin/hdp-stream2 ADDED
@@ -0,0 +1,39 @@
+ #!/usr/bin/env ruby
+ require 'wukong'
+
+ # Example usage:
+ #
+ # ~/ics/wukong/bin/hdp-stream2 input_path1,input_path2 output_path \
+ #   "`which cuttab` 2,3,7" "`which uniq` -c" 1 3 -jobconf mapred.reduce.tasks=23
+
+
+ # options = Settings[:runner_defaults].dup
+
+ # cmdline_opts = Hash.zip(
+ #   [ :input_file, :output_file,
+ #     :map_command, :reduce_command,
+ #     :partition_fields, :sort_fields],
+ #   ARGV.map{|s| s.blank? ? nil : s }
+ # )
+ # argvs = ARGV.slice!(0..5) ;
+ # ARGV.unshift cmdline_opts[:input_file];
+ # ARGV.unshift cmdline_opts[:output_file]
+ # p [argvs, ARGV]
+ #
+ # # cmdline_opts[:map_command]    = `which cat`.chomp if cmdline_opts[:map_command].blank?
+ # # cmdline_opts[:reduce_command] = nil               if cmdline_opts[:reduce_command].blank?
+ # cmdline_opts[:dry_run] = true
+ # cmdline_opts[:run]     = true
+
+ #p cmdline_opts, Settings[:runner_defaults]
+
+ # Go script go!
+ runner = Wukong::Script.new(
+   nil, # use mapper_command
+   nil, # use reducer_command
+   :run => true
+ )
+ # runner.options.merge cmdline_opts
+ runner.options[:reuse_jvms] = true if runner.options[:reuse_jvms].blank?
+
+ runner.run
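
As committed, the live code simply instantiates Wukong::Script and lets it pick up the mapper and reducer commands from the command line; the header comment documents the intended call shape. Restating that example (field numbers and reducer count are illustrative):

  hdp-stream2 input_path1,input_path2 output_path "`which cuttab` 2,3,7" "`which uniq` -c" 1 3 -jobconf mapred.reduce.tasks=23
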