wukong 1.4.7 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/CHANGELOG.textile +9 -0
  2. data/README.textile +1 -1
  3. data/bin/hdp-bzip +28 -0
  4. data/bin/hdp-mkdir +1 -1
  5. data/bin/hdp-stream-flat +3 -2
  6. data/bin/wu-lign +32 -18
  7. data/docpages/pig/cookbook.html +481 -0
  8. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  9. data/docpages/pig/images/instruction_arrow.png +0 -0
  10. data/docpages/pig/images/pig-logo.gif +0 -0
  11. data/docpages/pig/piglatin_ref1.html +1103 -0
  12. data/docpages/pig/piglatin_ref2.html +14340 -0
  13. data/docpages/pig/setup.html +505 -0
  14. data/docpages/pig/skin/basic.css +166 -0
  15. data/docpages/pig/skin/breadcrumbs.js +237 -0
  16. data/docpages/pig/skin/fontsize.js +166 -0
  17. data/docpages/pig/skin/getBlank.js +40 -0
  18. data/docpages/pig/skin/getMenu.js +45 -0
  19. data/docpages/pig/skin/images/chapter.gif +0 -0
  20. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  21. data/docpages/pig/skin/images/current.gif +0 -0
  22. data/docpages/pig/skin/images/external-link.gif +0 -0
  23. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  24. data/docpages/pig/skin/images/page.gif +0 -0
  25. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  26. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  27. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  28. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  29. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  30. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  31. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  32. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  33. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  34. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  35. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  36. data/docpages/pig/skin/print.css +54 -0
  37. data/docpages/pig/skin/profile.css +181 -0
  38. data/docpages/pig/skin/screen.css +587 -0
  39. data/docpages/pig/tutorial.html +1059 -0
  40. data/docpages/pig/udf.html +1509 -0
  41. data/examples/keystore/conditional_outputter_example.rb +70 -0
  42. data/examples/{graph → network_graph}/adjacency_list.rb +0 -0
  43. data/examples/{graph → network_graph}/breadth_first_search.rb +0 -0
  44. data/examples/{graph → network_graph}/gen_2paths.rb +0 -0
  45. data/examples/{graph → network_graph}/gen_multi_edge.rb +0 -0
  46. data/examples/{graph → network_graph}/gen_symmetric_links.rb +0 -0
  47. data/examples/pagerank/run_pagerank.sh +10 -8
  48. data/examples/{apache_log_parser.rb → server_logs/apache_log_parser.rb} +0 -0
  49. data/examples/stupidly_simple_filter.rb +43 -0
  50. data/lib/wukong/extensions/hash.rb +13 -0
  51. data/lib/wukong/extensions/hash_like.rb +7 -0
  52. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +122 -0
  53. data/lib/wukong/script.rb +27 -22
  54. data/lib/wukong/script/hadoop_command.rb +5 -3
  55. data/lib/wukong/streamer/accumulating_reducer.rb +2 -1
  56. data/wukong.gemspec +64 -26
  57. metadata +89 -31
  58. data/docpages/pig/PigLatinReferenceManual.html +0 -19134
  59. data/examples/foo.rb +0 -9
  60. data/examples/package-local.rb +0 -100
  61. data/examples/package.rb +0 -96
  62. data/examples/run_all.sh +0 -47
data/examples/foo.rb DELETED
@@ -1,9 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.expand_path("~/ics/backend/configliere/lib")
3
-
4
- require "wukong"
5
-
6
- p Wukong::Script.new(nil,nil).options
7
- p Wukong::Script.new(nil,nil).non_wukong_params
8
-
9
- Wukong::Script.new(nil,nil).run
data/examples/package-local.rb DELETED
@@ -1,100 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
-
4
- require 'wukong'
5
-
6
- #
7
- # This is so very very kludgey
8
- #
9
- # Input is an 'ls' file, listing files to .bz2 package.
10
- #
11
- # Mapper takes each in turn and creates, within a parallel directory tree under
12
- # ~/pkgd on the HDFS, a .bz2 compressed version of the file.
13
- #
14
- # So, the file
15
- # /user/me/fixd/all-20090103
16
- # is packaged onto the DFS as
17
- # /user/me/pkgd/user/me/fixd/all-20090103
18
- #
19
- # listing=tmp/fixd-all-package-listing
20
- # hdp-rm $listing
21
- # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
22
- #
23
- # ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
24
- #
25
- module ExportPackager
26
- PKGD_DIR = '/workspace/flip/pkgd'
27
-
28
- #
29
- #
30
- class Reducer < Wukong::Streamer::Base
31
- def announce *args
32
- $stdout.puts *args
33
- $stderr.puts *args
34
- end
35
-
36
- def handle_existing_target output_filename
37
- return true unless File.exist?(output_filename)
38
- # announce "Exists! #{output_filename}"
39
- # return false
40
- announce "Removing target file #{output_filename}"
41
- begin announce `rm #{output_filename}`
42
- rescue Exception => e ; announce e ; end
43
- true
44
- end
45
-
46
- def mkdir_target_safely output_filename
47
- output_dir = File.dirname(output_filename)
48
- announce "Ensuring directory #{output_dir} exists"
49
- begin announce `mkdir -p #{output_dir}`
50
- rescue Exception => e ; announce e ; end
51
- end
52
-
53
- def bzip_into_pkgd_file input_filename, output_filename
54
- announce "bzip'ing into #{output_filename}"
55
- announce `( hadoop dfs -cat #{input_filename}/[^_]\** ) | bzip2 -c > #{output_filename}`
56
- end
57
-
58
- def gen_output_filename input_filename
59
- input_filename += '.tsv' unless input_filename =~ /.*\.\w{2,}/
60
- "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(/^\//, '')]
61
- end
62
-
63
- def rsync host, local_path, remote_path=nil
64
- remote_path ||= local_path
65
- announce `/usr/bin/rsync -Cuvrtlp #{local_path} #{host}:#{remote_path}`
66
- sleep 5
67
- end
68
-
69
- def process input_filename
70
- output_filename = gen_output_filename(input_filename)
71
- handle_existing_target(output_filename) or return
72
- mkdir_target_safely output_filename
73
- bzip_into_pkgd_file input_filename, output_filename
74
- rsync :lab3, output_filename
75
- #
76
- end
77
-
78
- def recordize line
79
- # handle ls or straight file list, either
80
- line.split(/\s/).last
81
- end
82
-
83
- def stream
84
- super
85
- rsync :lab3, PKGD_DIR+'/'
86
- end
87
- end
88
-
89
- class Script < Wukong::Script
90
- def default_options
91
- super.merge :map_tasks => 1,
92
- :max_node_reduce_tasks => 1, # only one reducer per local filesystem
93
- :timeout => 40 * 60 * 1000 # timeout in ms
94
- end
95
- end
96
- # Execute the script
97
- Script.new(nil, Reducer).run
98
- end
99
-
100
-
data/examples/package.rb DELETED
@@ -1,96 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
3
- require 'wukong'
4
-
5
- #
6
- # This is so very very kludgey
7
- #
8
- # Input is an 'ls' file, listing files to .bz2 package.
9
- #
10
- # Reducer takes each in turn and creates, within a parallel directory tree under
11
- # ~/pkgd on the HDFS, a .bz2 compressed version of the file.
12
- #
13
- # So, the file
14
- # /user/me/fixd/all-20090103
15
- # is packaged onto the DFS as
16
- # /user/me/pkgd/user/me/fixd/all-20090103
17
- #
18
- # listing=tmp/fixd-all-package-listing
19
- # hdp-rm $listing
20
- # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
21
- #
22
- # ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
23
- #
24
- module ExportPackager
25
- PKGD_DIR = 'pkgd'
26
-
27
- #
28
- #
29
- class Reducer < Wukong::Streamer::Base
30
- def announce str
31
- return if str.blank?
32
- $stderr.puts str
33
- $stdout.puts str
34
- end
35
-
36
- def remove_target_filename output_filename
37
- begin announce "rm\t#{"%-70s"%output_filename}\t" +
38
- `( hadoop dfs -rmr #{output_filename} ) 2>&1`
39
- rescue ; nil ; end
40
- end
41
-
42
- def mkdir_target_safely output_filename
43
- output_dir = File.dirname(output_filename)
44
- begin announce "mkdir\t#{"%-70s"%output_dir}\t" +
45
- `( hadoop dfs -mkdir #{output_dir} ) 2>&1`
46
- rescue ; nil ; end
47
- end
48
-
49
- def bzip_into_pkgd_file input_filename, output_filename
50
- announce "cat|bz\t#{"%-70s"%input_filename}\t" +
51
- `( hadoop dfs -cat #{input_filename}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{output_filename} ) 2>&1`
52
- end
53
-
54
- def verify input_filename, output_filename
55
- announce "sha1sum\t#{"%-70s"%output_filename}\t" +
56
- `( hadoop dfs -cat #{output_filename} | bzcat - | sha1sum ) 2>&1`
57
- announce "sha1sum\t#{"%-70s"%input_filename}\t" +
58
- `( hadoop dfs -cat #{input_filename}/[^_]\\* | sha1sum ) 2>&1`
59
- end
60
-
61
- def gen_output_filename input_filename
62
- "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(%r{^/},"")]
63
- end
64
-
65
- def process input_filename, output_filename
66
- # remove_target_filename output_filename
67
- # mkdir_target_safely output_filename
68
- bzip_into_pkgd_file input_filename, output_filename
69
- verify input_filename, output_filename
70
- end
71
-
72
- def stream
73
- announce `hostname`
74
- $stdin.each do |input_filename|
75
- # handle ls or straight file list, either
76
- input_filename = input_filename.chomp.strip.split(/\s/).last
77
- output_filename = gen_output_filename input_filename
78
- announce "********************************************************"
79
- announce "Packing\t#{"%-70s"%input_filename}\t#{output_filename}"
80
- process input_filename, output_filename
81
- announce "Done\t#{"%-70s"%input_filename}\t#{output_filename}\n\n"
82
- end
83
- end
84
- end
85
-
86
- class Script < Wukong::Script
87
- def default_options
88
- super.merge :timeout => (24 * 60 * 60 * 1000) # milliseconds in one day
89
- end
90
- end
91
- end
92
-
93
- #
94
- # Execute the script
95
- #
96
- ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000).run
data/examples/run_all.sh DELETED
@@ -1,47 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- src_path="tmp/README.textile"
4
- out_root="tmp/test"
5
- hdp_opts="--map_tasks=1 --reduce_tasks=1"
6
-
7
- # ---------------------------------------------------------------------------
8
- #
9
- # Set up directories and copy over sample input
10
- #
11
-
12
- # hdp-rm ${src_path}
13
- # hdp-put `dirname $0`/../README.textile tmp/
14
- # hdp-mkdir $out_root
15
-
16
- # ---------------------------------------------------------------------------
17
- #
18
- # Run scripts
19
- #
20
-
21
- cmd="word_count"
22
- # hdp-rm -r ${out_root}/${cmd}
23
- # ./examples/${cmd}.rb --run $hdp_opts $src_path ${out_root}/${cmd}
24
- # hdp-catd ${out_root}/${cmd} | head -n 20
25
- word_count=${out_root}/${cmd}
26
-
27
- cmd="sample_records"
28
- # hdp-rm -r ${out_root}/${cmd}
29
- # ./examples/${cmd}.rb --sampling_fraction=0.8 \
30
- # --run $hdp_opts $src_path ${out_root}/${cmd}
31
- # hdp-catd ${out_root}/${cmd} | head -n 200 | tail -n 20
32
- sample_records=${out_root}/${cmd}
33
-
34
-
35
- # cmd="size"
36
- # hdp-rm -r ${out_root}/${cmd}
37
- # ./examples/${cmd}.rb --run $hdp_opts $src_path ${out_root}/${cmd}
38
- # hdp-catd ${out_root}/${cmd}
39
- # size=${out_root}/${cmd}
40
-
41
-
42
- cmd="count_keys"
43
- hdp-rm -r ${out_root}/${cmd}
44
- ./examples/${cmd}.rb --run $hdp_opts $word_count ${out_root}/${cmd}
45
- hdp-catd ${out_root}/${cmd} | head -n 200 | tail -n 20
46
- count_keys=${out_root}/${cmd}
47
-