wukong 1.4.7 → 1.4.9
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +9 -0
- data/README.textile +1 -1
- data/bin/hdp-bzip +28 -0
- data/bin/hdp-mkdir +1 -1
- data/bin/hdp-stream-flat +3 -2
- data/bin/wu-lign +32 -18
- data/docpages/pig/cookbook.html +481 -0
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +1103 -0
- data/docpages/pig/piglatin_ref2.html +14340 -0
- data/docpages/pig/setup.html +505 -0
- data/docpages/pig/skin/basic.css +166 -0
- data/docpages/pig/skin/breadcrumbs.js +237 -0
- data/docpages/pig/skin/fontsize.js +166 -0
- data/docpages/pig/skin/getBlank.js +40 -0
- data/docpages/pig/skin/getMenu.js +45 -0
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +54 -0
- data/docpages/pig/skin/profile.css +181 -0
- data/docpages/pig/skin/screen.css +587 -0
- data/docpages/pig/tutorial.html +1059 -0
- data/docpages/pig/udf.html +1509 -0
- data/examples/keystore/conditional_outputter_example.rb +70 -0
- data/examples/{graph → network_graph}/adjacency_list.rb +0 -0
- data/examples/{graph → network_graph}/breadth_first_search.rb +0 -0
- data/examples/{graph → network_graph}/gen_2paths.rb +0 -0
- data/examples/{graph → network_graph}/gen_multi_edge.rb +0 -0
- data/examples/{graph → network_graph}/gen_symmetric_links.rb +0 -0
- data/examples/pagerank/run_pagerank.sh +10 -8
- data/examples/{apache_log_parser.rb → server_logs/apache_log_parser.rb} +0 -0
- data/examples/stupidly_simple_filter.rb +43 -0
- data/lib/wukong/extensions/hash.rb +13 -0
- data/lib/wukong/extensions/hash_like.rb +7 -0
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +122 -0
- data/lib/wukong/script.rb +27 -22
- data/lib/wukong/script/hadoop_command.rb +5 -3
- data/lib/wukong/streamer/accumulating_reducer.rb +2 -1
- data/wukong.gemspec +64 -26
- metadata +89 -31
- data/docpages/pig/PigLatinReferenceManual.html +0 -19134
- data/examples/foo.rb +0 -9
- data/examples/package-local.rb +0 -100
- data/examples/package.rb +0 -96
- data/examples/run_all.sh +0 -47
data/examples/foo.rb
DELETED
data/examples/package-local.rb
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
-
|
4
|
-
require 'wukong'
|
5
|
-
|
6
|
-
#
|
7
|
-
# This is so very very kludgey
|
8
|
-
#
|
9
|
-
# Input is an 'ls' file, listing files to .bz2 package.
|
10
|
-
#
|
11
|
-
# Mapper takes each in turn and creates, within a parallel directory tree under
|
12
|
-
# ~/pkgd on the HDFS, a .bz2 compressed version of the file.
|
13
|
-
#
|
14
|
-
# So, the file
|
15
|
-
# /user/me/fixd/all-20090103
|
16
|
-
# is packaged onto the DFS as
|
17
|
-
# /user/me/pkgd/user/me/fixd/all-20090103
|
18
|
-
#
|
19
|
-
# listing=tmp/fixd-all-package-listing
|
20
|
-
# hdp-rm $listing
|
21
|
-
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
22
|
-
#
|
23
|
-
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
24
|
-
#
|
25
|
-
module ExportPackager
|
26
|
-
PKGD_DIR = '/workspace/flip/pkgd'
|
27
|
-
|
28
|
-
#
|
29
|
-
#
|
30
|
-
class Reducer < Wukong::Streamer::Base
|
31
|
-
def announce *args
|
32
|
-
$stdout.puts *args
|
33
|
-
$stderr.puts *args
|
34
|
-
end
|
35
|
-
|
36
|
-
def handle_existing_target output_filename
|
37
|
-
return true unless File.exist?(output_filename)
|
38
|
-
# announce "Exists! #{output_filename}"
|
39
|
-
# return false
|
40
|
-
announce "Removing target file #{output_filename}"
|
41
|
-
begin announce `rm #{output_filename}`
|
42
|
-
rescue Exception => e ; announce e ; end
|
43
|
-
true
|
44
|
-
end
|
45
|
-
|
46
|
-
def mkdir_target_safely output_filename
|
47
|
-
output_dir = File.dirname(output_filename)
|
48
|
-
announce "Ensuring directory #{output_dir} exists"
|
49
|
-
begin announce `mkdir -p #{output_dir}`
|
50
|
-
rescue Exception => e ; announce e ; end
|
51
|
-
end
|
52
|
-
|
53
|
-
def bzip_into_pkgd_file input_filename, output_filename
|
54
|
-
announce "bzip'ing into #{output_filename}"
|
55
|
-
announce `( hadoop dfs -cat #{input_filename}/[^_]\** ) | bzip2 -c > #{output_filename}`
|
56
|
-
end
|
57
|
-
|
58
|
-
def gen_output_filename input_filename
|
59
|
-
input_filename += '.tsv' unless input_filename =~ /.*\.\w{2,}/
|
60
|
-
"%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(/^\//, '')]
|
61
|
-
end
|
62
|
-
|
63
|
-
def rsync host, local_path, remote_path=nil
|
64
|
-
remote_path ||= local_path
|
65
|
-
announce `/usr/bin/rsync -Cuvrtlp #{local_path} #{host}:#{remote_path}`
|
66
|
-
sleep 5
|
67
|
-
end
|
68
|
-
|
69
|
-
def process input_filename
|
70
|
-
output_filename = gen_output_filename(input_filename)
|
71
|
-
handle_existing_target(output_filename) or return
|
72
|
-
mkdir_target_safely output_filename
|
73
|
-
bzip_into_pkgd_file input_filename, output_filename
|
74
|
-
rsync :lab3, output_filename
|
75
|
-
#
|
76
|
-
end
|
77
|
-
|
78
|
-
def recordize line
|
79
|
-
# handle ls or straight file list, either
|
80
|
-
line.split(/\s/).last
|
81
|
-
end
|
82
|
-
|
83
|
-
def stream
|
84
|
-
super
|
85
|
-
rsync :lab3, PKGD_DIR+'/'
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
class Script < Wukong::Script
|
90
|
-
def default_options
|
91
|
-
super.merge :map_tasks => 1,
|
92
|
-
:max_node_reduce_tasks => 1, # only one reducer per local filesystem
|
93
|
-
:timeout => 40 * 60 * 1000 # timeout in ms
|
94
|
-
end
|
95
|
-
end
|
96
|
-
# Execute the script
|
97
|
-
Script.new(nil, Reducer).run
|
98
|
-
end
|
99
|
-
|
100
|
-
|
data/examples/package.rb
DELETED
@@ -1,96 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
$: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
|
3
|
-
require 'wukong'
|
4
|
-
|
5
|
-
#
|
6
|
-
# This is so very very kludgey
|
7
|
-
#
|
8
|
-
# Input is an 'ls' file, listing files to .bz2 package.
|
9
|
-
#
|
10
|
-
# Reducer takes each in turn and creates, within a parallel directory tree under
|
11
|
-
# ~/pkgd on the HDFS, a .bz2 compressed version of the file.
|
12
|
-
#
|
13
|
-
# So, the file
|
14
|
-
# /user/me/fixd/all-20090103
|
15
|
-
# is packaged onto the DFS as
|
16
|
-
# /user/me/pkgd/user/me/fixd/all-20090103
|
17
|
-
#
|
18
|
-
# listing=tmp/fixd-all-package-listing
|
19
|
-
# hdp-rm $listing
|
20
|
-
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
21
|
-
#
|
22
|
-
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
23
|
-
#
|
24
|
-
module ExportPackager
|
25
|
-
PKGD_DIR = 'pkgd'
|
26
|
-
|
27
|
-
#
|
28
|
-
#
|
29
|
-
class Reducer < Wukong::Streamer::Base
|
30
|
-
def announce str
|
31
|
-
return if str.blank?
|
32
|
-
$stderr.puts str
|
33
|
-
$stdout.puts str
|
34
|
-
end
|
35
|
-
|
36
|
-
def remove_target_filename output_filename
|
37
|
-
begin announce "rm\t#{"%-70s"%output_filename}\t" +
|
38
|
-
`( hadoop dfs -rmr #{output_filename} ) 2>&1`
|
39
|
-
rescue ; nil ; end
|
40
|
-
end
|
41
|
-
|
42
|
-
def mkdir_target_safely output_filename
|
43
|
-
output_dir = File.dirname(output_filename)
|
44
|
-
begin announce "mkdir\t#{"%-70s"%output_dir}\t" +
|
45
|
-
`( hadoop dfs -mkdir #{output_dir} ) 2>&1`
|
46
|
-
rescue ; nil ; end
|
47
|
-
end
|
48
|
-
|
49
|
-
def bzip_into_pkgd_file input_filename, output_filename
|
50
|
-
announce "cat|bz\t#{"%-70s"%input_filename}\t" +
|
51
|
-
`( hadoop dfs -cat #{input_filename}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{output_filename} ) 2>&1`
|
52
|
-
end
|
53
|
-
|
54
|
-
def verify input_filename, output_filename
|
55
|
-
announce "sha1sum\t#{"%-70s"%output_filename}\t" +
|
56
|
-
`( hadoop dfs -cat #{output_filename} | bzcat - | sha1sum ) 2>&1`
|
57
|
-
announce "sha1sum\t#{"%-70s"%input_filename}\t" +
|
58
|
-
`( hadoop dfs -cat #{input_filename}/[^_]\\* | sha1sum ) 2>&1`
|
59
|
-
end
|
60
|
-
|
61
|
-
def gen_output_filename input_filename
|
62
|
-
"%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(%r{^/},"")]
|
63
|
-
end
|
64
|
-
|
65
|
-
def process input_filename, output_filename
|
66
|
-
# remove_target_filename output_filename
|
67
|
-
# mkdir_target_safely output_filename
|
68
|
-
bzip_into_pkgd_file input_filename, output_filename
|
69
|
-
verify input_filename, output_filename
|
70
|
-
end
|
71
|
-
|
72
|
-
def stream
|
73
|
-
announce `hostname`
|
74
|
-
$stdin.each do |input_filename|
|
75
|
-
# handle ls or straight file list, either
|
76
|
-
input_filename = input_filename.chomp.strip.split(/\s/).last
|
77
|
-
output_filename = gen_output_filename input_filename
|
78
|
-
announce "********************************************************"
|
79
|
-
announce "Packing\t#{"%-70s"%input_filename}\t#{output_filename}"
|
80
|
-
process input_filename, output_filename
|
81
|
-
announce "Done\t#{"%-70s"%input_filename}\t#{output_filename}\n\n"
|
82
|
-
end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
class Script < Wukong::Script
|
87
|
-
def default_options
|
88
|
-
super.merge :timeout => (24 * 60 * 60 * 1000) # milliseconds in one day
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
#
|
94
|
-
# Execute the script
|
95
|
-
#
|
96
|
-
ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000).run
|
data/examples/run_all.sh
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
|
3
|
-
src_path="tmp/README.textile"
|
4
|
-
out_root="tmp/test"
|
5
|
-
hdp_opts="--map_tasks=1 --reduce_tasks=1"
|
6
|
-
|
7
|
-
# ---------------------------------------------------------------------------
|
8
|
-
#
|
9
|
-
# Set up directories and copy over sample input
|
10
|
-
#
|
11
|
-
|
12
|
-
# hdp-rm ${src_path}
|
13
|
-
# hdp-put `dirname $0`/../README.textile tmp/
|
14
|
-
# hdp-mkdir $out_root
|
15
|
-
|
16
|
-
# ---------------------------------------------------------------------------
|
17
|
-
#
|
18
|
-
# Run scripts
|
19
|
-
#
|
20
|
-
|
21
|
-
cmd="word_count"
|
22
|
-
# hdp-rm -r ${out_root}/${cmd}
|
23
|
-
# ./examples/${cmd}.rb --run $hdp_opts $src_path ${out_root}/${cmd}
|
24
|
-
# hdp-catd ${out_root}/${cmd} | head -n 20
|
25
|
-
word_count=${out_root}/${cmd}
|
26
|
-
|
27
|
-
cmd="sample_records"
|
28
|
-
# hdp-rm -r ${out_root}/${cmd}
|
29
|
-
# ./examples/${cmd}.rb --sampling_fraction=0.8 \
|
30
|
-
# --run $hdp_opts $src_path ${out_root}/${cmd}
|
31
|
-
# hdp-catd ${out_root}/${cmd} | head -n 200 | tail -n 20
|
32
|
-
sample_records=${out_root}/${cmd}
|
33
|
-
|
34
|
-
|
35
|
-
# cmd="size"
|
36
|
-
# hdp-rm -r ${out_root}/${cmd}
|
37
|
-
# ./examples/${cmd}.rb --run $hdp_opts $src_path ${out_root}/${cmd}
|
38
|
-
# hdp-catd ${out_root}/${cmd}
|
39
|
-
# size=${out_root}/${cmd}
|
40
|
-
|
41
|
-
|
42
|
-
cmd="count_keys"
|
43
|
-
hdp-rm -r ${out_root}/${cmd}
|
44
|
-
./examples/${cmd}.rb --run $hdp_opts $word_count ${out_root}/${cmd}
|
45
|
-
hdp-catd ${out_root}/${cmd} | head -n 200 | tail -n 20
|
46
|
-
count_keys=${out_root}/${cmd}
|
47
|
-
|