wukong 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
@@ -0,0 +1,100 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
|
4
|
+
require 'wukong'
|
5
|
+
|
6
|
+
#
|
7
|
+
# This is so very very kludgey
|
8
|
+
#
|
9
|
+
# Input is an 'ls' file, listing files to .bz2 package.
|
10
|
+
#
|
11
|
+
# Reducer takes each in turn and creates, within a parallel directory tree under
|
12
|
+
# ~/pkgd on the HDFS, a .bz2 compressed version of the file.
|
13
|
+
#
|
14
|
+
# So, the file
|
15
|
+
# /user/me/fixd/all-20090103
|
16
|
+
# is packaged onto the DFS as
|
17
|
+
# /user/me/pkgd/user/me/fixd/all-20090103
|
18
|
+
#
|
19
|
+
# listing=tmp/fixd-all-package-listing
|
20
|
+
# hdp-rm $listing
|
21
|
+
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
22
|
+
#
|
23
|
+
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
24
|
+
#
|
25
|
+
module ExportPackager
|
26
|
+
PKGD_DIR = '/workspace/flip/pkgd'
|
27
|
+
|
28
|
+
#
|
29
|
+
#
|
30
|
+
class Reducer < Wukong::Streamer::Base
|
31
|
+
#
# Echo a message to both stdout and stderr.
#
# args -- whatever should be printed; handed unchanged to IO#puts on each
#         stream.
#
def announce *args
  # The splat is parenthesised so Ruby cannot read `puts *args` as a
  # multiplication (silences the "`*' interpreted as argument prefix" warning).
  $stdout.puts(*args)
  $stderr.puts(*args)
end
|
35
|
+
|
36
|
+
#
# Clear the way for a fresh package by deleting any file already sitting at
# +output_filename+.
#
# Always returns true, so the caller carries on with packaging; a failed
# delete is reported through #announce instead of being raised.
#
def handle_existing_target output_filename
  return true unless File.exist?(output_filename)
  announce "Removing target file #{output_filename}"
  begin
    # File.delete never passes the path through a shell, so names holding
    # spaces or shell metacharacters are removed safely.
    File.delete(output_filename)
  rescue SystemCallError => e
    # Only filesystem errors are expected here; signals and exit requests
    # are no longer swallowed as they were by `rescue Exception`.
    announce e
  end
  true
end
|
45
|
+
|
46
|
+
#
# Make sure the directory that will hold +output_filename+ exists, creating
# any missing parents.
#
# Failure is reported through #announce and is not raised.
#
def mkdir_target_safely output_filename
  output_dir = File.dirname(output_filename)
  announce "Ensuring directory #{output_dir} exists"
  # The argument-list form of Kernel#system bypasses the shell, so a path
  # containing spaces or metacharacters is passed to mkdir as one argument.
  return if system('mkdir', '-p', output_dir)
  announce "Could not create directory #{output_dir}"
end
|
52
|
+
|
53
|
+
#
# Stream every non-underscore-prefixed entry under +input_filename+ out of the
# DFS, bzip2 it, and write the result to +output_filename+. Whatever the
# pipeline prints on stdout is announced.
#
# NOTE(review): inside backticks `\*` reaches the shell as a bare `*`, so the
# glob is visible to the local shell as well as to hadoop -- confirm intended.
#
def bzip_into_pkgd_file input_filename, output_filename
  announce "bzip'ing into #{output_filename}"
  pipeline_output = `( hadoop dfs -cat #{input_filename}/[^_]\** ) | bzip2 -c > #{output_filename}`
  announce pipeline_output
end
|
57
|
+
|
58
|
+
#
# Build the packaged path for +input_filename+:
#
#   PKGD_DIR / <input path without its leading slash> [.tsv] .bz2
#
# '.tsv' is added only when the file's own name carries no extension
# (a dot followed by two or more word characters).
#
def gen_output_filename input_filename
  # Look at the final path component only: a dotted parent directory
  # (e.g. /user/me/fixd.v2/part-00000) must not count as an extension.
  has_extension = File.basename(input_filename) =~ /\.\w{2,}/
  packaged_name = has_extension ? input_filename : input_filename + '.tsv'
  # \A anchors to the start of the string; only one leading slash is dropped.
  "%s/%s.bz2" % [PKGD_DIR, packaged_name.sub(%r{\A/}, '')]
end
|
62
|
+
|
63
|
+
#
# Push +local_path+ to +host+ with /usr/bin/rsync, announce rsync's stdout,
# then pause for five seconds.
#
# remote_path -- destination on the host; the local path is reused when it is
#                not given.
#
def rsync host, local_path, remote_path=nil
  destination = remote_path || local_path
  transcript  = `/usr/bin/rsync -Cuvrtlp #{local_path} #{host}:#{destination}`
  announce transcript
  sleep 5
end
|
68
|
+
|
69
|
+
#
# Package one input file: work out its target name, clear any existing target,
# make the target directory, compress into it, then rsync the result to :lab3.
#
# Stops early (returning nil) when handle_existing_target says not to proceed.
#
def process input_filename
  target = gen_output_filename(input_filename)
  return unless handle_existing_target(target)
  mkdir_target_safely(target)
  bzip_into_pkgd_file(input_filename, target)
  rsync(:lab3, target)
end
|
77
|
+
|
78
|
+
#
# Pull the file name out of an input line. The line may be a bare path or a
# whitespace-separated 'ls' row; in both cases the last field is returned.
#
def recordize line
  fields = line.split(/\s/)
  fields[-1]
end
|
82
|
+
|
83
|
+
#
# Run the inherited streaming loop, then rsync the whole package directory
# (with a trailing slash) to :lab3 in one final pass.
#
def stream
  super
  rsync(:lab3, "#{PKGD_DIR}/")
end
|
87
|
+
end
|
88
|
+
|
89
|
+
#
# Job settings for the packager.
#
class Script < Wukong::Script
  # Inherited defaults with the packager's own map-task, reducer and timeout
  # settings laid over them.
  def default_options
    overrides = {
      :map_tasks             => 1,
      :max_node_reduce_tasks => 1,              # only one reducer per local filesystem
      :timeout               => 40 * 60 * 1000  # timeout in ms
    }
    super.merge(overrides)
  end
end

# Execute the script
Script.new(nil, Reducer).run
|
98
|
+
end
|
99
|
+
|
100
|
+
|
data/examples/package.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
#
|
6
|
+
# This is so very very kludgey
|
7
|
+
#
|
8
|
+
# Input is an 'ls' file, listing files to .bz2 package.
|
9
|
+
#
|
10
|
+
# Reducer takes each in turn and creates, within a parallel directory tree under
|
11
|
+
# ~/pkgd on the HDFS, a .bz2 compressed version of the file.
|
12
|
+
#
|
13
|
+
# So, the file
|
14
|
+
# /user/me/fixd/all-20090103
|
15
|
+
# is packaged onto the DFS as
|
16
|
+
# /user/me/pkgd/user/me/fixd/all-20090103
|
17
|
+
#
|
18
|
+
# listing=tmp/fixd-all-package-listing
|
19
|
+
# hdp-rm $listing
|
20
|
+
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
21
|
+
#
|
22
|
+
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
23
|
+
#
|
24
|
+
module ExportPackager
|
25
|
+
PKGD_DIR = 'pkgd'
|
26
|
+
|
27
|
+
#
|
28
|
+
#
|
29
|
+
class Reducer < Wukong::Streamer::Base
|
30
|
+
#
# Print +str+ to stderr and then stdout; blank messages are dropped.
#
def announce str
  unless str.blank?
    $stderr.puts str
    $stdout.puts str
  end
end
|
35
|
+
|
36
|
+
#
# Recursively remove +output_filename+ from the DFS and announce the command's
# combined stdout/stderr. Any StandardError is swallowed and nil is returned.
#
def remove_target_filename output_filename
  label  = "rm\t#{"%-70s" % output_filename}\t"
  result = `( hadoop dfs -rmr #{output_filename} ) 2>&1`
  announce(label + result)
rescue
  nil
end
|
41
|
+
|
42
|
+
#
# Create the DFS directory that will hold +output_filename+ and announce the
# command's combined stdout/stderr. Any StandardError is swallowed and nil is
# returned.
#
def mkdir_target_safely output_filename
  output_dir = File.dirname(output_filename)
  label      = "mkdir\t#{"%-70s" % output_dir}\t"
  result     = `( hadoop dfs -mkdir #{output_dir} ) 2>&1`
  announce(label + result)
rescue
  nil
end
|
48
|
+
|
49
|
+
#
# Cat every non-underscore-prefixed entry under +input_filename+ from the DFS,
# bzip2 the stream, and put it back on the DFS as +output_filename+. The
# pipeline's combined stdout/stderr is announced.
#
def bzip_into_pkgd_file input_filename, output_filename
  label  = "cat|bz\t#{"%-70s" % input_filename}\t"
  result = `( hadoop dfs -cat #{input_filename}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{output_filename} ) 2>&1`
  announce(label + result)
end
|
53
|
+
|
54
|
+
#
# Announce two sha1sums so they can be compared by eye: first the decompressed
# package (+output_filename+), then the original input parts.
#
def verify input_filename, output_filename
  packed_sum = `( hadoop dfs -cat #{output_filename} | bzcat - | sha1sum ) 2>&1`
  announce("sha1sum\t#{"%-70s" % output_filename}\t" + packed_sum)

  source_sum = `( hadoop dfs -cat #{input_filename}/[^_]\\* | sha1sum ) 2>&1`
  announce("sha1sum\t#{"%-70s" % input_filename}\t" + source_sum)
end
|
60
|
+
|
61
|
+
#
# Build the packaged path for +input_filename+:
#
#   PKGD_DIR / <input path without its leading slash> .bz2
#
def gen_output_filename input_filename
  # \A anchors to the start of the whole string (^ would also match after any
  # embedded newline), and sub removes just that one leading slash.
  relative_path = input_filename.sub(%r{\A/}, "")
  "%s/%s.bz2" % [PKGD_DIR, relative_path]
end
|
64
|
+
|
65
|
+
#
# Package one file and then checksum both sides.
#
# The remove-target and mkdir steps remain switched off, as before:
#   remove_target_filename output_filename
#   mkdir_target_safely    output_filename
#
def process input_filename, output_filename
  bzip_into_pkgd_file(input_filename, output_filename)
  verify(input_filename, output_filename)
end
|
71
|
+
|
72
|
+
#
# Announce this machine's hostname, then package every file named on $stdin.
#
# Each input line may be a bare path or an 'ls' row; the last
# whitespace-separated field is taken as the file name. Blank lines are
# skipped.
#
def stream
  announce `hostname`
  $stdin.each do |line|
    # handle ls or straight file list, either
    input_filename = line.strip.split(/\s/).last
    # A blank line has no fields; without this guard nil would be handed to
    # gen_output_filename and abort the whole run.
    next if input_filename.nil?
    output_filename = gen_output_filename input_filename
    announce "********************************************************"
    announce "Packing\t#{"%-70s"%input_filename}\t#{output_filename}"
    process input_filename, output_filename
    announce "Done\t#{"%-70s"%input_filename}\t#{output_filename}\n\n"
  end
end
|
84
|
+
end
|
85
|
+
|
86
|
+
#
# Job settings for the packager.
#
class Script < Wukong::Script
  # Inherited defaults with the timeout raised to one day.
  def default_options
    one_day_in_ms = 24 * 60 * 60 * 1000 # milliseconds in one day
    super.merge(:timeout => one_day_in_ms)
  end
end
|
91
|
+
end
|
92
|
+
|
93
|
+
#
|
94
|
+
# Execute the script
|
95
|
+
#
|
96
|
+
ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000).run
|
@@ -0,0 +1,57 @@
|
|
1
|
+
|
2
|
+
-- ===========================================================================
|
3
|
+
--
|
4
|
+
-- Load Graph
|
5
|
+
--
|
6
|
+
-- Raw a-follows-b rows. (rsrc is presumably a record-type tag -- confirm.)
AFollowsB   = LOAD 'twnew/all/a_follows_b'
              AS (rsrc: chararray, user_a_id: int, user_b_id: int) ;

-- Keep only the two endpoints, renamed to src / dest.
FollEdges_0 = FOREACH AFollowsB GENERATE user_a_id AS src, user_b_id AS dest ;

-- One row per src: starting pagerank of 1.0 plus the bag of its dests.
InitPagerankFoll_0 = GROUP FollEdges_0 BY src ;
InitPagerankFoll_1 = FOREACH InitPagerankFoll_0 GENERATE group AS src, 1.0F AS pagerank:float, FollEdges_0.(dest) AS dests ;

-- Clear any previous output, then write iteration 000.
rmf twnew/pagerank-foll/pagerank_graph_000 ;
STORE InitPagerankFoll_1 INTO 'twnew/pagerank-foll/pagerank_graph_000';
|
17
|
+
|
18
|
+
|
19
|
+
-- MultiEdge = LOAD 'twnew/all/multi_edge' AS (
|
20
|
+
-- rsrc: chararray, src: int, dest: int,
|
21
|
+
-- fo: int, fr: int,
|
22
|
+
-- re_out: int, re_in: int,
|
23
|
+
-- at_out: int, at_in: int,
|
24
|
+
-- rt_out: int, rt_in: int,
|
25
|
+
-- fv_out: int, fv_in: int) ;
|
26
|
+
--
|
27
|
+
-- SymmEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, fr ;
|
28
|
+
-- SymmEdges_1 = FILTER SymmEdges_0 BY (fo >= 1.0) AND (fr >= 1.0) ;
|
29
|
+
-- SymmEdges = FOREACH SymmEdges_1 GENERATE src, dest ;
|
30
|
+
-- -- rm twnew/graphs/symm_edges; STORE SymmEdges INTO 'twnew/graphs/symm_edges' ;
|
31
|
+
-- SymmEdges = LOAD 'twnew/graphs/symm_edges' AS (src:int , dest:int);
|
32
|
+
--
|
33
|
+
-- AnyoutEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, re_out, fv_out ;
|
34
|
+
-- AnyoutEdges_1 = FILTER AnyoutEdges_0 BY (fo >= 1.0) OR (re_out >= 1.0) OR (fv_out >= 1.0) ;
|
35
|
+
-- AnyoutEdges = FOREACH AnyoutEdges_1 GENERATE src, dest ;
|
36
|
+
-- -- rm twnew/graphs/anyout_edges; STORE AnyoutEdges INTO 'twnew/graphs/anyout_edges' ;
|
37
|
+
-- AnyoutEdges = LOAD 'twnew/graphs/anyout_edges' AS (src:int , dest:int);
|
38
|
+
--
|
39
|
+
--
|
40
|
+
-- InitPagerankSymm_0 = GROUP SymmEdges BY src ;
|
41
|
+
-- InitPagerankSymm_1 = FOREACH InitPagerankSymm_0 GENERATE
|
42
|
+
-- group AS src,
|
43
|
+
-- 1.0F AS pagerank:float,
|
44
|
+
-- SymmEdges.(dest) AS dests
|
45
|
+
-- ;
|
46
|
+
-- rm twnew/pagerank-symm/pagerank_graph_000 ;
|
47
|
+
-- STORE InitPagerankSymm_1 INTO 'twnew/pagerank-symm/pagerank_graph_000';
|
48
|
+
--
|
49
|
+
--
|
50
|
+
-- InitPagerankAnyout_0 = GROUP AnyoutEdges BY src ;
|
51
|
+
-- InitPagerankAnyout_1 = FOREACH InitPagerankAnyout_0 GENERATE
|
52
|
+
-- group AS src,
|
53
|
+
-- 1.0F AS pagerank:float,
|
54
|
+
-- AnyoutEdges.(dest) AS dests
|
55
|
+
-- ;
|
56
|
+
-- rm twnew/pagerank-anyout/pagerank_graph_000 ;
|
57
|
+
-- STORE InitPagerankAnyout_1 INTO 'twnew/pagerank-anyout/pagerank_graph_000';
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
#
|
6
|
+
#
|
7
|
+
#
|
8
|
+
|
9
|
+
module PageRank
|
10
|
+
#
|
11
|
+
# Damping factor (prob. of a 'random' jump)
|
12
|
+
# 0.85 works well in practice. See http://en.wikipedia.org/wiki/Pagerank
|
13
|
+
#
|
14
|
+
DAMPING_FACTOR = 0.85
|
15
|
+
|
16
|
+
#
|
17
|
+
# Each user's line looks like
|
18
|
+
#
|
19
|
+
# user_a pagerank id1,id2,...,idN
|
20
|
+
#
|
21
|
+
# we need to disperse this user's pagerank to each of id1..idN, and
|
22
|
+
# rendezvous the list of outbound links at user_a's reducer as well.
|
23
|
+
#
|
24
|
+
module Iterating
|
25
|
+
class Mapper < Wukong::Streamer::Base
|
26
|
+
#
|
27
|
+
# Send pagerank to each page, and send the dests list back to self
|
28
|
+
#
|
29
|
+
#
# Handle one node: strip Pig's bag/tuple punctuation from the dests field,
# hand a share of the node's pagerank to every dest, then send the cleaned
# dests list back to the node itself.
#
def process src, pagerank, dests_str, &block
  # This lets us use Pig to generate the input
  dests_str = dests_str.delete('(){}')
  yield_pagerank_shares(src, pagerank, dests_str.split(","), &block)
  yield_own_dest_list(src, dests_str, &block)
end
|
36
|
+
|
37
|
+
#
|
38
|
+
# Take the source node's pagerank and distribute it among all the out-nodes
|
39
|
+
#
|
40
|
+
#
# Split the source node's pagerank evenly over its out-nodes and yield one
# [dest, 'p', share] record per out-node.
#
def yield_pagerank_shares src, pagerank, dests
  share = pagerank.to_f / dests.length
  dests.each { |dest| yield [dest, 'p', share] }
end
|
46
|
+
|
47
|
+
#
|
48
|
+
# Dispatch this user's out-node list to rendezvous with itself.
|
49
|
+
#
|
50
|
+
#
# Yield a single [src, 'd', dests_str] record so the node's own out-node list
# is routed back to it.
#
def yield_own_dest_list src, dests_str
  own_record = [src, 'd', dests_str]
  yield own_record
end
|
53
|
+
end
|
54
|
+
|
55
|
+
#
# Adds up the pagerank shares sent to one node and remembers that node's own
# out-node list.
#
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :node_id, :pagerank, :dests_str

  # Open a new group: no pagerank collected yet and no dests seen.
  def start! node_id, *args
    self.node_id   = node_id
    self.pagerank  = 0.0
    self.dests_str = nil
  end

  # Fold in one record: 'p' adds a pagerank share, 'd' stores the dests list.
  # Any other tag raises.
  def accumulate node_id, what, val
    if what == 'p'
      self.pagerank += val.to_f
    elsif what == 'd'
      self.dests_str = val
    else
      raise "Don't know how to accumulate #{[node_id, what, val].inspect}"
    end
  end

  # Emit [node_id, damped pagerank, dests] in the same shape the mapper reads.
  # A node with no dests list gets the placeholder 'dummy'.
  def finalize
    damped_pagerank = (pagerank * DAMPING_FACTOR) + (1 - DAMPING_FACTOR)
    self.dests_str = 'dummy' if dests_str.blank?
    yield [node_id, damped_pagerank, dests_str]
  end
end
|
80
|
+
|
81
|
+
#
# Job settings for one pagerank iteration.
#
class Script < Wukong::Script
  # Inherited defaults plus an extra -jobconf argument setting
  # io.sort.record.percent.
  def default_options
    jobconf = ' -jobconf io.sort.record.percent=0.25 '
    super.merge(:extra_args => jobconf)
  end
end
Script.new(Mapper, Reducer).run
|
87
|
+
end
|
88
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/streamer/set_reducer'
|
5
|
+
|
6
|
+
module PageRank
|
7
|
+
#
# Job settings for building the initial pagerank graph.
#
class Script < Wukong::Script
  #
  # Input format is
  #
  #   rsrc    src_id  dest_id  [... junk ...]
  #
  # Only the src and dest IDs are wanted, so the map step is a plain `cut`
  # of tab-separated fields 2 and 3.
  #
  def map_command
    "/usr/bin/cut -d\"\t\" -f2,3"
  end

  # Inherited defaults plus an extra -jobconf argument setting
  # io.sort.record.percent.
  def default_options
    jobconf = ' -jobconf io.sort.record.percent=0.25 '
    super.merge(:extra_args => jobconf)
  end
end
|
23
|
+
|
24
|
+
#
|
25
|
+
# Accumulate the dests list in memory, dump as a whole. Multiple edges between
|
26
|
+
# any two nodes are permitted, and will accumulate pagerank according to the
|
27
|
+
# edge's multiplicity.
|
28
|
+
#
|
29
|
+
#
# Gathers every dest seen for a src and emits them as one record.
#
class Reducer < Wukong::Streamer::ListReducer
  # Append this edge's dest to the running list.
  def accumulate src, dest
    values << dest
  end

  # Emit src, initial pagerank, and flattened dests list.
  # An empty list is replaced by the placeholder 'dummy'.
  def finalize
    self.values = ['dummy'] if values.blank?
    flattened_dests = values.to_a.join(",")
    yield [key, 1.0, flattened_dests]
  end
end

# Execute the script
Script.new(nil, PageRank::Reducer).run
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env bash
#
# Run max_iter rounds of pagerank.rb, feeding each round's output directory
# (pagerank_graph_NNN) to the next round as input.
#
# Usage: run_pagerank.sh WORK_DIR

# Directory to pagerank on.
work_dir=$1 ; shift
if [ -z "$work_dir" ] ; then
  echo "Please specify the parent of the directory made by gen_initial_pagerank" >&2
  # A usage error must not look like success to the caller.
  exit 1
fi

# How many rounds to run
max_iter=10
# this directory
script_dir="$(dirname "$0")"

for (( curr=0 , next=1 ; curr < max_iter ; curr++ , next++ )) ; do
  curr_str=$(printf "%03d" "${curr}")
  next_str=$(printf "%03d" "${next}")
  curr_dir="$work_dir/pagerank_graph_${curr_str}"
  next_dir="$work_dir/pagerank_graph_${next_str}"
  # Every round reads the previous round's output, so there is nothing useful
  # to do after a failed round: stop and pass its exit status on.
  "$script_dir/pagerank.rb" --rm --run "$curr_dir" "$next_dir" || exit $?
done
|
@@ -0,0 +1,173 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/streamer/rank_and_bin_reducer'
|
5
|
+
|
6
|
+
#
|
7
|
+
# This example uses the classes from http://github.com/mrflip/twitter_friends
|
8
|
+
# (That's sloppy, and I apologize. I'm building this script for that, but it
|
9
|
+
# seems broadly useful and I'm not maintaining two copies. Once this script is
|
10
|
+
# more worky we'll make it standalone. Anyway you should get the picture.)
|
11
|
+
#
|
12
|
+
$: << File.dirname(__FILE__)+'/../../projects/twitter_friends/lib'
|
13
|
+
require 'twitter_friends';
|
14
|
+
require 'twitter_friends/struct_model' ; include TwitterFriends::StructModel
|
15
|
+
|
16
|
+
|
17
|
+
#
|
18
|
+
# attrs to bin
|
19
|
+
#
|
20
|
+
# For each resource, the attributes to bin, as [attribute, abbreviation] pairs.
BINNABLE_ATTRS = {
  :twitter_user => [
    [:followers_count,  :fo   ],
    [:friends_count,    :fr   ],
    [:statuses_count,   :st   ],
    [:favourites_count, :fv   ],
    [:created_at,       :crat ]
  ]
}
# Short tags emitted in place of full resource names.
RESOURCE_ALIASES = {
  :twitter_user => :u,
  :user_metrics => :um,
}
#
# KLUDGE This is not DRY at all but let's get it working first
#
# NOTE(review): BINNABLE_ATTRS has no :user_metrics entry, so a plain []
# lookup returned nil and `.map` raised NoMethodError while the file was still
# loading. fetch with an empty default keeps the script loadable; confirm
# whether this struct was meant to be built from :twitter_user instead.
BinUserMetrics = TypedStruct.new(
  [:id, Integer],
  *BINNABLE_ATTRS.fetch(:user_metrics, []).map{|attr, attr_abbr| [attr_abbr, Integer] }
)
# Maps a resource tag back to the struct class used to reassemble it.
# NOTE(review): BinTwitterUser is not defined in this file -- presumably it
# comes from twitter_friends; verify.
BINNED_RESOURCE_ALIASES = {
  :u => BinTwitterUser,
}
|
44
|
+
|
45
|
+
module RankAndBinAttrs
|
46
|
+
class ExplodeResourceMapper < Wukong::Streamer::StructStreamer
|
47
|
+
#
# Read +attr+ from +thing+ and render it as a fixed-width, zero-padded string
# chosen by the attribute's declared type (Integer, Float or Bignum). Any
# other declared type raises, with the value and type name as the message.
#
def get_and_format_attr thing, attr
  val       = thing.send(attr)
  type_name = thing.members_types[attr].to_s.to_sym
  if    type_name == :Integer then "%010d"   % val.to_i
  elsif type_name == :Float   then "%020.7f" % val.to_f
  elsif type_name == :Bignum  then "%020d"   % val.to_i
  else
    raise [val, type_name].inspect
  end
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# The data expansion of this mapper is large enough that it makes sense to
|
60
|
+
# be a little responsible with what we emit. We'll use the RESOURCE_ALIASES
|
61
|
+
# and BINNABLE_ATTRS hashes, above, to dump a more parsimonious
|
62
|
+
# representation.
|
63
|
+
#
|
64
|
+
#
# Explode one object into one record per binnable attribute:
#
#   [resource alias, attribute abbreviation, formatted value, integer id]
#
# Objects whose resource has no BINNABLE_ATTRS entry yield nothing.
#
def process thing, *args, &block
  resource   = thing.class.resource_name
  attr_abbrs = BINNABLE_ATTRS[resource]
  return unless attr_abbrs
  attr_abbrs.each do |attr, abbr|
    yield [RESOURCE_ALIASES[resource], abbr, get_and_format_attr(thing, attr), thing.id.to_i]
  end
end
|
76
|
+
end
|
77
|
+
|
78
|
+
#
# Ranks and bins the values of each (resource, attribute) pair.
#
class BinAttrReducer < Wukong::Streamer::RankAndBinReducer
  attr_accessor :last_rsrc_attr

  #
  # Note that we might get several different resources at the same reducer.
  #
  # The key is the value itself. When the (rsrc, attr) pair changes, the order
  # parameters are reset first. Since each partition has the same cardinality
  # there is no need to fiddle with the bin_size, etc.
  #
  def get_key rsrc, attr, val, *args
    current_pair = [rsrc, attr]
    unless current_pair == last_rsrc_attr
      reset_order_params!
      self.last_rsrc_attr = current_pair
    end
    val
  end

  #
  # Note well -- we are rearranging the field order to
  #
  #   resource_abbr    id    attr_abbr    bin
  #
  # for proper sorting to the re-assembler
  #
  def emit record
    rsrc, attr, _val, id, _numbering, _rank, bin = record
    super([rsrc, id, attr, bin])
  end
end
|
106
|
+
|
107
|
+
#
# Puts the binned attributes of one (resource, id) back into a single object.
#
class ReassembleObjectReducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :thing

  # Look up the struct class registered for a resource abbreviation.
  def klass_from_abbr rsrc_abbr
    BINNED_RESOURCE_ALIASES[rsrc_abbr.to_sym]
  end

  # Records are grouped on [resource abbreviation, integer id].
  def get_key rsrc_abbr, id, *args
    [rsrc_abbr, id.to_i]
  end

  # Open a new group with a fresh object carrying only its id.
  def start! rsrc_abbr, id, *args
    self.thing = klass_from_abbr(rsrc_abbr).new(id.to_i)
  end

  # Store one binned value through the attribute's writer.
  def accumulate rsrc, id, attr, bin
    writer = "#{attr}="
    thing.send(writer, bin)
  end

  # Emit the reassembled object.
  def finalize
    yield thing
  end
end
|
129
|
+
|
130
|
+
#
|
131
|
+
# Two-phase script
|
132
|
+
#
|
133
|
+
# FIXME -- We need a runner class to manage this.
|
134
|
+
#
|
135
|
+
#
# Chooses the mapper/reducer pair and sort settings from the --phase=N
# command-line flag.
#
class Script < Wukong::Script
  attr_accessor :phase

  # KLUDGE !!
  #
  # Raises when no --phase= flag is given, or when phase 2 is requested.
  def initialize
    phases = requested_phases
    if phases.include?(1)
      # Phase 1 -- Steal underpants. Also, disassemble each object, and find
      # the bin for each binnable attribute's value
      self.phase = 1
      self.mapper_klass, self.reducer_klass = [ExplodeResourceMapper, BinAttrReducer]
    elsif phases.include?(2)
      # Phase 2 -- ????
      raise "Phase 2 : ????"
    elsif phases.include?(3)
      # Phase 3 -- profit. In this case, put records back together.
      self.phase = 3
      self.mapper_klass, self.reducer_klass = [nil, ReassembleObjectReducer]
    else
      raise "Please run me with a --phase= option"
    end
    super mapper_klass, reducer_klass
  end

  # Inherited defaults plus the sort/partition settings for the chosen phase.
  def default_options
    extra_options =
      case self.phase
      # partition on [rsrc, attr]; sort on [rsrc, attr, val]
      when 1 then { :sort_fields => 3, :partition_fields => 2 }
      # sort on [rsrc, id]
      when 3 then { :sort_fields => 2 }
      else        { }
      end
    super.merge extra_options
  end

  private

  # Phase numbers named by --phase=N flags in ARGV. The pattern is anchored at
  # both ends so that --phase=10 is not taken for phase 1.
  def requested_phases
    ARGV.map { |arg| arg[/\A--phase=(\d+)\z/, 1] }.compact.map { |num| num.to_i }
  end
end

# execute script
Script.new.run
|
173
|
+
end
|