wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
|
3
|
+
|
|
4
|
+
require 'wukong'
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# This is so very very kludgey
|
|
8
|
+
#
|
|
9
|
+
# Input is an 'ls' file, listing files to .bz2 package.
|
|
10
|
+
#
|
|
11
|
+
# Reducer takes each in turn and creates, within a parallel directory tree under
|
|
12
|
+
# PKGD_DIR on the local filesystem, a .bz2 compressed version of the file.
|
|
13
|
+
#
|
|
14
|
+
# So, the file
|
|
15
|
+
# /user/me/fixd/all-20090103
|
|
16
|
+
# is packaged onto the DFS as
|
|
17
|
+
# /user/me/pkgd/user/me/fixd/all-20090103
|
|
18
|
+
#
|
|
19
|
+
# listing=tmp/fixd-all-package-listing
|
|
20
|
+
# hdp-rm $listing
|
|
21
|
+
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
|
22
|
+
#
|
|
23
|
+
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
|
24
|
+
#
|
|
25
|
+
module ExportPackager
|
|
26
|
+
PKGD_DIR = '/workspace/flip/pkgd'
|
|
27
|
+
|
|
28
|
+
#
|
|
29
|
+
#
|
|
30
|
+
#
# Streamer that packages each listed HDFS path as a .bz2 file on the local
# filesystem under PKGD_DIR, then rsyncs the result to the :lab3 host.
#
class Reducer < Wukong::Streamer::Base
  # Write the given message(s) to both $stdout and $stderr.
  def announce *args
    $stdout.puts(*args)
    $stderr.puts(*args)
  end

  # Make way for a fresh package file: an existing +output_filename+ is
  # deleted. A failed delete is announced, not raised.
  #
  # Always returns true, so #process goes on to (re)build the package.
  # (The alternative -- announce "Exists!" and return false to skip the
  # file -- is intentionally not used.)
  def handle_existing_target output_filename
    return true unless File.exist?(output_filename)
    announce "Removing target file #{output_filename}"
    begin
      # File.delete rather than a shelled-out `rm`: the path never passes
      # through a shell, so spaces or metacharacters in it are harmless.
      File.delete(output_filename)
    rescue StandardError => e
      # StandardError only -- Interrupt and SystemExit must still stop the task.
      announce e
    end
    true
  end

  # Create the directory that will hold +output_filename+, parents included.
  # A failure is announced rather than raised.
  def mkdir_target_safely output_filename
    output_dir = File.dirname(output_filename)
    announce "Ensuring directory #{output_dir} exists"
    # Argument-list form of system: mkdir receives the path verbatim, with no
    # shell word-splitting or expansion.
    created = system('mkdir', '-p', output_dir)
    announce "Could not create directory #{output_dir}" unless created
  end

  # Concatenate the HDFS files under +input_filename+ whose names do not
  # start with '_', bzip2 the stream, and write it to the local
  # +output_filename+. Whatever the pipeline prints on stdout is announced.
  #
  # NOTE(review): both paths are interpolated unquoted into a shell command;
  # this presumably relies on listing entries containing no spaces or shell
  # metacharacters -- confirm.
  def bzip_into_pkgd_file input_filename, output_filename
    announce "bzip'ing into #{output_filename}"
    announce `( hadoop dfs -cat #{input_filename}/[^_]\** ) | bzip2 -c > #{output_filename}`
  end

  # Map an input path to its package path under PKGD_DIR: '.tsv' is appended
  # when the path has no dot followed by two or more word characters, a
  # leading '/' is dropped, and '.bz2' is added.
  #
  #   gen_output_filename('/user/me/fixd/all-20090103')
  #   # => "#{PKGD_DIR}/user/me/fixd/all-20090103.tsv.bz2"
  def gen_output_filename input_filename
    input_filename += '.tsv' unless input_filename =~ /.*\.\w{2,}/
    "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(/^\//, '')]
  end

  # Copy +local_path+ to +host+ with rsync, using the same path remotely
  # unless +remote_path+ is given; rsync's stdout is announced. Pauses five
  # seconds afterwards.
  def rsync host, local_path, remote_path=nil
    remote_path ||= local_path
    announce `/usr/bin/rsync -Cuvrtlp #{local_path} #{host}:#{remote_path}`
    sleep 5
  end

  # Package one input path: clear any stale target, ensure its directory
  # exists, build the .bz2, then rsync it to :lab3.
  def process input_filename
    output_filename = gen_output_filename(input_filename)
    return unless handle_existing_target(output_filename)
    mkdir_target_safely output_filename
    bzip_into_pkgd_file input_filename, output_filename
    rsync :lab3, output_filename
  end

  # The path is the last whitespace-separated field of the line, which
  # handles an `ls` listing or a straight file list, either.
  def recordize line
    line.split(/\s/).last
  end

  # Run the inherited #stream, then rsync the whole PKGD_DIR tree to :lab3.
  def stream
    super
    rsync :lab3, PKGD_DIR+'/'
  end
end
|
|
88
|
+
|
|
89
|
+
# Script wrapper that supplies this job's default options.
class Script < Wukong::Script
  # The inherited defaults overlaid with this job's settings.
  def default_options
    packaging_defaults = {
      :map_tasks             => 1,
      :max_node_reduce_tasks => 1,              # only one reducer per local filesystem
      :timeout               => 40 * 60 * 1000  # timeout in ms
    }
    super.merge packaging_defaults
  end
end
|
|
96
|
+
# Execute the script
|
|
97
|
+
Script.new(nil, Reducer).run
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
|
data/examples/package.rb
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# This is so very very kludgey
|
|
7
|
+
#
|
|
8
|
+
# Input is an 'ls' file, listing files to .bz2 package.
|
|
9
|
+
#
|
|
10
|
+
# Reducer takes each in turn and creates, within a parallel directory tree under
|
|
11
|
+
# ~/pkgd on the HDFS, a .bz2 compressed version of the file.
|
|
12
|
+
#
|
|
13
|
+
# So, the file
|
|
14
|
+
# /user/me/fixd/all-20090103
|
|
15
|
+
# is packaged onto the DFS as
|
|
16
|
+
# /user/me/pkgd/user/me/fixd/all-20090103
|
|
17
|
+
#
|
|
18
|
+
# listing=tmp/fixd-all-package-listing
|
|
19
|
+
# hdp-rm $listing
|
|
20
|
+
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
|
21
|
+
#
|
|
22
|
+
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
|
23
|
+
#
|
|
24
|
+
module ExportPackager
|
|
25
|
+
PKGD_DIR = 'pkgd'
|
|
26
|
+
|
|
27
|
+
#
|
|
28
|
+
#
|
|
29
|
+
#
# Streamer that packs each listed HDFS path into a .bz2 file on the HDFS
# under PKGD_DIR, then announces sha1 sums of the source and of the unpacked
# package so the two can be compared.
#
class Reducer < Wukong::Streamer::Base
  # Write +str+ to both $stderr and $stdout; blank messages are dropped.
  def announce str
    return if str.blank?
    $stderr.puts str
    $stdout.puts str
  end

  # Recursively remove +output_filename+ from the HDFS and announce the
  # command's combined stdout/stderr. Errors raised along the way are
  # swallowed (best effort).
  def remove_target_filename output_filename
    announce "rm\t#{"%-70s"%output_filename}\t" +
      `( hadoop dfs -rmr #{output_filename} ) 2>&1`
  rescue
    nil
  end

  # Create the HDFS directory that will hold +output_filename+ and announce
  # the command's combined stdout/stderr. Errors raised along the way are
  # swallowed (best effort).
  def mkdir_target_safely output_filename
    output_dir = File.dirname(output_filename)
    announce "mkdir\t#{"%-70s"%output_dir}\t" +
      `( hadoop dfs -mkdir #{output_dir} ) 2>&1`
  rescue
    nil
  end

  # Concatenate the HDFS files under +input_filename+ whose names do not
  # start with '_', bzip2 the stream and put it back on the HDFS as
  # +output_filename+. The pipeline's combined output is announced.
  def bzip_into_pkgd_file input_filename, output_filename
    announce "cat|bz\t#{"%-70s"%input_filename}\t" +
      `( hadoop dfs -cat #{input_filename}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{output_filename} ) 2>&1`
  end

  # Announce the sha1 sum of the unpacked package and of the original input.
  # The two sums are only printed; nothing here compares them.
  def verify input_filename, output_filename
    announce "sha1sum\t#{"%-70s"%output_filename}\t" +
      `( hadoop dfs -cat #{output_filename} | bzcat - | sha1sum ) 2>&1`
    announce "sha1sum\t#{"%-70s"%input_filename}\t" +
      `( hadoop dfs -cat #{input_filename}/[^_]\\* | sha1sum ) 2>&1`
  end

  # Package path for +input_filename+: PKGD_DIR, then the input path minus a
  # leading '/', then '.bz2'.
  def gen_output_filename input_filename
    "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(%r{^/},"")]
  end

  # Pack one input path and print its checksums. Removing an old target and
  # creating the target directory are currently disabled:
  #   remove_target_filename output_filename
  #   mkdir_target_safely    output_filename
  def process input_filename, output_filename
    bzip_into_pkgd_file input_filename, output_filename
    verify input_filename, output_filename
  end

  # Announce the hostname, then package every path read from $stdin.
  def stream
    announce `hostname`
    $stdin.each do |line|
      # handle ls or straight file list, either: the path is the last field
      input_filename = line.strip.split(/\s/).last
      # A blank line has no fields; skip it instead of failing on nil below.
      next if input_filename.nil?
      output_filename = gen_output_filename input_filename
      announce "********************************************************"
      announce "Packing\t#{"%-70s"%input_filename}\t#{output_filename}"
      process input_filename, output_filename
      announce "Done\t#{"%-70s"%input_filename}\t#{output_filename}\n\n"
    end
  end
end
|
|
85
|
+
|
|
86
|
+
# Script wrapper that raises the task timeout for this long-running job.
class Script < Wukong::Script
  # The inherited defaults with :timeout set to one day.
  def default_options
    one_day_in_ms = 24 * 60 * 60 * 1000 # milliseconds in one day
    super.merge :timeout => one_day_in_ms
  end
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
#
|
|
94
|
+
# Execute the script
|
|
95
|
+
#
|
|
96
|
+
ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000).run
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
|
|
2
|
+
-- ===========================================================================
|
|
3
|
+
--
|
|
4
|
+
-- Load Graph
|
|
5
|
+
--
|
|
6
|
+
-- Edge list: keep only the two user ids of each a_follows_b row
-- (presumably follower -> followed; confirm against the dataset).
AFollowsB   = LOAD 'twnew/all/a_follows_b'
              AS (rsrc: chararray, user_a_id: int, user_b_id: int) ;
FollEdges_0 = FOREACH AFollowsB GENERATE user_a_id AS src, user_b_id AS dest ;

-- One row per source node: (src, initial pagerank of 1.0, bag of dests).
InitPagerankFoll_0 = GROUP FollEdges_0 BY src ;
InitPagerankFoll_1 = FOREACH InitPagerankFoll_0 GENERATE group AS src, 1.0F AS pagerank:float, FollEdges_0.(dest) AS dests ;

-- Clear any previous output, then store the initial graph.
rmf twnew/pagerank-foll/pagerank_graph_000 ;
STORE InitPagerankFoll_1 INTO 'twnew/pagerank-foll/pagerank_graph_000';
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
-- MultiEdge = LOAD 'twnew/all/multi_edge' AS (
|
|
20
|
+
-- rsrc: chararray, src: int, dest: int,
|
|
21
|
+
-- fo: int, fr: int,
|
|
22
|
+
-- re_out: int, re_in: int,
|
|
23
|
+
-- at_out: int, at_in: int,
|
|
24
|
+
-- rt_out: int, rt_in: int,
|
|
25
|
+
-- fv_out: int, fv_in: int) ;
|
|
26
|
+
--
|
|
27
|
+
-- SymmEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, fr ;
|
|
28
|
+
-- SymmEdges_1 = FILTER SymmEdges_0 BY (fo >= 1.0) AND (fr >= 1.0) ;
|
|
29
|
+
-- SymmEdges = FOREACH SymmEdges_1 GENERATE src, dest ;
|
|
30
|
+
-- -- rm twnew/graphs/symm_edges; STORE SymmEdges INTO 'twnew/graphs/symm_edges' ;
|
|
31
|
+
-- SymmEdges = LOAD 'twnew/graphs/symm_edges' AS (src:int , dest:int);
|
|
32
|
+
--
|
|
33
|
+
-- AnyoutEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, re_out, fv_out ;
|
|
34
|
+
-- AnyoutEdges_1 = FILTER AnyoutEdges_0 BY (fo >= 1.0) OR (re_out >= 1.0) OR (fv_out >= 1.0) ;
|
|
35
|
+
-- AnyoutEdges = FOREACH AnyoutEdges_1 GENERATE src, dest ;
|
|
36
|
+
-- -- rm twnew/graphs/anyout_edges; STORE AnyoutEdges INTO 'twnew/graphs/anyout_edges' ;
|
|
37
|
+
-- AnyoutEdges = LOAD 'twnew/graphs/anyout_edges' AS (src:int , dest:int);
|
|
38
|
+
--
|
|
39
|
+
--
|
|
40
|
+
-- InitPagerankSymm_0 = GROUP SymmEdges BY src ;
|
|
41
|
+
-- InitPagerankSymm_1 = FOREACH InitPagerankSymm_0 GENERATE
|
|
42
|
+
-- group AS src,
|
|
43
|
+
-- 1.0F AS pagerank:float,
|
|
44
|
+
-- SymmEdges.(dest) AS dests
|
|
45
|
+
-- ;
|
|
46
|
+
-- rm twnew/pagerank-symm/pagerank_graph_000 ;
|
|
47
|
+
-- STORE InitPagerankSymm_1 INTO 'twnew/pagerank-symm/pagerank_graph_000';
|
|
48
|
+
--
|
|
49
|
+
--
|
|
50
|
+
-- InitPagerankAnyout_0 = GROUP AnyoutEdges BY src ;
|
|
51
|
+
-- InitPagerankAnyout_1 = FOREACH InitPagerankAnyout_0 GENERATE
|
|
52
|
+
-- group AS src,
|
|
53
|
+
-- 1.0F AS pagerank:float,
|
|
54
|
+
-- AnyoutEdges.(dest) AS dests
|
|
55
|
+
-- ;
|
|
56
|
+
-- rm twnew/pagerank-anyout/pagerank_graph_000 ;
|
|
57
|
+
-- STORE InitPagerankAnyout_1 INTO 'twnew/pagerank-anyout/pagerank_graph_000';
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
#
|
|
7
|
+
#
|
|
8
|
+
|
|
9
|
+
module PageRank
|
|
10
|
+
#
|
|
11
|
+
# Damping factor: prob. of following an outbound link (1 - d is the prob. of a 'random' jump)
|
|
12
|
+
# 0.85 works well in practice. See http://en.wikipedia.org/wiki/Pagerank
|
|
13
|
+
#
|
|
14
|
+
DAMPING_FACTOR = 0.85
|
|
15
|
+
|
|
16
|
+
#
|
|
17
|
+
# Each user's line looks like
|
|
18
|
+
#
|
|
19
|
+
# user_a pagerank id1,id2,...,idN
|
|
20
|
+
#
|
|
21
|
+
# we need to disperse this user's pagerank to each of id1..idN, and
|
|
22
|
+
# rendezvous the list of outbound links at user_a's reducer as well.
|
|
23
|
+
#
|
|
24
|
+
module Iterating
|
|
25
|
+
# Map phase of one pagerank iteration.
class Mapper < Wukong::Streamer::Base
  #
  # For one node: hand a share of its pagerank to every destination, and
  # send its destination list back to itself.
  #
  def process src, pagerank, dests_str, &block
    # Strip bag/tuple punctuation so Pig-generated input, e.g. {(1),(2)},
    # reads as a plain comma-separated list.
    flat_dests = dests_str.delete('(){}')
    yield_pagerank_shares src, pagerank, flat_dests.split(","), &block
    yield_own_dest_list   src, flat_dests, &block
  end

  #
  # Split the source node's pagerank evenly over its out-nodes, emitting a
  # [dest, 'p', share] record for each.
  #
  def yield_pagerank_shares src, pagerank, dests
    share = pagerank.to_f / dests.length
    dests.each { |dest| yield [dest, 'p', share] }
  end

  #
  # Emit [src, 'd', dests_str] so the out-node list is keyed by its own node.
  #
  def yield_own_dest_list src, dests_str
    yield [src, 'd', dests_str]
  end
end
|
|
54
|
+
|
|
55
|
+
# Reduce phase of one pagerank iteration: sums the shares sent to a node and
# re-attaches that node's destination list.
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :node_id, :pagerank, :dests_str

  # Reset state for a new node: nothing accumulated, no dests seen yet.
  def start! node_id, *args
    self.pagerank  = 0.0
    self.node_id   = node_id
    self.dests_str = nil
  end

  # 'p' records carry a pagerank share from an incoming edge; the 'd' record
  # carries this node's own destination list. Anything else is an error.
  def accumulate node_id, what, val
    if    'p' == what then self.pagerank += val.to_f
    elsif 'd' == what then self.dests_str = val
    else  raise "Don't know how to accumulate #{[node_id, what, val].inspect}"
    end
  end

  # Emit [node_id, damped pagerank, dests] -- the same three fields the
  # mapper's #process takes, so the output can feed the next iteration.
  def finalize
    damped_pagerank = (self.pagerank * DAMPING_FACTOR) + (1 - DAMPING_FACTOR)
    # a node with no outbound list gets a placeholder destination
    self.dests_str = 'dummy' if self.dests_str.blank?
    yield [node_id, damped_pagerank, dests_str]
  end
end
|
|
80
|
+
|
|
81
|
+
# Script wrapper adding an extra -jobconf argument to the inherited defaults.
class Script < Wukong::Script
  # The inherited defaults plus the io.sort.record.percent override.
  def default_options
    jobconf_args = ' -jobconf io.sort.record.percent=0.25 '
    super.merge :extra_args => jobconf_args
  end
end
|
|
86
|
+
Script.new(Mapper, Reducer).run
|
|
87
|
+
end
|
|
88
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
require 'wukong/streamer/set_reducer'
|
|
5
|
+
|
|
6
|
+
module PageRank
|
|
7
|
+
# Script for building the initial pagerank graph from an edge listing.
class Script < Wukong::Script
  #
  # Input format is
  #
  #   rsrc    src_id  dest_id [... junk ...]
  #
  # Only the src and dest IDs (tab-separated fields 2 and 3) are wanted, so
  # the map step is a plain `cut`.
  #
  def map_command
    "/usr/bin/cut -d\"\t\" -f2,3"
  end

  # The inherited defaults plus the io.sort.record.percent override.
  def default_options
    jobconf_args = ' -jobconf io.sort.record.percent=0.25 '
    super.merge :extra_args => jobconf_args
  end
end
|
|
23
|
+
|
|
24
|
+
#
|
|
25
|
+
# Accumulate the dests list in memory, dump as a whole. Multiple edges between
|
|
26
|
+
# any two nodes are permitted, and will accumulate pagerank according to the
|
|
27
|
+
# edge's multiplicity.
|
|
28
|
+
#
|
|
29
|
+
# Collects every dest seen for a src (repeats included) and emits them as
# one record.
class Reducer < Wukong::Streamer::ListReducer
  # Append this edge's destination to the running list.
  def accumulate src, dest
    values.push dest
  end

  # Emit src, initial pagerank (1.0), and the comma-joined dests list; an
  # empty list is replaced by the 'dummy' placeholder.
  def finalize
    self.values = ['dummy'] if values.blank?
    yield [key, 1.0, values.to_a.join(",")]
  end
end
|
|
40
|
+
|
|
41
|
+
# Execute the script
|
|
42
|
+
Script.new(nil, PageRank::Reducer).run
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
|
|
3
|
+
# Directory to pagerank on.
|
|
4
|
+
work_dir=$1 ; shift
if [ -z "$work_dir" ] ; then
  # Usage errors go to stderr with a non-zero status so callers can detect them.
  echo "Please specify the parent of the directory made by gen_initial_pagerank" >&2
  exit 1
fi


# How many rounds to run
max_iter=10
# this directory
script_dir="$(dirname "$0")"

# Round N reads pagerank_graph_<N> and writes pagerank_graph_<N+1>
# (three-digit, zero-padded).
for (( curr=0 , next=1 ; curr < max_iter ; curr++ , next++ )) ; do
  curr_str=$(printf "%03d" "${curr}")
  next_str=$(printf "%03d" "${next}")
  curr_dir="$work_dir/pagerank_graph_${curr_str}"
  next_dir="$work_dir/pagerank_graph_${next_str}"
  # Stop at the first failed round: later rounds would have no input to read.
  "$script_dir/pagerank.rb" --rm --run "$curr_dir" "$next_dir" || exit $?
done
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
require 'wukong/streamer/rank_and_bin_reducer'
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# This example uses the classes from http://github.com/mrflip/twitter_friends
|
|
8
|
+
# (That's sloppy, and I apologize. I'm building this script for that, but it
|
|
9
|
+
# seems broadly useful and I'm not maintaining two copies. Once this script is
|
|
10
|
+
# more worky we'll make it standalone. Anyway you should get the picture.)
|
|
11
|
+
#
|
|
12
|
+
$: << File.dirname(__FILE__)+'/../../projects/twitter_friends/lib'
|
|
13
|
+
require 'twitter_friends';
|
|
14
|
+
require 'twitter_friends/struct_model' ; include TwitterFriends::StructModel
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
#
|
|
18
|
+
# attrs to bin
|
|
19
|
+
#
|
|
20
|
+
# resource name => list of [attribute, abbreviation] pairs to bin
BINNABLE_ATTRS = {
  :twitter_user => [
    [:followers_count,  :fo   ],
    [:friends_count,    :fr   ],
    [:statuses_count,   :st   ],
    [:favourites_count, :fv   ],
    [:created_at,       :crat ]
  ]

}
# resource name => abbreviation emitted in place of the full name
RESOURCE_ALIASES = {
  :twitter_user => :u,
  :user_metrics => :um,
}
#
# KLUDGE This is not DRY at all but let's get it working first
#
# Struct with an :id plus one Integer member per :twitter_user abbreviation.
# It is built from BINNABLE_ATTRS[:twitter_user] because that is the only
# key BINNABLE_ATTRS has; looking up :user_metrics returned nil and made the
# script fail while loading.
BinTwitterUser = TypedStruct.new(
  [:id, Integer],
  *BINNABLE_ATTRS[:twitter_user].map{|attr, attr_abbr| [attr_abbr, Integer] }
  )
# Former constant name, kept so any existing reference still resolves.
BinUserMetrics = BinTwitterUser
# resource abbreviation => struct class used to reassemble binned records
BINNED_RESOURCE_ALIASES = {
  :u => BinTwitterUser,
}
|
|
44
|
+
|
|
45
|
+
module RankAndBinAttrs
|
|
46
|
+
# Breaks each object into one record per binnable attribute.
class ExplodeResourceMapper < Wukong::Streamer::StructStreamer
  # Read +attr+ from +thing+ and render it as a zero-padded, fixed-width
  # string chosen by the attribute's declared type. Raises (with the value
  # and type in the message) for any type other than Integer, Float, Bignum.
  def get_and_format_attr thing, attr
    val       = thing.send(attr)
    type_name = thing.members_types[attr].to_s.to_sym
    if    :Integer == type_name then "%010d"   % val.to_i
    elsif :Float   == type_name then "%020.7f" % val.to_f
    elsif :Bignum  == type_name then "%020d"   % val.to_i
    else
      raise [val, type_name].inspect
    end
  end

  #
  # The data expansion of this mapper is large enough that it pays to be
  # frugal about what is emitted, so records use the short names from
  # RESOURCE_ALIASES and BINNABLE_ATTRS:
  #
  #   [resource_abbr, attr_abbr, formatted_value, id]
  #
  # Objects whose resource has no BINNABLE_ATTRS entry emit nothing.
  #
  def process thing, *args, &block
    resource   = thing.class.resource_name
    attr_abbrs = BINNABLE_ATTRS[resource]
    return unless attr_abbrs
    attr_abbrs.each do |attr, abbr|
      yield [RESOURCE_ALIASES[resource], abbr, get_and_format_attr(thing, attr), thing.id.to_i]
    end
  end
end
|
|
77
|
+
|
|
78
|
+
# Ranks and bins attribute values, one [resource, attribute] group at a time.
class BinAttrReducer < Wukong::Streamer::RankAndBinReducer
  attr_accessor :last_rsrc_attr

  #
  # The key is the value itself. Several different resources may arrive at
  # the same reducer, so whenever the [rsrc, attr] pair changes the order
  # parameters' state is reset. (Each partition has the same cardinality, so
  # bin_size etc. need no adjustment.)
  #
  def get_key rsrc, attr, val, *args
    current = [rsrc, attr]
    unless current == self.last_rsrc_attr
      reset_order_params!
      self.last_rsrc_attr = current
    end
    val
  end

  #
  # Note well -- the field order is rearranged to
  #
  #   resource_abbr    id      attr_abbr       bin
  #
  # for proper sorting to the re-assembler; value, numbering and rank are
  # dropped.
  #
  def emit record
    rsrc, attr, _val, id, _numbering, _rank, bin = record
    super([rsrc, id, attr, bin])
  end
end
|
|
106
|
+
|
|
107
|
+
# Puts the binned attributes of one [resource, id] back into a single object.
class ReassembleObjectReducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :thing

  # Struct class registered for a resource abbreviation (nil if none).
  def klass_from_abbr rsrc_abbr
    BINNED_RESOURCE_ALIASES[rsrc_abbr.to_sym]
  end

  # Records group on resource abbreviation plus integer id.
  def get_key rsrc_abbr, id, *args
    [rsrc_abbr, id.to_i]
  end

  # Start a fresh object of the resource's struct class, seeded with the id.
  def start! rsrc_abbr, id, *args
    self.thing = klass_from_abbr(rsrc_abbr).new(id.to_i)
  end

  # Store +bin+ in the member named by the attribute abbreviation.
  def accumulate rsrc, id, attr, bin
    setter = "#{attr}="
    thing.send(setter, bin)
  end

  # Emit the reassembled object.
  def finalize
    yield thing
  end
end
|
|
129
|
+
|
|
130
|
+
#
|
|
131
|
+
# Two-phase script
|
|
132
|
+
#
|
|
133
|
+
# FIXME -- We need a runner class to manage this.
|
|
134
|
+
#
|
|
135
|
+
#
# Picks its mapper and reducer from a --phase= command-line flag.
#
class Script < Wukong::Script
  attr_accessor :phase

  # KLUDGE !!
  # Scan ARGV for --phase=N and configure that phase before calling the
  # inherited constructor. Raises for phase 2 and when no phase is given.
  def initialize
    asked_for = lambda { |flag| ARGV.any? { |arg| arg =~ flag } }
    if asked_for.call(/--phase=1/)
      # Phase 1 -- Steal underpants. Also, disassemble each object, and find
      # the bin for each binnable attribute's value
      self.phase = 1
      self.mapper_klass  = ExplodeResourceMapper
      self.reducer_klass = BinAttrReducer
    elsif asked_for.call(/--phase=2/)
      # Phase 2 -- ????
      raise "Phase 2 : ????"
    elsif asked_for.call(/--phase=3/)
      # Phase 3 -- profit. In this case, put records back together.
      self.phase = 3
      self.mapper_klass  = nil
      self.reducer_klass = ReassembleObjectReducer
    else
      raise "Please run me with a --phase= option"
    end
    super mapper_klass, reducer_klass
  end

  # The inherited defaults plus the sort/partition settings of the phase.
  def default_options
    current_phase = self.phase
    extra_options =
      if 1 == current_phase
        # partition on [rsrc, attr]; sort on [rsrc, attr, val]
        { :sort_fields => 3, :partition_fields => 2 }
      elsif 3 == current_phase
        # sort on [rsrc, id]
        { :sort_fields => 2 }
      else
        { }
      end
    super.merge extra_options
  end
end
|
|
170
|
+
|
|
171
|
+
# execute script
|
|
172
|
+
Script.new.run
|
|
173
|
+
end
|