wukong 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+
4
+ require 'wukong'
5
+
6
+ #
7
+ # This is so very very kludgey
8
+ #
9
+ # Input is an 'ls' file, listing files to .bz2 package.
10
+ #
11
+ # Reducer takes each in turn and creates, within a parallel directory tree under
12
+ # PKGD_DIR on the local filesystem, a .bz2 compressed version of the file.
13
+ #
14
+ # So, the file
15
+ # /user/me/fixd/all-20090103
16
+ # is packaged on the local filesystem as
17
+ # /workspace/flip/pkgd/user/me/fixd/all-20090103.tsv.bz2
18
+ #
19
+ # listing=tmp/fixd-all-package-listing
20
+ # hdp-rm $listing
21
+ # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
22
+ #
23
+ # ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
24
+ #
25
module ExportPackager
  # Shellwords.escape keeps filenames from being interpreted by the shell
  # when they are interpolated into the backtick commands below.
  require 'shellwords'

  # Local directory under which the compressed copies are written.
  PKGD_DIR = '/workspace/flip/pkgd'

  #
  # For each input path: bzip2 its HDFS contents into a file under PKGD_DIR on
  # the local filesystem, then rsync that file to a remote host.
  #
  class Reducer < Wukong::Streamer::Base
    # Write the given message(s) to both stdout and stderr.
    def announce *args
      $stdout.puts(*args)
      $stderr.puts(*args)
    end

    # Remove a pre-existing local target so it can be regenerated.
    #
    # Always returns true, so the caller proceeds to (re)package the file.
    def handle_existing_target output_filename
      return true unless File.exist?(output_filename)
      # announce "Exists! #{output_filename}"
      # return false
      announce "Removing target file #{output_filename}"
      begin
        announce `rm #{Shellwords.escape(output_filename)}`
      rescue StandardError => e
        # Best effort: report and carry on. Rescuing StandardError (rather
        # than Exception) lets signals and exit requests stop the task.
        announce e
      end
      true
    end

    # Create the directory that will hold +output_filename+ (mkdir -p).
    # Failures are announced, not raised.
    def mkdir_target_safely output_filename
      output_dir = File.dirname(output_filename)
      announce "Ensuring directory #{output_dir} exists"
      begin
        announce `mkdir -p #{Shellwords.escape(output_dir)}`
      rescue StandardError => e
        announce e
      end
    end

    # Concatenate every entry of the HDFS path +input_filename+ whose name
    # does not start with '_' and bzip2 the result into the local file
    # +output_filename+. Only the filenames are escaped; the glob suffix is
    # left for the shell / hadoop to expand.
    def bzip_into_pkgd_file input_filename, output_filename
      announce "bzip'ing into #{output_filename}"
      source = Shellwords.escape(input_filename)
      target = Shellwords.escape(output_filename)
      announce `( hadoop dfs -cat #{source}/[^_]\** ) | bzip2 -c > #{target}`
    end

    # Map an input path to its packaged location: PKGD_DIR, then the input
    # path without its leading '/', then '.bz2'. '.tsv' is appended first
    # when the path contains no '.' followed by two or more word characters.
    def gen_output_filename input_filename
      input_filename += '.tsv' unless input_filename =~ /.*\.\w{2,}/
      "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(/^\//, '')]
    end

    # rsync +local_path+ to +host+; the remote path defaults to the local one.
    # Sleeps five seconds afterwards.
    def rsync host, local_path, remote_path=nil
      remote_path ||= local_path
      remote = "#{Shellwords.escape(host.to_s)}:#{Shellwords.escape(remote_path)}"
      announce `/usr/bin/rsync -Cuvrtlp #{Shellwords.escape(local_path)} #{remote}`
      sleep 5
    end

    # Package one input path: clear any old target, make sure its directory
    # exists, compress the contents, then push the result to the remote host.
    def process input_filename
      output_filename = gen_output_filename(input_filename)
      return unless handle_existing_target(output_filename)
      mkdir_target_safely output_filename
      bzip_into_pkgd_file input_filename, output_filename
      rsync :lab3, output_filename
    end

    # Take the last whitespace-separated field of the line, so that either an
    # 'ls' listing or a bare list of filenames is accepted.
    def recordize line
      line.split(/\s/).last
    end

    # Run the normal stream, then rsync the whole PKGD_DIR tree once more.
    def stream
      super
      rsync :lab3, PKGD_DIR+'/'
    end
  end

  class Script < Wukong::Script
    # Defaults: a single map task, one reducer per node, 40 minute timeout.
    def default_options
      super.merge :map_tasks => 1,
        :max_node_reduce_tasks => 1, # only one reducer per local filesystem
        :timeout => 40 * 60 * 1000   # timeout in ms
    end
  end

  # Execute the script
  Script.new(nil, Reducer).run
end
99
+
100
+
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env ruby
2
+ $: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
3
+ require 'wukong'
4
+
5
+ #
6
+ # This is so very very kludgey
7
+ #
8
+ # Input is an 'ls' file, listing files to .bz2 package.
9
+ #
10
+ # Reducer takes each in turn and creates, within a parallel directory tree under
11
+ # ~/pkgd on the HDFS, a .bz2 compressed version of the file.
12
+ #
13
+ # So, the file
14
+ # /user/me/fixd/all-20090103
15
+ # is packaged onto the DFS as
16
+ # /user/me/pkgd/user/me/fixd/all-20090103
17
+ #
18
+ # listing=tmp/fixd-all-package-listing
19
+ # hdp-rm $listing
20
+ # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
21
+ #
22
+ # ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
23
+ #
24
module ExportPackager
  # Shellwords.escape keeps filenames from being interpreted by the shell
  # when they are interpolated into the backtick commands below.
  require 'shellwords'

  # HDFS directory (relative path) under which packaged files are written.
  PKGD_DIR = 'pkgd'

  #
  # For each path read from stdin: cat its HDFS contents through bzip2 back
  # onto the HDFS under PKGD_DIR, then print sha1sums of both sides.
  #
  class Reducer < Wukong::Streamer::Base
    # Write +str+ to both stderr and stdout; blank messages are dropped.
    def announce str
      return if str.blank?
      $stderr.puts str
      $stdout.puts str
    end

    # Recursively remove +output_filename+ from the HDFS, announcing the
    # command output. Errors are swallowed (returns nil).
    def remove_target_filename output_filename
      announce "rm\t#{"%-70s"%output_filename}\t" +
        `( hadoop dfs -rmr #{Shellwords.escape(output_filename)} ) 2>&1`
    rescue StandardError
      nil
    end

    # Create the HDFS directory that will hold +output_filename+, announcing
    # the command output. Errors are swallowed (returns nil).
    def mkdir_target_safely output_filename
      output_dir = File.dirname(output_filename)
      announce "mkdir\t#{"%-70s"%output_dir}\t" +
        `( hadoop dfs -mkdir #{Shellwords.escape(output_dir)} ) 2>&1`
    rescue StandardError
      nil
    end

    # Concatenate every entry of +input_filename+ whose name does not start
    # with '_', bzip2 the stream and put it at +output_filename+. Only the
    # filenames are escaped; the '[^_]\*' glob is passed through as before.
    def bzip_into_pkgd_file input_filename, output_filename
      source = Shellwords.escape(input_filename)
      target = Shellwords.escape(output_filename)
      announce "cat|bz\t#{"%-70s"%input_filename}\t" +
        `( hadoop dfs -cat #{source}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{target} ) 2>&1`
    end

    # Announce the sha1sum of the decompressed package and of the original
    # input, so the two can be compared in the job output.
    def verify input_filename, output_filename
      source = Shellwords.escape(input_filename)
      target = Shellwords.escape(output_filename)
      announce "sha1sum\t#{"%-70s"%output_filename}\t" +
        `( hadoop dfs -cat #{target} | bzcat - | sha1sum ) 2>&1`
      announce "sha1sum\t#{"%-70s"%input_filename}\t" +
        `( hadoop dfs -cat #{source}/[^_]\\* | sha1sum ) 2>&1`
    end

    # PKGD_DIR, then the input path without its leading '/', then '.bz2'.
    def gen_output_filename input_filename
      "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(%r{^/},"")]
    end

    # Package one file and verify the result.
    def process input_filename, output_filename
      # remove_target_filename output_filename
      # mkdir_target_safely    output_filename
      bzip_into_pkgd_file input_filename, output_filename
      verify              input_filename, output_filename
    end

    # Announce the hostname, then package every path listed on stdin.
    def stream
      announce `hostname`
      $stdin.each do |line|
        # handle ls or straight file list, either
        input_filename = line.chomp.strip.split(/\s/).last
        # A blank line carries no filename; skip it rather than crash in
        # gen_output_filename.
        next if input_filename.nil? || input_filename.empty?
        output_filename = gen_output_filename input_filename
        announce "********************************************************"
        announce "Packing\t#{"%-70s"%input_filename}\t#{output_filename}"
        process input_filename, output_filename
        announce "Done\t#{"%-70s"%input_filename}\t#{output_filename}\n\n"
      end
    end
  end

  class Script < Wukong::Script
    # Default to a one-day task timeout.
    def default_options
      super.merge :timeout => (24 * 60 * 60 * 1000) # milliseconds in one day
    end
  end
end

#
# Execute the script
#
ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000).run
@@ -0,0 +1,6 @@
1
+
2
+ Calculate pagerank for a tab-separated adjacency list.
3
+
4
+ See
5
+ * Heritrix pagerank util
6
+ * http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/pagerank.html
@@ -0,0 +1,57 @@
1
+
2
+ -- ===========================================================================
3
+ --
4
+ -- Load Graph
5
+ --
6
-- Read the raw "a follows b" records and keep only the two user ids, as (src, dest).
FollowsRaw  = LOAD 'twnew/all/a_follows_b' AS (rsrc: chararray, user_a_id: int, user_b_id: int) ;
FollowEdges = FOREACH FollowsRaw GENERATE user_a_id AS src, user_b_id AS dest ;

-- One row per source user: its id, an initial pagerank of 1.0, and the bag of
-- destination ids it links to.
EdgesBySrc   = GROUP FollowEdges BY src ;
InitialGraph = FOREACH EdgesBySrc GENERATE group AS src, 1.0F AS pagerank:float, FollowEdges.(dest) AS dests ;

-- Clear any previous output, then store the iteration-000 graph.
rmf twnew/pagerank-foll/pagerank_graph_000 ;
STORE InitialGraph INTO 'twnew/pagerank-foll/pagerank_graph_000';
17
+
18
+
19
+ -- MultiEdge = LOAD 'twnew/all/multi_edge' AS (
20
+ -- rsrc: chararray, src: int, dest: int,
21
+ -- fo: int, fr: int,
22
+ -- re_out: int, re_in: int,
23
+ -- at_out: int, at_in: int,
24
+ -- rt_out: int, rt_in: int,
25
+ -- fv_out: int, fv_in: int) ;
26
+ --
27
+ -- SymmEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, fr ;
28
+ -- SymmEdges_1 = FILTER SymmEdges_0 BY (fo >= 1.0) AND (fr >= 1.0) ;
29
+ -- SymmEdges = FOREACH SymmEdges_1 GENERATE src, dest ;
30
+ -- -- rm twnew/graphs/symm_edges; STORE SymmEdges INTO 'twnew/graphs/symm_edges' ;
31
+ -- SymmEdges = LOAD 'twnew/graphs/symm_edges' AS (src:int , dest:int);
32
+ --
33
+ -- AnyoutEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, re_out, fv_out ;
34
+ -- AnyoutEdges_1 = FILTER AnyoutEdges_0 BY (fo >= 1.0) OR (re_out >= 1.0) OR (fv_out >= 1.0) ;
35
+ -- AnyoutEdges = FOREACH AnyoutEdges_1 GENERATE src, dest ;
36
+ -- -- rm twnew/graphs/anyout_edges; STORE AnyoutEdges INTO 'twnew/graphs/anyout_edges' ;
37
+ -- AnyoutEdges = LOAD 'twnew/graphs/anyout_edges' AS (src:int , dest:int);
38
+ --
39
+ --
40
+ -- InitPagerankSymm_0 = GROUP SymmEdges BY src ;
41
+ -- InitPagerankSymm_1 = FOREACH InitPagerankSymm_0 GENERATE
42
+ -- group AS src,
43
+ -- 1.0F AS pagerank:float,
44
+ -- SymmEdges.(dest) AS dests
45
+ -- ;
46
+ -- rm twnew/pagerank-symm/pagerank_graph_000 ;
47
+ -- STORE InitPagerankSymm_1 INTO 'twnew/pagerank-symm/pagerank_graph_000';
48
+ --
49
+ --
50
+ -- InitPagerankAnyout_0 = GROUP AnyoutEdges BY src ;
51
+ -- InitPagerankAnyout_1 = FOREACH InitPagerankAnyout_0 GENERATE
52
+ -- group AS src,
53
+ -- 1.0F AS pagerank:float,
54
+ -- AnyoutEdges.(dest) AS dests
55
+ -- ;
56
+ -- rm twnew/pagerank-anyout/pagerank_graph_000 ;
57
+ -- STORE InitPagerankAnyout_1 INTO 'twnew/pagerank-anyout/pagerank_graph_000';
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+
5
+ #
6
+ #
7
+ #
8
+
9
module PageRank
  #
  # Weight applied to the pagerank a node accumulates from its in-links; the
  # remaining (1 - DAMPING_FACTOR) is added to every node as a constant term
  # (see Reducer#finalize). 0.85 works well in practice.
  # See http://en.wikipedia.org/wiki/Pagerank
  #
  DAMPING_FACTOR = 0.85

  #
  # One pagerank iteration. Each input line looks like
  #
  #   user_a pagerank id1,id2,...,idN
  #
  # The mapper spreads user_a's pagerank over id1..idN and also sends the
  # destination list back to user_a, so the reducer can emit a line in the
  # same shape for the next iteration.
  #
  module Iterating
    class Mapper < Wukong::Streamer::Base
      #
      # Emit one pagerank share per destination, then the node's own
      # destination list.
      #
      def process(src, pagerank, dests_str, &block)
        # Strip the parens / braces of a Pig-generated bag so Pig output can
        # be used directly as input.
        cleaned = dests_str.gsub(/[\(\{\}\)]/, '')
        targets = cleaned.split(",")
        yield_pagerank_shares(src, pagerank, targets, &block)
        yield_own_dest_list(src, cleaned, &block)
      end

      #
      # Split the source node's pagerank evenly over its destinations and
      # yield [dest, 'p', share] for each one.
      #
      def yield_pagerank_shares(src, pagerank, dests)
        share = pagerank.to_f / dests.length
        dests.each { |target| yield [target, 'p', share] }
      end

      #
      # Yield [src, 'd', dests_str] so the destination list meets the node's
      # incoming pagerank shares under the same key.
      #
      def yield_own_dest_list(src, dests_str)
        yield [src, 'd', dests_str]
      end
    end

    class Reducer < Wukong::Streamer::AccumulatingReducer
      attr_accessor :node_id, :pagerank, :dests_str

      # Reset state for a new node: zero pagerank, no destination list yet.
      def start!(node_id, *args)
        self.node_id   = node_id
        self.pagerank  = 0.0
        self.dests_str = nil
      end

      # 'p' records add a fractional pagerank; a 'd' record carries the
      # node's destination list. Anything else is an error.
      def accumulate(node_id, what, val)
        if what == 'p'
          self.pagerank += val.to_f
        elsif what == 'd'
          self.dests_str = val
        else
          raise "Don't know how to accumulate #{[node_id, what, val].inspect}"
        end
      end

      # Yield [node_id, damped pagerank, dests] in the same layout the mapper
      # reads. A node with no destination list gets the placeholder 'dummy'.
      def finalize
        damped_pagerank = (self.pagerank * DAMPING_FACTOR) + (1 - DAMPING_FACTOR)
        if self.dests_str.blank?
          self.dests_str = 'dummy'
        end
        yield [node_id, damped_pagerank, dests_str]
      end
    end

    class Script < Wukong::Script
      # Add the io.sort.record.percent job setting to the default options.
      def default_options
        super.merge(:extra_args => ' -jobconf io.sort.record.percent=0.25 ')
      end
    end

    Script.new(Mapper, Reducer).run
  end
end
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+ require 'wukong/streamer/set_reducer'
5
+
6
module PageRank
  class Script < Wukong::Script
    #
    # The input lines look like
    #
    #   rsrc src_id dest_id [... junk ...]
    #
    # Only the src and dest ids (tab-separated fields 2 and 3) are needed, so
    # the map step is just a `cut`.
    #
    def map_command
      %Q{/usr/bin/cut -d"\t" -f2,3}
    end

    # Add the io.sort.record.percent job setting to the default options.
    def default_options
      super.merge(:extra_args => ' -jobconf io.sort.record.percent=0.25 ')
    end
  end

  #
  # Collects each source's destinations in memory and emits them as a single
  # record. Repeated edges between the same two nodes are kept, so they count
  # according to their multiplicity.
  #
  class Reducer < Wukong::Streamer::ListReducer
    # Remember one more destination for the current source.
    def accumulate(src, dest)
      self.values << dest
    end

    # Yield [src, initial pagerank of 1.0, comma-joined destinations]; an
    # empty destination list is replaced by the placeholder 'dummy'.
    def finalize
      if self.values.blank?
        self.values = ['dummy']
      end
      flattened_dests = self.values.to_a.join(",")
      yield [key, 1.0, flattened_dests]
    end
  end

  # Execute the script
  Script.new(nil, PageRank::Reducer).run
end
44
+
45
+
46
+
@@ -0,0 +1,19 @@
1
#!/usr/bin/env bash
#
# Run max_iter rounds of pagerank.rb, feeding each round's output directory
# to the next: pagerank_graph_000 -> pagerank_graph_001 -> ...
#
# Usage: run_pagerank.sh WORK_DIR

# Directory to pagerank on.
work_dir=${1:-}
if [ -z "$work_dir" ] ; then
  # Usage errors go to stderr and exit non-zero so callers can detect them.
  echo "Please specify the parent of the directory made by gen_initial_pagerank" >&2
  exit 1
fi
shift

# How many rounds to run
max_iter=10
# this directory (quoted so paths containing spaces survive)
script_dir="$(dirname "$0")"

for (( curr=0, next=1 ; curr < max_iter ; curr++, next++ )) ; do
  curr_dir=$(printf '%s/pagerank_graph_%03d' "$work_dir" "$curr")
  next_dir=$(printf '%s/pagerank_graph_%03d' "$work_dir" "$next")
  "$script_dir/pagerank.rb" --rm --run "$curr_dir" "$next_dir"
done
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+ require 'wukong/streamer/rank_and_bin_reducer'
5
+
6
+ #
7
+ # This example uses the classes from http://github.com/mrflip/twitter_friends
8
+ # (That's sloppy, and I apologize. I'm building this script for that, but it
9
+ # seems broadly useful and I'm not maintaining two copies. Once this script is
10
+ # more worky we'll make it standalone. Anyway you should get the picture.)
11
+ #
12
+ $: << File.dirname(__FILE__)+'/../../projects/twitter_friends/lib'
13
+ require 'twitter_friends';
14
+ require 'twitter_friends/struct_model' ; include TwitterFriends::StructModel
15
+
16
+
17
+ #
18
+ # attrs to bin
19
+ #
20
#
# attrs to bin: for each resource, pairs of [attribute name, abbreviation]
#
BINNABLE_ATTRS = {
  :twitter_user => [
    [:followers_count,  :fo   ],
    [:friends_count,    :fr   ],
    [:statuses_count,   :st   ],
    [:favourites_count, :fv   ],
    [:created_at,       :crat ]
  ]
}

# Short names emitted in place of the full resource names.
RESOURCE_ALIASES = {
  :twitter_user => :u,
  :user_metrics => :um,
}

#
# KLUDGE This is not DRY at all but let's get it working first
#
# Struct holding an id plus one Integer bin per binnable attribute, keyed by
# the attribute's abbreviation. It is built from the :twitter_user entry,
# which is the only key BINNABLE_ATTRS defines; looking up :user_metrics
# returned nil and raised NoMethodError as soon as the file was loaded.
#
BinUserMetrics = TypedStruct.new(
  [:id, Integer],
  *BINNABLE_ATTRS[:twitter_user].map{|attr, attr_abbr| [attr_abbr, Integer] }
  )

# BINNED_RESOURCE_ALIASES refers to BinTwitterUser; point that name at the
# struct above unless something loaded earlier already defines it.
# NOTE(review): confirm twitter_friends does not supply its own BinTwitterUser.
BinTwitterUser = BinUserMetrics unless defined?(BinTwitterUser)

# Resource abbreviation => class used to reassemble the binned records.
BINNED_RESOURCE_ALIASES = {
  :u => BinTwitterUser,
}
44
+
45
module RankAndBinAttrs
  #
  # Phase-1 mapper: emits one record per binnable attribute of each incoming
  # object. The expansion is large, so records use the short names from
  # RESOURCE_ALIASES and BINNABLE_ATTRS.
  #
  class ExplodeResourceMapper < Wukong::Streamer::StructStreamer
    # Read +attr+ from +thing+ and render it as a zero-padded, fixed-width
    # string chosen by the attribute's declared type. Raises, with the value
    # and type name as the message, for any type other than Integer, Float
    # or Bignum.
    def get_and_format_attr(thing, attr)
      val       = thing.send(attr)
      type_name = thing.members_types[attr].to_s.to_sym
      if type_name == :Integer
        "%010d" % val.to_i
      elsif type_name == :Float
        "%020.7f" % val.to_f
      elsif type_name == :Bignum
        "%020d" % val.to_i
      else
        raise [val, type_name].inspect
      end
    end

    #
    # Yield [resource alias, attribute abbreviation, formatted value, id] for
    # every binnable attribute of +thing+. Objects whose resource has no
    # entry in BINNABLE_ATTRS produce nothing.
    #
    def process(thing, *args, &block)
      resource = thing.class.resource_name
      pairs    = BINNABLE_ATTRS[resource]
      return if pairs.nil? || pairs == false
      pairs.each do |attr, abbr|
        yield [RESOURCE_ALIASES[resource], abbr, get_and_format_attr(thing, attr), thing.id.to_i]
      end
    end
  end

  class BinAttrReducer < Wukong::Streamer::RankAndBinReducer
    attr_accessor :last_rsrc_attr

    #
    # The key is the attribute value. Several [resource, attribute] groups
    # may arrive at the same reducer; whenever the pair changes, the ordering
    # state is reset. Since each partition has the same cardinality there is
    # no need to adjust the bin size.
    #
    def get_key(rsrc, attr, val, *args)
      current = [rsrc, attr]
      unless current == self.last_rsrc_attr
        reset_order_params!
        self.last_rsrc_attr = current
      end
      val
    end

    #
    # Note well -- the fields are rearranged on output to
    #
    #   resource_abbr id attr_abbr bin
    #
    # so that records sort properly for the re-assembler.
    #
    def emit(record)
      rsrc, attr, _val, id, _numbering, _rank, bin = record
      super([rsrc, id, attr, bin])
    end
  end

  #
  # Phase-3 reducer: rebuilds one object per [resource, id] from its
  # per-attribute bin records.
  #
  class ReassembleObjectReducer < Wukong::Streamer::AccumulatingReducer
    attr_accessor :thing

    # Class registered for the given resource abbreviation.
    def klass_from_abbr(rsrc_abbr)
      BINNED_RESOURCE_ALIASES[rsrc_abbr.to_sym]
    end

    # Records are grouped on [resource abbreviation, integer id].
    def get_key(rsrc_abbr, id, *args)
      [rsrc_abbr, id.to_i]
    end

    # Begin a new object of the class for this resource, seeded with its id.
    def start!(rsrc_abbr, id, *args)
      self.thing = klass_from_abbr(rsrc_abbr).new(id.to_i)
    end

    # Store one attribute's bin on the object being rebuilt.
    def accumulate(rsrc, id, attr, bin)
      thing.send("#{attr}=", bin)
    end

    # Yield the reassembled object.
    def finalize
      yield thing
    end
  end

  #
  # Two-phase script
  #
  # FIXME -- We need a runner class to manage this.
  #
  class Script < Wukong::Script
    attr_accessor :phase

    # KLUDGE !! -- the phase is picked by scanning ARGV for a --phase= flag.
    def initialize
      if ARGV.any? { |arg| arg =~ /--phase=1/ }
        # Phase 1 -- Steal underpants. Also, disassemble each object, and find
        # the bin for each binnable attribute's value
        self.phase         = 1
        self.mapper_klass  = ExplodeResourceMapper
        self.reducer_klass = BinAttrReducer
      elsif ARGV.any? { |arg| arg =~ /--phase=2/ }
        # Phase 2 -- ????
        raise "Phase 2 : ????"
      elsif ARGV.any? { |arg| arg =~ /--phase=3/ }
        # Phase 3 -- profit. In this case, put records back together.
        self.phase         = 3
        self.mapper_klass  = nil
        self.reducer_klass = ReassembleObjectReducer
      else
        raise "Please run me with a --phase= option"
      end
      super(mapper_klass, reducer_klass)
    end

    # Default options plus the sort / partition settings for the phase.
    def default_options
      phase_options =
        if self.phase == 1
          # partition on [rsrc, attr]; sort on [rsrc, attr, val]
          { :sort_fields => 3, :partition_fields => 2 }
        elsif self.phase == 3
          # sort on [rsrc, id]
          { :sort_fields => 2 }
        else
          { }
        end
      super.merge(phase_options)
    end
  end

  # execute script
  Script.new.run
end