wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+
4
+ require 'wukong'
5
+
6
+ #
7
+ # This is so very very kludgey
8
+ #
9
+ # Input is an 'ls' file, listing files to .bz2 package.
10
+ #
11
+ # Reducer takes each in turn and creates, within a parallel directory tree under
12
+ # PKGD_DIR on the local filesystem, a .bz2 compressed version of the file.
13
+ #
14
+ # So, the file
15
+ # /user/me/fixd/all-20090103
16
+ # is packaged onto the local disk (and then rsync'ed to the remote host) as
17
+ # /workspace/flip/pkgd/user/me/fixd/all-20090103.tsv.bz2
18
+ #
19
+ # listing=tmp/fixd-all-package-listing
20
+ # hdp-rm $listing
21
+ # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
22
+ #
23
+ # ./package-local.rb --run --rm --map_tasks=1 $listing $pkgd_log
24
+ #
25
+ module ExportPackager
26
+ PKGD_DIR = '/workspace/flip/pkgd'
27
+
28
+ #
29
+ #
30
+ class Reducer < Wukong::Streamer::Base
31
+ def announce *args
32
+ $stdout.puts *args
33
+ $stderr.puts *args
34
+ end
35
+
36
+ def handle_existing_target output_filename
37
+ return true unless File.exist?(output_filename)
38
+ # announce "Exists! #{output_filename}"
39
+ # return false
40
+ announce "Removing target file #{output_filename}"
41
+ begin announce `rm #{output_filename}`
42
+ rescue Exception => e ; announce e ; end
43
+ true
44
+ end
45
+
46
+ def mkdir_target_safely output_filename
47
+ output_dir = File.dirname(output_filename)
48
+ announce "Ensuring directory #{output_dir} exists"
49
+ begin announce `mkdir -p #{output_dir}`
50
+ rescue Exception => e ; announce e ; end
51
+ end
52
+
53
+ def bzip_into_pkgd_file input_filename, output_filename
54
+ announce "bzip'ing into #{output_filename}"
55
+ announce `( hadoop dfs -cat #{input_filename}/[^_]\** ) | bzip2 -c > #{output_filename}`
56
+ end
57
+
58
+ def gen_output_filename input_filename
59
+ input_filename += '.tsv' unless input_filename =~ /.*\.\w{2,}/
60
+ "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(/^\//, '')]
61
+ end
62
+
63
+ def rsync host, local_path, remote_path=nil
64
+ remote_path ||= local_path
65
+ announce `/usr/bin/rsync -Cuvrtlp #{local_path} #{host}:#{remote_path}`
66
+ sleep 5
67
+ end
68
+
69
+ def process input_filename
70
+ output_filename = gen_output_filename(input_filename)
71
+ handle_existing_target(output_filename) or return
72
+ mkdir_target_safely output_filename
73
+ bzip_into_pkgd_file input_filename, output_filename
74
+ rsync :lab3, output_filename
75
+ #
76
+ end
77
+
78
+ def recordize line
79
+ # handle ls or straight file list, either
80
+ line.split(/\s/).last
81
+ end
82
+
83
+ def stream
84
+ super
85
+ rsync :lab3, PKGD_DIR+'/'
86
+ end
87
+ end
88
+
89
+ class Script < Wukong::Script
90
+ def default_options
91
+ super.merge :map_tasks => 1,
92
+ :max_node_reduce_tasks => 1, # only one reducer per local filesystem
93
+ :timeout => 40 * 60 * 1000 # timeout in ms
94
+ end
95
+ end
96
+ # Execute the script
97
+ Script.new(nil, Reducer).run
98
+ end
99
+
100
+
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env ruby
2
+ $: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
3
+ require 'wukong'
4
+
5
+ #
6
+ # This is so very very kludgey
7
+ #
8
+ # Input is an 'ls' file, listing files to .bz2 package.
9
+ #
10
+ # Reducer takes each in turn and creates, within a parallel directory tree under
11
+ # ~/pkgd on the HDFS, a .bz2 compressed version of the file.
12
+ #
13
+ # So, the file
14
+ # /user/me/fixd/all-20090103
15
+ # is packaged onto the DFS as
16
+ # /user/me/pkgd/user/me/fixd/all-20090103.bz2
17
+ #
18
+ # listing=tmp/fixd-all-package-listing
19
+ # hdp-rm $listing
20
+ # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
21
+ #
22
+ # ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
23
+ #
24
+ module ExportPackager
25
+ PKGD_DIR = 'pkgd'
26
+
27
+ #
28
+ #
29
+ class Reducer < Wukong::Streamer::Base
30
+ def announce str
31
+ return if str.blank?
32
+ $stderr.puts str
33
+ $stdout.puts str
34
+ end
35
+
36
+ def remove_target_filename output_filename
37
+ begin announce "rm\t#{"%-70s"%output_filename}\t" +
38
+ `( hadoop dfs -rmr #{output_filename} ) 2>&1`
39
+ rescue ; nil ; end
40
+ end
41
+
42
+ def mkdir_target_safely output_filename
43
+ output_dir = File.dirname(output_filename)
44
+ begin announce "mkdir\t#{"%-70s"%output_dir}\t" +
45
+ `( hadoop dfs -mkdir #{output_dir} ) 2>&1`
46
+ rescue ; nil ; end
47
+ end
48
+
49
+ def bzip_into_pkgd_file input_filename, output_filename
50
+ announce "cat|bz\t#{"%-70s"%input_filename}\t" +
51
+ `( hadoop dfs -cat #{input_filename}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{output_filename} ) 2>&1`
52
+ end
53
+
54
+ def verify input_filename, output_filename
55
+ announce "sha1sum\t#{"%-70s"%output_filename}\t" +
56
+ `( hadoop dfs -cat #{output_filename} | bzcat - | sha1sum ) 2>&1`
57
+ announce "sha1sum\t#{"%-70s"%input_filename}\t" +
58
+ `( hadoop dfs -cat #{input_filename}/[^_]\\* | sha1sum ) 2>&1`
59
+ end
60
+
61
+ def gen_output_filename input_filename
62
+ "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(%r{^/},"")]
63
+ end
64
+
65
+ def process input_filename, output_filename
66
+ # remove_target_filename output_filename
67
+ # mkdir_target_safely output_filename
68
+ bzip_into_pkgd_file input_filename, output_filename
69
+ verify input_filename, output_filename
70
+ end
71
+
72
+ def stream
73
+ announce `hostname`
74
+ $stdin.each do |input_filename|
75
+ # handle ls or straight file list, either
76
+ input_filename = input_filename.chomp.strip.split(/\s/).last
77
+ output_filename = gen_output_filename input_filename
78
+ announce "********************************************************"
79
+ announce "Packing\t#{"%-70s"%input_filename}\t#{output_filename}"
80
+ process input_filename, output_filename
81
+ announce "Done\t#{"%-70s"%input_filename}\t#{output_filename}\n\n"
82
+ end
83
+ end
84
+ end
85
+
86
+ class Script < Wukong::Script
87
+ def default_options
88
+ super.merge :timeout => (24 * 60 * 60 * 1000) # milliseconds in one day
89
+ end
90
+ end
91
+ end
92
+
93
+ #
94
+ # Execute the script
95
+ #
96
+ ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000).run
@@ -0,0 +1,6 @@
1
+
2
+ Calculate pagerank for a tab-separated adjacency list.
3
+
4
+ See
5
+ * Heritrix pagerank util
6
+ * http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/pagerank.html
@@ -0,0 +1,57 @@
1
+
2
+ -- ===========================================================================
3
+ --
4
+ -- Load Graph
5
+ --
6
+ AFollowsB = LOAD 'twnew/all/a_follows_b' AS (rsrc: chararray, user_a_id: int, user_b_id: int) ;
7
+ FollEdges_0 = FOREACH AFollowsB GENERATE user_a_id AS src, user_b_id AS dest ;
8
+
9
+ InitPagerankFoll_0 = GROUP FollEdges_0 BY src ;
10
+ InitPagerankFoll_1 = FOREACH InitPagerankFoll_0 GENERATE
11
+ group AS src,
12
+ 1.0F AS pagerank:float,
13
+ FollEdges_0.(dest) AS dests
14
+ ;
15
+ rmf twnew/pagerank-foll/pagerank_graph_000 ;
16
+ STORE InitPagerankFoll_1 INTO 'twnew/pagerank-foll/pagerank_graph_000';
17
+
18
+
19
+ -- MultiEdge = LOAD 'twnew/all/multi_edge' AS (
20
+ -- rsrc: chararray, src: int, dest: int,
21
+ -- fo: int, fr: int,
22
+ -- re_out: int, re_in: int,
23
+ -- at_out: int, at_in: int,
24
+ -- rt_out: int, rt_in: int,
25
+ -- fv_out: int, fv_in: int) ;
26
+ --
27
+ -- SymmEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, fr ;
28
+ -- SymmEdges_1 = FILTER SymmEdges_0 BY (fo >= 1.0) AND (fr >= 1.0) ;
29
+ -- SymmEdges = FOREACH SymmEdges_1 GENERATE src, dest ;
30
+ -- -- rm twnew/graphs/symm_edges; STORE SymmEdges INTO 'twnew/graphs/symm_edges' ;
31
+ -- SymmEdges = LOAD 'twnew/graphs/symm_edges' AS (src:int , dest:int);
32
+ --
33
+ -- AnyoutEdges_0 = FOREACH MultiEdge GENERATE src, dest, fo, re_out, fv_out ;
34
+ -- AnyoutEdges_1 = FILTER AnyoutEdges_0 BY (fo >= 1.0) OR (re_out >= 1.0) OR (fv_out >= 1.0) ;
35
+ -- AnyoutEdges = FOREACH AnyoutEdges_1 GENERATE src, dest ;
36
+ -- -- rm twnew/graphs/anyout_edges; STORE AnyoutEdges INTO 'twnew/graphs/anyout_edges' ;
37
+ -- AnyoutEdges = LOAD 'twnew/graphs/anyout_edges' AS (src:int , dest:int);
38
+ --
39
+ --
40
+ -- InitPagerankSymm_0 = GROUP SymmEdges BY src ;
41
+ -- InitPagerankSymm_1 = FOREACH InitPagerankSymm_0 GENERATE
42
+ -- group AS src,
43
+ -- 1.0F AS pagerank:float,
44
+ -- SymmEdges.(dest) AS dests
45
+ -- ;
46
+ -- rm twnew/pagerank-symm/pagerank_graph_000 ;
47
+ -- STORE InitPagerankSymm_1 INTO 'twnew/pagerank-symm/pagerank_graph_000';
48
+ --
49
+ --
50
+ -- InitPagerankAnyout_0 = GROUP AnyoutEdges BY src ;
51
+ -- InitPagerankAnyout_1 = FOREACH InitPagerankAnyout_0 GENERATE
52
+ -- group AS src,
53
+ -- 1.0F AS pagerank:float,
54
+ -- AnyoutEdges.(dest) AS dests
55
+ -- ;
56
+ -- rm twnew/pagerank-anyout/pagerank_graph_000 ;
57
+ -- STORE InitPagerankAnyout_1 INTO 'twnew/pagerank-anyout/pagerank_graph_000';
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+
5
+ #
6
+ #
7
+ #
8
+
9
+ module PageRank
10
+ #
11
+ # Damping factor (prob. of a 'random' jump)
12
+ # 0.85 works well in practice. See http://en.wikipedia.org/wiki/Pagerank
13
+ #
14
+ DAMPING_FACTOR = 0.85
15
+
16
+ #
17
+ # Each user's line looks like
18
+ #
19
+ # user_a pagerank id1,id2,...,idN
20
+ #
21
+ # we need to disperse this user's pagerank to each of id1..idN, and
22
+ # rendezvous the list of outbound links at user_a's reducer as well.
23
+ #
24
+ module Iterating
25
+ class Mapper < Wukong::Streamer::Base
26
+ #
27
+ # Send pagerank to each page, and send the dests list back to self
28
+ #
29
+ def process src, pagerank, dests_str, &block
30
+ # This lets us use Pig to generate the input
31
+ dests_str = dests_str.gsub(/[\(\{\}\)]/, '')
32
+ dests = dests_str.split(",")
33
+ yield_pagerank_shares src, pagerank, dests, &block
34
+ yield_own_dest_list src, dests_str, &block
35
+ end
36
+
37
+ #
38
+ # Take the source node's pagerank and distribute it among all the out-nodes
39
+ #
40
+ def yield_pagerank_shares src, pagerank, dests
41
+ pagerank_share = pagerank.to_f / dests.length
42
+ dests.each do |dest|
43
+ yield [dest, 'p', pagerank_share]
44
+ end
45
+ end
46
+
47
+ #
48
+ # Dispatch this user's out-node list to rendezvous with itself.
49
+ #
50
+ def yield_own_dest_list src, dests_str
51
+ yield [src, 'd', dests_str]
52
+ end
53
+ end
54
+
55
+ class Reducer < Wukong::Streamer::AccumulatingReducer
56
+ attr_accessor :node_id, :pagerank, :dests_str
57
+ # Begin reduction with 0 accumulated pagerank and no dests as yet
58
+ def start! node_id, *args
59
+ self.node_id = node_id
60
+ self.pagerank = 0.0
61
+ self.dests_str = nil
62
+ end
63
+ # We'll receive fractional pagerank from all incoming edges,
64
+ # and the destination list from this node's map stage
65
+ def accumulate node_id, what, val
66
+ case what
67
+ when 'p' then self.pagerank += val.to_f
68
+ when 'd' then self.dests_str = val
69
+ else raise "Don't know how to accumulate #{[node_id, what, val].inspect}"
70
+ end
71
+ end
72
+ # To finalize, dump the damped pagerank and dest list
73
+ # in a form that can be fed back into this script
74
+ def finalize
75
+ damped_pagerank = (self.pagerank * DAMPING_FACTOR) + (1 - DAMPING_FACTOR)
76
+ self.dests_str = 'dummy' if self.dests_str.blank?
77
+ yield [node_id, damped_pagerank, dests_str]
78
+ end
79
+ end
80
+
81
+ class Script < Wukong::Script
82
+ def default_options
83
+ super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
84
+ end
85
+ end
86
+ Script.new(Mapper, Reducer).run
87
+ end
88
+ end
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+ require 'wukong/streamer/set_reducer'
5
+
6
+ module PageRank
7
+ class Script < Wukong::Script
8
+ #
9
+ # Input format is
10
+ #
11
+ # rsrc src_id dest_id [... junk ...]
12
+ #
13
+ # All we want from the line are its src and dest IDs.
14
+ #
15
+ def map_command
16
+ %Q{/usr/bin/cut -d"\t" -f2,3}
17
+ end
18
+
19
+ def default_options
20
+ super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
21
+ end
22
+ end
23
+
24
+ #
25
+ # Accumulate the dests list in memory, dump as a whole. Multiple edges between
26
+ # any two nodes are permitted, and will accumulate pagerank according to the
27
+ # edge's multiplicity.
28
+ #
29
+ class Reducer < Wukong::Streamer::ListReducer
30
+ def accumulate src, dest
31
+ self.values << dest
32
+ end
33
+
34
+ # Emit src, initial pagerank, and flattened dests list
35
+ def finalize
36
+ self.values = ['dummy'] if self.values.blank?
37
+ yield [key, 1.0, self.values.to_a.join(",")]
38
+ end
39
+ end
40
+
41
+ # Execute the script
42
+ Script.new(nil, PageRank::Reducer).run
43
+ end
44
+
45
+
46
+
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # Directory to pagerank on: the parent of the pagerank_graph_NNN directories.
4
+ work_dir=$1 ; shift
5
+ if [ -z "$work_dir" ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank" >&2 ; exit 1 ; fi
6
+
7
+
8
+ # How many rounds to run
9
+ max_iter=10
10
+ # this directory
11
+ script_dir="$(dirname "$0")"
12
+
13
+ for (( curr=0 , next=1 ; curr < max_iter ; curr++ , next++ )) ; do
14
+ curr_str=$(printf "%03d" "${curr}")
15
+ next_str=$(printf "%03d" "${next}")
16
+ curr_dir="$work_dir/pagerank_graph_${curr_str}"
17
+ next_dir="$work_dir/pagerank_graph_${next_str}"
18
+ "$script_dir/pagerank.rb" --rm --run "$curr_dir" "$next_dir" || exit $? # each round reads the previous round's output, so stop on failure
19
+ done
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+ require 'wukong/streamer/rank_and_bin_reducer'
5
+
6
+ #
7
+ # This example uses the classes from http://github.com/mrflip/twitter_friends
8
+ # (That's sloppy, and I apologize. I'm building this script for that, but it
9
+ # seems broadly useful and I'm not maintaining two copies. Once this script is
10
+ # more worky we'll make it standalone. Anyway you should get the picture.)
11
+ #
12
+ $: << File.dirname(__FILE__)+'/../../projects/twitter_friends/lib'
13
+ require 'twitter_friends';
14
+ require 'twitter_friends/struct_model' ; include TwitterFriends::StructModel
15
+
16
+
17
+ #
18
+ # attrs to bin
19
+ #
20
+ BINNABLE_ATTRS = {
21
+ :twitter_user => [
22
+ [:followers_count, :fo ],
23
+ [:friends_count, :fr ],
24
+ [:statuses_count, :st ],
25
+ [:favourites_count, :fv ],
26
+ [:created_at, :crat ]
27
+ ]
28
+
29
+ }
30
+ RESOURCE_ALIASES = {
31
+ :twitter_user => :u,
32
+ :user_metrics => :um,
33
+ }
34
+ #
35
+ # KLUDGE This is not DRY at all but let's get it working first
36
+ #
37
+ BinUserMetrics = TypedStruct.new(
38
+ [:id, Integer],
39
+ *BINNABLE_ATTRS[:user_metrics].map{|attr, attr_abbr| [attr_abbr, Integer] }
40
+ )
41
+ BINNED_RESOURCE_ALIASES = {
42
+ :u => BinTwitterUser,
43
+ }
44
+
45
+ module RankAndBinAttrs
46
+ class ExplodeResourceMapper < Wukong::Streamer::StructStreamer
47
+ def get_and_format_attr thing, attr
48
+ val = thing.send(attr)
49
+ case thing.members_types[attr].to_s.to_sym
50
+ when :Integer then "%010d" % val.to_i
51
+ when :Float then "%020.7f" % val.to_f
52
+ when :Bignum then "%020d" % val.to_i
53
+ else
54
+ raise [val, thing.members_types[attr].to_s.to_sym].inspect
55
+ end
56
+ end
57
+
58
+ #
59
+ # The data expansion of this mapper is large enough that it makes sense to
60
+ # be a little responsible with what we emit. We'll use the RESOURCE_ALIASES
61
+ # and BINNABLE_ATTRS hashes, above, to dump a more parsimonious
62
+ # representation.
63
+ #
64
+ def process thing, *args, &block
65
+ attr_abbrs = BINNABLE_ATTRS[thing.class.resource_name]
66
+ return unless attr_abbrs
67
+ attr_abbrs.each do |attr, abbr|
68
+ yield [
69
+ RESOURCE_ALIASES[thing.class.resource_name],
70
+ abbr,
71
+ get_and_format_attr(thing, attr),
72
+ thing.id.to_i
73
+ ]
74
+ end
75
+ end
76
+ end
77
+
78
+ class BinAttrReducer < Wukong::Streamer::RankAndBinReducer
79
+ attr_accessor :last_rsrc_attr
80
+ #
81
+ # Note that we might get several different resources at the same reducer
82
+ #
83
+ def get_key rsrc, attr, val, *args
84
+ if [rsrc, attr] != self.last_rsrc_attr
85
+ # Note: since each partition has the same cardinality, we don't need to
86
+ # fiddle around with the bin_size, etc -- just reset the order
87
+ # parameters' state.
88
+ reset_order_params!
89
+ self.last_rsrc_attr = [rsrc, attr]
90
+ end
91
+ val
92
+ end
93
+
94
+ #
95
+ # Note well -- we are rearranging the field order to
96
+ #
97
+ # resource_abbr id attr_abbr bin
98
+ #
99
+ # for proper sorting to the re-assembler
100
+ #
101
+ def emit record
102
+ rsrc, attr, val, id, numbering, rank, bin = record
103
+ super [rsrc, id, attr, bin]
104
+ end
105
+ end
106
+
107
+ class ReassembleObjectReducer < Wukong::Streamer::AccumulatingReducer
108
+ attr_accessor :thing
109
+ def klass_from_abbr rsrc_abbr
110
+ BINNED_RESOURCE_ALIASES[rsrc_abbr.to_sym]
111
+ end
112
+ def get_key rsrc_abbr, id, *args
113
+ [rsrc_abbr, id.to_i]
114
+ end
115
+
116
+ def start! rsrc_abbr, id, *args
117
+ klass = klass_from_abbr(rsrc_abbr)
118
+ self.thing = klass.new id.to_i
119
+ end
120
+
121
+ def accumulate rsrc, id, attr, bin
122
+ thing.send("#{attr}=", bin)
123
+ end
124
+
125
+ def finalize
126
+ yield thing
127
+ end
128
+ end
129
+
130
+ #
131
+ # Two-phase script
132
+ #
133
+ # FIXME -- We need a runner class to manage this.
134
+ #
135
+ class Script < Wukong::Script
136
+ attr_accessor :phase
137
+ # KLUDGE !!
138
+ def initialize
139
+ case
140
+ when ARGV.detect{|arg| arg =~ /--phase=1/}
141
+ # Phase 1 -- Steal underpants. Also, disassemble each object, and find
142
+ # the bin for each binnable attribute's value
143
+ self.phase = 1
144
+ self.mapper_klass, self.reducer_klass = [ExplodeResourceMapper, BinAttrReducer]
145
+ when ARGV.detect{|arg| arg =~ /--phase=2/}
146
+ # Phase 2 -- ????
147
+ raise "Phase 2 : ????"
148
+ when ARGV.detect{|arg| arg =~ /--phase=3/}
149
+ # Phase 3 -- profit. In this case, put records back together.
150
+ self.phase = 3
151
+ self.mapper_klass, self.reducer_klass = [nil, ReassembleObjectReducer]
152
+ else
153
+ raise "Please run me with a --phase= option"
154
+ end
155
+ super mapper_klass, reducer_klass
156
+ end
157
+
158
+ def default_options
159
+ extra_options =
160
+ case self.phase
161
+ # partition on [rsrc, attr]; sort on [rsrc, attr, val]
162
+ when 1 then { :sort_fields => 3, :partition_fields => 2 }
163
+ # sort on [rsrc, id]
164
+ when 3 then { :sort_fields => 2 }
165
+ else { }
166
+ end
167
+ super.merge extra_options
168
+ end
169
+ end
170
+
171
+ # execute script
172
+ Script.new.run
173
+ end