mrflip-wukong 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,65 @@
1
+ http://cluster-fork.info/index.php?title=Main_Page
2
+
3
+
4
+ h3. For Big Data, instead of "ACID" you use "ACID*"
5
+
6
+ * A -- Associative
7
+ * C -- Commutative
8
+ * I -- Idempotent
9
+ * D -- Distributed
10
+ * (*) -- (and where possible, left in sort order)
11
+
12
+
13
+ Finally, where possible leave things in sort order by some appropriate index. Clearly I'm not talking about introducing extra unnecessary sorts on ephemeral data. For things that will be read (and experimented with) much more often than they're written, though, it's worth running a final sort. Now you can
14
+
15
+ * Efficiently index into a massive dataset with binary search
16
+ * Do a direct merge sort on two files with the same sort order
17
+ * Run a reducer directly across the data
18
+ * Assign a synthetic key by just serially numbering lines (e.g. after distributing a unique prefix to each mapper)
19
+
20
+ Note: for files that will live on the DFS, you should usually *not* do a total sort.
21
+
22
+ h3. Encode once, and carefully.
23
+
24
+ Encoding violates idempotence.
25
+
26
+ Is there a lightweight, mostly-transparent, ASCII-compatible *AND* idempotent encoding scheme lurking in a back closet of some algorithms book?
27
+
28
+
29
+
30
+ h3. Keys
31
+
32
+ Natural keys are right for big data
33
+
34
+ Synthetic keys suck. They demand locality or a central keymaster.
35
+
36
+ See About Keys
37
+
38
+ * Use the natural key
39
+ * Hash the natural key. This has some drawbacks
40
+
41
+ OK, fine. You need a synthetic key:
42
+
43
+ * Do a total sort, and use nl
44
+ * Generate
45
+ * Use a single reducer to reduce locality. YUCK.
46
+ * have each mapper generate a unique prefix; number each line as "prefix#{line_number}" or whatever.
47
+
48
+ How do you get a unique prefix?
49
+
50
+ * Distribute a unique prefix to each mapper out-of-band. People using Streaming are out of luck.
51
+
52
+ * Use a UUID -- that's what they're for. Drawback: ridiculously long
53
+
54
+ * Hash the machine name, PID and timestamp to something short. Check after the
55
+ fact that uniqueness was achieved. Use the birthday party formula to find out
56
+ how often this will happen. (In practice, almost never.)
57
+
58
+ h3. Epistemology and exeption handling
59
+
60
+ something that goes wrong 1/1000 time will happen
61
+
62
+
63
+ h3. Real hackers use the command line as an IDE
64
+
65
+
@@ -0,0 +1,48 @@
1
+
2
+ <something to tab and align table>
3
+
4
+
5
+ * uniq - report or filter out repeated lines in a file
6
+ ** -c produces line<tab>count
7
+ ** --ignore f1,f2,... discards given fields from consideration. field syntax same as for cut, etc.
8
+
9
+ * sort - sort lines of text files
10
+ ** columns indexed as tab-separated
11
+ ** can specify any column order, uses same field spec as cut
12
+ * tsort - topological sort of a directed graph
13
+
14
+ * cut - select portions of each line of a file
15
+ ** can reorder columns
16
+ * nl - line numbering filter
17
+ ** takes prefix, suffix
18
+ ** count \t line -OR- line \t count
19
+
20
+ * wc - word, line, character, and byte count
21
+ ** field count (tab-separated fields)
22
+ * paste - merge corresponding or subsequent lines of files
23
+ * expand, unexpand - expand tabs to spaces, and vice versa
24
+ * seq
25
+ * simple row, column sums
26
+ * join - relational database operator
27
+ * tac
28
+
29
+ * cat - concatenate and print files
30
+ * head - display first lines of a file
31
+ * tail - display the last part of a file
32
+ * shuf
33
+ * split - split a file into pieces
34
+ * csplit - split files based on context
35
+ * tee - pipe fitting
36
+
37
+ * ls - list directory contents.
38
+ * df - display free disk space
39
+ * du - display disk usage statistics
40
+ ** tab-delimited, space aligned
41
+
42
+ * od - octal, decimal, hex, ASCII dump
43
+ * printf - formatted output
44
+ * cksum, sum - display file checksums and block counts
45
+ * md5sum
46
+
47
+ * diff
48
+ * comm
@@ -0,0 +1,17 @@
1
+ Examples:
2
+
3
+
4
+ * sample_records -- extract a random sample from a collection of data
5
+
6
+ * word_count
7
+
8
+ * apache_log_parser -- example for parsing standard apache webserver log files.
9
+
10
+ * wordchains -- solving a word puzzle using breadth-first search of a graph
11
+
12
+ * graph -- some generic graph scripts (adjacency list, breadth-first search, edge generators)
13
+
14
+ * pagerank -- use the pagerank algorithm to find the most 'interesting'
15
+ (central) nodes of a network graph
16
+
17
+
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__) + '/../../lib'
3
+ require 'wukong' ; include Wukong
4
+ require 'wukong/and_pig' ; include Wukong::AndPig
5
+
6
+ # PIG_DIR = '/usr/local/share/pig'
7
+ PIG_DIR = '/public/share/pig'
8
+ # full pathname to the pig executable
9
+ # Wukong::AndPig::PIG_EXECUTABLE = "#{PIG_DIR}/bin/pig"
10
+ Wukong::AndPig::PIG_EXECUTABLE = "/public/bin/pig -x local"
11
+
12
+ #
13
+ HDFS_BASE_DIR = 'foo/meta/lang'
14
+ Wukong::AndPig::PigVar.working_dir = HDFS_BASE_DIR
15
+ Wukong::AndPig.comments = false
16
+ # Wukong::AndPig.emit_dest = :captured
17
+
18
+ Wukong::AndPig::PigVar.emit "REGISTER #{PIG_DIR}/contrib/piggybank/java/piggybank.jar"
19
+
20
+ #
21
+ # Load basic types
22
+ #
23
+
24
+ # class Token < Struct.new(:rsrc, :context, :user_id, :token, :usages)
25
+ # end
26
+ # :tokens_users_0 << Token.pig_load('meta/datanerds/token_count/users_tokens')
27
+ # :tokens_users_0 << Token.pig_load('/tmp/users_tokens.tsv')
28
+ # :tokens_users << :tokens_users_0.generate(:user_id, :token, :usages)
29
+ # :tokens_users.checkpoint!
30
+
31
+ class Token < TypedStruct.new(
32
+ [:user_id, Integer], [:token, String], [:usages, Integer])
33
+ end
34
+ :tokens_users << Token.pig_load('/tmp/users_tokens.tsv')
35
+ :tokens_users.describe
36
+
37
+ pig_comment %Q{
38
+ # ***************************************************************************
39
+ #
40
+ # Global totals
41
+ #
42
+ # Each row in Tokens lists a (user, token, usages)
43
+ # We want
44
+ # Sum of all usage counts = total tokens seen in tweet stream.
45
+ # Number of distinct tokens
46
+ # Number of distinct users <- different than total in twitter_users.tsv
47
+ # because we want only users that say stuff.
48
+ }
49
+
50
+ def count_distinct relation, field, options={}
51
+ result_name = options[:as] || "#{relation.name}_#{field}_count".to_sym
52
+ a = relation.
53
+ generate(field).set!.describe.
54
+ distinct(options).set!
55
+ result_name << a.
56
+ group(:all).set!.
57
+ generate(["COUNT(#{a.relation}.#{field})", :u_count, Integer]).set!
58
+ end
59
+
60
+ pig_comment "Count Users"
61
+ tok_users_count = count_distinct(:tokens_users, :user_id).checkpoint!
62
+
63
+ pig_comment "Count Tokens"
64
+ tok_tokens_count = count_distinct(:tokens_users, :token, :parallel => 10).checkpoint!
65
+
66
+
67
+ pig_comment %Q{
68
+ # ***************************************************************************
69
+ #
70
+ # Statistics for each user
71
+ }
72
+
73
#
# Builds the per-user token statistics from a (user_id, token, usages) relation.
#
# The chain it issues is:
# * group by :user_id
# * generate the group key, a COUNT of the grouped rows (tot_tokens), the SUM
#   of their usages (tot_usages), and FLATTENed token / usages fields
# * generate each row's share of the user's usages (usage_pct) and that share
#   squared (usage_pct_sq)
#
# users_tokens:: relation handle; must respond to #relation, #describe and
#                #group (in this script it is a Symbol extended by Wukong::AndPig)
#
# Returns whatever the final generate(...).set! returns.
#
def user_stats users_tokens
  users_tokens.describe.
    group(:user_id).set!.describe.
    generate(
      [:group, :user_id],
      ["(int)COUNT(#{users_tokens.relation})", :tot_tokens, Integer],
      ["(int)SUM(#{users_tokens.relation}.usages)", :tot_usages, Integer],
      # Both FLATTEN calls need their closing parenthesis; without it the
      # emitted Pig expression is unbalanced.
      # NOTE(review): two independent FLATTENs may produce a cross product of
      # tokens and usages in Pig; the commented-out variant below flattens the
      # pair together -- confirm which is intended.
      ["FLATTEN(#{users_tokens.relation}.token)", :token, String],
      ["FLATTEN(#{users_tokens.relation}.usages)", :usages, Integer]).set!.describe.
    # [ "FLATTEN(#{users_tokens.relation}.(token, usages) )", [:token, :usages], TypedStruct.new([:token, String], [:usages, Integer])]).set!.
    generate(:user_id, :token, :usages,
      ["(float)(1.0*usages / tot_usages)", :usage_pct, Float],
      ["(float)(1.0*usages / tot_usages) * (1.0*(float)usages / tot_usages)", :usage_pct_sq, Float]).set!
end
87
+
88
+ :user_stats << user_stats(:tokens_users)
89
+ :user_stats.describe.checkpoint!
90
+ puts "UserStats = LOAD 'foo/meta/lang/user_stats' AS (user_id, token, usages, usage_pct, usage_pct_sq) ;"
91
+
92
+ UserStats = TypedStruct.new([:user_id, Integer],
93
+ [:token, String],
94
+ [:usages, Integer],
95
+ [:usage_pct, Float],
96
+ [:usage_pct_sq, Float])
97
+ :user_stats << UserStats.pig_load('foo/meta/lang/user_stats')
98
+
99
#
# Builds per-token range and dispersion statistics from the user_stats relation.
#
# Groups +user_stats+ by :token and issues a foreach carrying:
# * avg_uspct    -- SUM(usage_pct) divided by the number of users
# * sum_uspct_sq -- SUM(usage_pct_sq)
# * stdev_uspct  -- SQRT( sum_uspct_sq/n - (SUM(usage_pct)/n)^2 )
# * dispersion   -- 1 - ( (stdev_uspct / avg_uspct) / SQRT(n - 1) )
# followed by the list of output fields (token, range, pct_range, tot_usages,
# ppm_usages, avg_uspct, stdev_uspct, dispersion).
#
# user_stats:: relation handle; must respond to #group and #relation
# n_users::    number of users used to normalize the statistics (default 436).
#              Previously this value was also hard-coded as a literal inside
#              several expression strings; it is now threaded through.
# n_tokens::   total token count used for the parts-per-million figure
#              (default 61630)
#
# Returns whatever foreach(...).set! returns.
#
def range_and_dispersion user_stats, n_users=436, n_tokens=61630
  token_stats = user_stats.group(:token).set!

  # Read the relation name only after the group has been set, exactly as the
  # inline interpolations did.
  rel       = user_stats.relation
  n_users_f = n_users.to_f
  sqrt      = "org.apache.pig.piggybank.evaluation.math.SQRT"

  stdev_expr = "#{sqrt}(
      (sum_uspct_sq /#{n_users}) -
      ( (SUM(#{rel}.usage_pct)/#{n_users_f}) * (SUM(#{rel}.usage_pct)/#{n_users_f}) )
    )"

  token_stats.foreach(
    ["(float)SUM(#{rel}.usage_pct) / #{n_users_f}", :avg_uspct ],
    ["(float)SUM(#{rel}.usage_pct_sq)", :sum_uspct_sq],
    [stdev_expr, :stdev_uspct],
    ["1 - ( ( stdev_uspct / avg_uspct ) / #{sqrt}(#{n_users_f} - 1.0) )", :dispersion],
    [
      [:group, :token, String ],
      ["(int)COUNT(#{rel}) ", :range, Integer ],
      # NOTE(review): pct_range is a fraction of users yet is declared
      # Integer -- confirm whether Float was intended.
      ["(int)COUNT(#{rel}) / #{n_users_f}", :pct_range, Integer ],
      ["(int)SUM( #{rel}.usages)", :tot_usages, Integer],
      ["(float)( 1.0e6*SUM(#{rel}.usages) / #{n_tokens.to_f})", :ppm_usages, Float],
      [:avg_uspct, :avg_uspct],
      [:stdev_uspct, :stdev_uspct],
      [:dispersion, :dispersion]
    ]
  ).set!
end
125
+
126
+ range_and_dispersion(:user_stats).checkpoint!
127
+
128
+ Wukong::AndPig.finish
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+
5
+ module ApacheLogParser
6
+ class Mapper < Wukong::Streamer::LineStreamer
7
+
8
+
9
+ def parse_request req
10
+ m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req)
11
+ if m
12
+ [''] + m.captures
13
+ else
14
+ [req, '', '', '']
15
+ end
16
+ end
17
+
18
+
19
+ # regular expression to match on apache-style log lines
20
+ # IP addr - - [07/Jun/2008:20:37:11 +0000] 400 "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1" 173 "-" "-" "-"
21
+ LOG_RE = %r{\A(\d+\.\d+\.\d+\.\d+) ([^\s]+) ([^\s]+) \[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] (\d+) "([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" (\d+) "([^\"]*)" "([^\"]*)" "([^\"]*)"\z}
22
+
23
#
# Parses one apache-style log line and yields a flat record.
#
# Yields [:logline, method, path, protocol, ip, j1, j2, req_date, resp, req,
# j3, ref, ua, j4] when the line matches LOG_RE, and [:unparseable, line]
# otherwise.
#
# line:: raw log line; a trailing newline is tolerated
#
def process line
  # String#chomp returns a new string, so the result must be kept: LOG_RE is
  # anchored with \z and cannot match a line that still ends in "\n".
  line = line.chomp
  m = LOG_RE.match(line)
  if m
    ip, j1, j2, datepart, timepart, tzpart, resp, req, j3, ref, ua, j4 = m.captures
    req_date = DateTime.parse("#{datepart} #{timepart} #{tzpart}").to_flat
    # parse_request returns the leftover request text first, then the pieces
    req, http_method, path, protocol = parse_request(req)
    yield [:logline, http_method, path, protocol, ip, j1, j2, req_date, resp, req, j3, ref, ua, j4]
  else
    yield [:unparseable, line]
  end
end
35
+ end
36
+
37
+ class Reducer < Wukong::Streamer::LineStreamer
38
+ end
39
+
40
+ # Execute the script
41
+ class Script < Wukong::Script
42
+ def reduce_command
43
+ "/usr/bin/uniq"
44
+ end
45
+ def default_options
46
+ super.merge :sort_fields => 8 # , :reduce_tasks => 0
47
+ end
48
+ end
49
+
50
+ Script.new(Mapper,nil).run
51
+ end
52
+
53
+ # 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+ require 'wukong/streamer/count_keys'
5
+ require 'wukong/streamer/count_lines'
6
+
7
+ #
8
+ #
9
+ class CountKeysReducer < Wukong::Streamer::CountLines
10
+ #
11
+ # Taken from the actionpack Rails component ('action_view/helpers/number_helper')
12
+ #
13
+ # Formats a +number+ with grouped thousands using +delimiter+. You
14
+ # can customize the format using optional <em>delimiter</em> and <em>separator</em> parameters.
15
+ # * <tt>delimiter</tt> - Sets the thousands delimiter, defaults to ","
16
+ # * <tt>separator</tt> - Sets the separator between the units, defaults to "."
17
+ #
18
+ # number_with_delimiter(12345678) => 12,345,678
19
+ # number_with_delimiter(12345678.05) => 12,345,678.05
20
+ # number_with_delimiter(12345678, ".") => 12.345.678
21
# Formats +number+ with grouped thousands.
#
# number::    anything responding to #to_s
# delimiter:: string inserted between groups of three digits (default ",")
# separator:: string placed between the whole and fractional parts (default ".")
#
# Returns the formatted String, or +number+ itself when its string form has
# no parts to format (nil or an empty string).
def number_with_delimiter(number, delimiter=",", separator=".")
  whole, *fraction = number.to_s.split('.')
  # Explicit guard in place of a blanket rescue, so unrelated errors are no
  # longer silently swallowed.
  return number if whole.nil?
  grouped = whole.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1#{delimiter}")
  [grouped, *fraction].join(separator)
end
30
+
31
+ # Override to look nice
32
+ def formatted_count item, key_count
33
+ key_count_str = number_with_delimiter(key_count.to_i)
34
+ "%-25s\t%12s" % [item, key_count_str]
35
+ end
36
+ end
37
+
38
+ #
39
+ class CountKeysScript < Wukong::Script
40
+ def map_command
41
+ # Use `cut` to extract the first field
42
+ %Q{ cut -d"\t" -f1 }
43
+ end
44
+
45
+ #
46
+ # There's just the one field
47
+ #
48
+ def default_options
49
+ super.merge :sort_fields => 1
50
+ end
51
+ end
52
+
53
+ # Executes the script when run from command line
54
+ if __FILE__ == $0
55
+ CountKeysScript.new(nil, CountKeysReducer).run
56
+ end
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+
5
+ #
6
+ #
7
+ module CountKeys
8
+ #
9
+ class Mapper < Wukong::Streamer::Base
10
+ attr_accessor :keys_count
11
+ def initialize *args
12
+ self.keys_count = {}
13
+ end
14
# Tallies one occurrence of +key+ in keys_count.
#
# Everything from the first "-" onward (the slug) is dropped before counting.
# The incoming string is left untouched, so frozen or shared keys are safe.
#
# key::  the record's first field
# args:: remaining fields (ignored)
#
# Returns the updated count for the key.
def process key, *args
  bare_key = key.gsub(/-.*/, '') # kill off the slug
  keys_count[bare_key] = keys_count.fetch(bare_key, 0) + 1
end
19
+ def stream *args
20
+ super *args
21
+ self.keys_count.each do |key, count|
22
+ emit [key, count].to_flat
23
+ end
24
+ end
25
+ end
26
+ # Sums the partial counts emitted by the mappers for each key
27
+ class Reducer < Wukong::Streamer::AccumulatingReducer
28
+ attr_accessor :key_count
29
+ require 'active_support'
30
+ require 'action_view/helpers/number_helper'; include ActionView::Helpers::NumberHelper
31
+
32
+ # Override to look nice
33
+ def formatted_count item, key_count
34
+ key_count_str = number_with_delimiter(key_count.to_i, :delimiter => ',')
35
+ "%-25s\t%12s" % [item, key_count_str]
36
+ end
37
+ def start! *args
38
+ self.key_count = 0
39
+ end
40
+ def accumulate key, count
41
+ self.key_count += count.to_i
42
+ end
43
+ def finalize
44
+ yield formatted_count(key, key_count)
45
+ end
46
+ end
47
+
48
+ #
49
+ class Script < Wukong::Script
50
+ # There's just the one field
51
+ def default_options
52
+ super.merge :sort_fields => 1, :reduce_tasks => 1
53
+ end
54
+ end
55
+ end
56
+
57
+ CountKeys::Script.new(CountKeys::Mapper, CountKeys::Reducer).run
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env ruby
2
# Locate the wukong lib directory relative to this script (examples/graph/),
# as the other example scripts do, instead of an absolute path on one machine.
$: << File.dirname(__FILE__) + '/../../lib'
3
+ require 'wukong'
4
+
5
+ #
6
+ # Given an adjacency pairs (from \t to) representation of a directed graph:
7
+ #
8
+ # 1 2
9
+ # 1 7
10
+ # 2 7
11
+ # 2 9
12
+ # 7 2
13
+ #
14
+ # It produces an "adjacency list":http://en.wikipedia.org/wiki/Adjacency_list representation:
15
+ #
16
+ # 1 > 2 7
17
+ # 2 > 7 9
18
+ # 7 > 2
19
+ # 9 >
20
+ #
21
+ # and
22
+ #
23
+ # 1 <
24
+ # 2 < 1 7
25
+ # 7 < 1 2
26
+ # 9 < 2
27
+ #
28
+ # (each column is tab-separated in the actual output)
29
+ #
30
+ #
31
+ #
32
+ module Gen1HoodEdges
33
+ class Mapper < Wukong::Streamer::Base
34
+ def process rsrc, src, dest, *_
35
+ src = src.to_i ; dest = dest.to_i
36
+ yield [ src, '>', dest ]
37
+ yield [ dest, '<', src ]
38
+ end
39
+ end
40
+
41
+ #
42
+ # Accumulate links onto single line.
43
+ #
44
+ # The reduce key is the target node and direction; we just stream through all
45
+ # pairs for each target node and output its neighbor nodes on the same line.
46
+ #
47
+ # To control memory usage, we will print directly to the output (and not run
48
+ # through the Emitter)
49
+ #
50
+ class Reducer < Wukong::Streamer::AccumulatingReducer
51
+ # begin the output line for a new group: target and direction
52
+ def start! target, dir, *args
53
+ print target + "\t" + dir # start line with target and list type
54
+ end
55
+ def accumulate target, dir, neighbor
56
+ print "\t" + neighbor # append neighbor to output, same line
57
+ end
58
+ def finalize
59
+ puts '' # start new line
60
+ end
61
+ end
62
+
63
+ class Script < Wukong::Script
64
+ def default_options
65
+ super.merge :sort_fields => 1, :partition_fields => 1
66
+ end
67
+ end
68
+ end
69
+
70
+ # Execute the script
71
+ Gen1HoodEdges::Script.new(
72
+ Gen1HoodEdges::Mapper,
73
+ Gen1HoodEdges::Reducer
74
+ ).run