wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+ require 'wukong/streamer/count_keys'
5
+ require 'wukong/streamer/count_lines'
6
+
7
+ #
8
+ #
9
class CountKeysReducer < Wukong::Streamer::CountLines
  #
  # Borrowed from the actionpack Rails component ('action_view/helpers/number_helper')
  #
  # Renders +number+ with the digits left of the decimal point grouped in threes.
  # * <tt>delimiter</tt> - string inserted between digit groups, defaults to ","
  # * <tt>separator</tt> - string joining the integer and fractional parts, defaults to "."
  #
  #  number_with_delimiter(12345678)      => 12,345,678
  #  number_with_delimiter(12345678.05)   => 12,345,678.05
  #  number_with_delimiter(12345678, ".") => 12.345.678
  #
  # If formatting raises for any reason, +number+ is returned untouched.
  def number_with_delimiter(number, delimiter=",", separator=".")
    whole, *fraction = number.to_s.split('.')
    # Insert the delimiter after every digit that is followed by a multiple of three digits.
    whole.gsub!(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1#{delimiter}")
    [whole, *fraction].join(separator)
  rescue
    number
  end

  # Output line: the item left-justified in 25 columns, a tab, then the
  # delimited count right-justified in 12 columns.
  def formatted_count item, key_count
    pretty_count = number_with_delimiter(key_count.to_i)
    format("%-25s\t%12s", item, pretty_count)
  end
end
37
+
38
+ #
39
class CountKeysScript < Wukong::Script
  # Shell command used as the map step: `cut` keeps only the first
  # tab-delimited field of every input line.
  def map_command
    %Q{ cut -d"\t" -f1 }
  end

  #
  # Only one field survives the map step, so sort on a single field.
  # Everything else comes from the inherited defaults.
  #
  def default_options
    inherited = super
    inherited.merge(:sort_fields => 1)
  end
end
52
+
53
+ # Executes the script when run from command line
54
# No mapper class is passed (nil); only the reducer class is supplied.
CountKeysScript.new(nil, CountKeysReducer).run if __FILE__ == $0
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+
5
+ #
6
+ #
7
module CountKeys
  #
  # Tallies keys in memory while the input streams through, then emits one
  # [key, count] record per distinct key after the parent's stream returns.
  #
  class Mapper < Wukong::Streamer::Base
    # Running tally: key => number of records seen with that key.
    attr_accessor :keys_count

    # Passes the constructor arguments on to the parent initializer (so any
    # state the base streamer sets up is not skipped), then starts with an
    # empty tally.
    def initialize *args
      super(*args)
      self.keys_count = {}
    end

    # Counts one sighting of +key+. Everything from the first '-' onward is
    # removed from the key (in place) before counting; the remaining fields
    # of the record are ignored.
    def process key, *args
      key.gsub!(/-.*/, '') # kill off the slug
      self.keys_count[key] ||= 0
      self.keys_count[key]  += 1
    end

    # Runs the parent's stream first, then emits the accumulated totals.
    def stream *args
      super(*args)
      self.keys_count.each do |key, count|
        emit [key, count].to_flat
      end
    end
  end

  #
  # Adds up the partial counts received for each key and yields one
  # formatted line per key.
  #
  class Reducer < Wukong::Streamer::AccumulatingReducer
    # Sum of the counts accumulated so far for the current key.
    attr_accessor :key_count
    require 'active_support'
    require 'action_view/helpers/number_helper'; include ActionView::Helpers::NumberHelper

    # Output line: the item left-justified in 25 columns, a tab, then the
    # comma-delimited count right-justified in 12 columns.
    def formatted_count item, key_count
      key_count_str = number_with_delimiter(key_count.to_i, :delimiter => ',')
      "%-25s\t%12s" % [item, key_count_str]
    end

    # Reset the running total.
    def start! *args
      self.key_count = 0
    end

    # Fold one partial count into the running total.
    def accumulate key, count
      self.key_count += count.to_i
    end

    # Yield the formatted total for the current key.
    def finalize
      yield formatted_count(key, key_count)
    end
  end

  #
  class Script < Wukong::Script
    # There's just the one field; a single reduce task is requested.
    def default_options
      super.merge :sort_fields => 1, :reduce_tasks => 1
    end
  end
end
56
+
57
# Execute the script
CountKeys::Script.new(
  CountKeys::Mapper,
  CountKeys::Reducer
  ).run
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env ruby
2
# Honour WUKONG_PATH when it is set; otherwise locate the library relative to
# this script instead of relying on one developer's home directory.
$: << (ENV['WUKONG_PATH'] || File.dirname(__FILE__)+'/../../lib')
3
+ require 'wukong'
4
+
5
+ #
6
+ # Given an adjacency-pair (from \t to) representation of a directed graph:
7
+ #
8
+ # 1 2
9
+ # 1 7
10
+ # 2 7
11
+ # 2 9
12
+ # 7 2
13
+ #
14
+ # It produces an "adjacency list":http://en.wikipedia.org/wiki/Adjacency_list representation:
15
+ #
16
+ # 1 > 2 7
17
+ # 2 > 7 9
18
+ # 7 > 2
19
+ # 9 >
20
+ #
21
+ # and
22
+ #
23
+ # 1 <
24
+ # 2 < 1 7
25
+ # 7 < 1 2
26
+ # 9 < 2
27
+ #
28
+ # (each column is tab-separated in the actual output)
29
+ #
30
+ #
31
+ #
32
module Gen1HoodEdges
  #
  # Emits every edge twice: once under its source as an outbound ('>') link
  # and once under its destination as an inbound ('<') link. The first field
  # of the record is ignored, as are any fields after +dest+.
  #
  class Mapper < Wukong::Streamer::Base
    def process rsrc, src, dest, *_
      src = src.to_i ; dest = dest.to_i
      yield [ src,  '>', dest ]
      yield [ dest, '<', src  ]
    end
  end

  #
  # Accumulate links onto single line.
  #
  # The reduce key is the target node and direction; we just stream through all
  # pairs for each (target, direction) and output its neighbor nodes on the
  # same line.
  #
  # To control memory usage, we will print directly to the output (and not run
  # through the Emitter)
  #
  class Reducer < Wukong::Streamer::AccumulatingReducer
    # Group on node *and* direction, so the '>' and '<' lists of a node are
    # written as separate lines rather than run together on one.
    def get_key target, dir, *_
      [target, dir]
    end
    # start line with target and list type
    def start! target, dir, *args
      print target + "\t" + dir
    end
    # append neighbor to output, same line
    def accumulate target, dir, neighbor
      print "\t" + neighbor
    end
    # start new line
    def finalize
      puts ''
    end
  end

  class Script < Wukong::Script
    # Sort on node and direction so each (node, direction) group arrives
    # contiguously; partition on the node alone.
    def default_options
      super.merge :sort_fields => 2, :partition_fields => 1
    end
  end
end
69
+
70
+ # Execute the script
71
Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer).run
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
# Use WUKONG_PATH when it is set; otherwise fall back to the lib directory
# relative to this script, so an unset variable never pushes nil onto the
# load path.
$: << (ENV['WUKONG_PATH'] || File.dirname(__FILE__)+'/../../lib')
3
+ require 'wukong'
4
+
5
+ #
6
+ # Use this script to do a Breadth-First Search (BFS) of a graph.
7
+ #
8
+ # Usage:
9
+ # ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
10
+ #
11
+ # For example, given an edge list in the file '1path.tsv' that looks like
12
+ # 1path n1 n2
13
+ # 1path n1 n3
14
+ # ... and so forth ...
15
+ # you can run
16
+ # for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
17
+ # to do a 9-deep breadth-first search.
18
+ #
19
module Gen1HoodEdges
  #
  # Routes each path to the node it will be joined on.
  #
  # Records whose first field equals +head+ are keyed on their last node and
  # tagged 'i'; records whose first field equals +tail+ are keyed on their
  # first node and tagged 'o'. A record matching both is emitted both ways.
  #
  class Mapper < Wukong::Streamer::Base
    attr_accessor :head, :tail
    # Reads the :head and :tail entries from +options+.
    def initialize options
      super(options) # let the parent streamer do its own setup with the same options
      self.head = options[:head]
      self.tail = options[:tail]
    end
    def process rsrc, *nodes
      yield [ nodes.last,  'i', nodes[0..-2] ] if (rsrc == self.head)
      yield [ nodes.first, 'o', nodes[1..-1] ] if (rsrc == self.tail)
    end
  end

  #
  # Accumulate ( !!in memory!!) all inbound paths onto middle node
  #
  # Then for each outbound path, loop over those inbound paths and emit the
  # combined record (out_rsrc, path in, mid, path out)
  #
  # NOTE(review): an 'o' record is only joined against the 'i' records that
  # arrived before it within the same group -- presumably the two-field sort
  # puts 'i' ahead of 'o'; confirm.
  #
  class Reducer < Wukong::Streamer::AccumulatingReducer
    attr_accessor :paths_in, :out_rsrc
    # Reads the :out_rsrc entry from +options+; it becomes the first field of
    # every record this reducer yields.
    def initialize options
      super(options) # let the parent reducer do its own setup with the same options
      self.out_rsrc = options[:out_rsrc]
    end
    # clear the list of incoming paths
    def start! *args
      self.paths_in = []
    end
    def accumulate mid, dir, *nodes
      case dir
      when 'i'
        self.paths_in << nodes
        # Progress note on stderr every 1000 paths once past 10000.
        if (self.paths_in.length % 1000 == 0) && (self.paths_in.length > 10000)
          $stderr.puts ["Accumulating:", mid, self.paths_in.length].join("\t")
        end
      when 'o'
        paths_in.each do |path_in|
          yield [self.out_rsrc, path_in, mid, *nodes]
        end
      end
    end
    # Nothing to flush: records are yielded as the 'o' rows arrive.
    def finalize
    end
    # Group on the middle node only.
    def get_key mid, *_
      mid
    end
  end

  class Script < Wukong::Script
    def default_options
      super.merge :sort_fields => 2, :partition_fields => 1
    end
  end

end
74
+
75
+ # Execute the script
76
Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer).run
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+
5
# A directed edge: a source node and a destination node.
class Edge < Struct.new(:src, :dest); end
7
+
8
# One record per (src, dest) pair, with a slot for each relation type in
# each direction.
class MultiEdge < Struct.new(*%w[
    src           dest
    a_follows_b   b_follows_a
    a_replies_b   b_replies_a
    a_favorites_b b_favorites_a
  ].map { |field| field.to_sym })
end
15
+
16
module Gen1HoodEdges
  #
  # Files each edge under both endpoints: under +dest+ as an inbound ('i')
  # link and under +src+ as an outbound ('o') link. The first field of the
  # record is ignored; any fields after +dest+ are accepted and ignored too,
  # so records with extra columns do not raise an arity error.
  #
  class Mapper < Wukong::Streamer::Base
    def process rsrc, src, dest, *_
      # next if (src.to_i == 0) || (dest.to_i == 0)
      yield [ dest, 'i', src ]
      yield [ src,  'o', dest]
    end
  end

  #
  # Accumulate ( !!in memory!!) all inbound links onto middle node
  #
  # Then for each outbound link, loop over those inbound links and emit the
  # triple (in, mid,out), prefixed with the literal 'path_2'.
  #
  # NOTE(review): an 'o' record is only joined against the 'i' records that
  # arrived before it within the same group -- presumably the two-field sort
  # puts 'i' ahead of 'o'; confirm.
  #
  class Reducer < Wukong::Streamer::AccumulatingReducer
    # Inbound neighbors collected so far for the current middle node.
    attr_accessor :ins
    # Forget the previous node's inbound links.
    def start! *args
      self.ins = []
    end
    def accumulate mid, dir, node
      case dir
      when 'i'
        self.ins << node
        # Progress note on stderr every 1000 links once past 10000.
        if (self.ins.length % 1000 == 0) && (self.ins.length > 10000)
          $stderr.puts ["Accumulating:", mid, self.ins.length].join("\t")
        end
      when 'o'
        ins.each do |inn|
          yield ['path_2', inn, mid, node]
        end
      end
    end
    # Nothing to flush: records are yielded as the 'o' rows arrive.
    def finalize
    end
    # Group on the middle node only.
    def get_key mid, *_
      mid
    end
  end

  class Script < Wukong::Script
    def default_options
      super.merge :sort_fields => 2, :partition_fields => 1
    end
  end

end
63
+
64
+ # Execute the script
65
Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer).run
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+ require 'wukong/models/graph'; include Wukong::Models
5
+
6
+ #
7
+ # Takes any number of flavors of directed edge with the form
8
+ #
9
+ # a_relatesto_b src_id dest_id [optional fields]
10
+ #
11
+ # and prepares a combined adjacency list. You need to supply a model named
12
+ # "MultiEdge" with members for each edge type.
13
+ #
14
+ # For instance, suppose you have a social network with edges like
15
+ #
16
+ # a_follows_b user_a_id user_b_id
17
+ # a_messages_b user_a_id user_b_id message_id date
18
+ # a_favorites_b user_a_id user_b_id message_id date
19
+ #
20
+ # Your MultiEdge class might look like
21
+ #
22
+ # class MultiEdge < Struct.new(
23
+ # :src, :dest,
24
+ # :a_follows_b, :b_follows_a,
25
+ # :a_messages_b, :b_messages_a,
26
+ # :a_favorites_b, :b_favorites_a
27
+ # )
28
+ # end
29
+ #
30
+ # The row for a user pair who follows each other; with user_a #24601 messaging b
31
+ # 57 times and favoriting 5 of user_b's messages; and user_b #8675309 messaging
32
+ # 62 times and favoriting none, will emerge as (tab separated, with [blank]
33
+ # indicating there is no text in that slot):
34
+ #
35
+ # ...
36
+ # 24601 8675309 1 1 57 62 5 [blank]
37
+ # ...
38
+ #
39
module GenMultiEdge
  #
  # Emits each relation twice, once from either endpoint's point of view:
  #
  #   src  dest a_<rel>_b
  #   dest src  b_<rel>_a
  #
  # Both ids are converted to integers and written as 10-character,
  # zero-padded strings. (Ten chars fits a 32-bit up-to-4-billion-and-change
  # unsigned integer.) Everything other than +src+, +dest+ and the relation
  # name is dropped. Records whose first field does not look like
  # a_<rel>_b..., or whose ids convert to 0, produce nothing.
  #
  class Mapper < Wukong::Streamer::Base
    def process rsrc, src, dest, *_
      # note that a_retweets_b_id matches here
      matched = /^a_([a-z]+)_b.*/.match(rsrc)
      return unless matched
      relation = matched[1]
      from_id  = src.to_i
      to_id    = dest.to_i
      return if from_id.zero? || to_id.zero?
      yield [format("%010d", from_id), format("%010d", to_id), "a_#{relation}_b"]
      yield [format("%010d", to_id), format("%010d", from_id), "b_#{relation}_a"]
    end
  end

  #
  # Aggregate all sightings of relations for each pair into a single combined
  # MultiEdge record holding a count per relation.
  #
  # Note that [a,b] and [b,a] /each/ have a listing, with the a->b and b<-a
  # relations repeated for each. That is, if there is an "a_messages_b"
  # relation, you'll have edges
  #
  #   x y ... a_messages_b(x,y) b_messages_a(y,x) ...
  #   y x ... a_messages_b(y,x) b_messages_a(x,y) ...
  #
  class Reducer < Wukong::Streamer::AccumulatingReducer
    # The record being built for the current [src, dest] pair.
    attr_accessor :multi_edge

    # Group on the pair of endpoints.
    def get_key src, dest, rel
      [src, dest]
    end

    # Begin a fresh, empty record.
    def start! *args
      self.multi_edge = MultiEdge.new
    end

    # Bump the counter stored in the slot named by +rel+.
    def accumulate src, dest, rel
      seen_so_far = multi_edge[rel] || 0
      multi_edge[rel] = seen_so_far + 1
    end

    # Fill in the endpoints from the group key and yield the finished record.
    def finalize
      src, dest = key
      multi_edge.src  = src
      multi_edge.dest = dest
      yield multi_edge
    end
  end

  #
  # Sort on the first two keys: each @[src, dest]@ pair winds up at the same
  # reducer.
  #
  class Script < Wukong::Script
    def default_options
      inherited = super
      inherited.merge(:sort_fields => 2)
    end
  end

  # Execute the script
  Script.new(Mapper, Reducer).run
end
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+
5
# A directed edge: a source node and a destination node.
class Edge < Struct.new(:src, :dest); end
7
+
8
# Edge subclass used for the records this script emits; adds no members of its own.
class ASymmetricB < Edge; end
10
+
11
module Wukong::Streamer
  # Streamer that turns each raw input line into a single ASymmetricB edge.
  class EdgeStreamer < Wukong::Streamer::Base
    # Splits +line+ with the parent recordizer, takes the second and third
    # fields as integer endpoints (the first field and any extras are
    # discarded), and returns the edge wrapped in a one-element array.
    def recordize line
      _rsrc, from, to = super(line)
      edge = ASymmetricB.new(from.to_i, to.to_i)
      [edge]
    end
  end
end
19
+
20
module FindSymmetricLinks

  #
  # Emits every edge as-is and again with its endpoints swapped, so that a
  # pair linked in both directions produces each flattened record twice.
  #
  class Mapper < Wukong::Streamer::EdgeStreamer
    def process edge
      yield edge.to_flat(false)
      yield ASymmetricB.new(edge.dest, edge.src).to_flat(false)
    end
  end

  #
  # Pipes the reducer's input through `uniq -c` and keeps the records that
  # occurred exactly twice, emitting each such pair once (the ordering with
  # src < dest).
  #
  class Reducer < Wukong::Streamer::Base
    def stream
      %x{/usr/bin/uniq -c}.split("\n").each do |line|
        # `uniq -c` prefixes each distinct line with its repeat count. Split
        # into at most five pieces so anything after +dest+ lands in +data+
        # instead of being glued onto +dest+.
        key_count, rsrc, src, dest, data = line.strip.split(/\s+/, 5)
        next unless key_count.to_i == 2
        next unless src.to_i < dest.to_i
        emit [src, dest, data].compact
      end
    end
  end

  class Script < Wukong::Script
    # Sort on all three leading fields so identical records are adjacent.
    def default_options
      super.merge :sort_fields => 3
    end
  end
end
48
+
49
+ # Execute the script
50
# Run through FindSymmetricLinks::Script (not the bare Wukong::Script) so the
# default_options override defined on it actually takes effect.
FindSymmetricLinks::Script.new(
  FindSymmetricLinks::Mapper,
  FindSymmetricLinks::Reducer
  ).run