wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
require 'wukong/streamer/count_keys'
|
|
5
|
+
require 'wukong/streamer/count_lines'
|
|
6
|
+
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
#
# Reducer that prints each key alongside a human-friendly count.
#
class CountKeysReducer < Wukong::Streamer::CountLines
  #
  # Borrowed from Rails' actionpack ('action_view/helpers/number_helper').
  #
  # Renders +number+ with its integer part grouped in threes.
  #
  # * +delimiter+ - string placed between groups of thousands (default ",")
  # * +separator+ - string placed between integer and fractional part (default ".")
  #
  #   number_with_delimiter(12345678)       => 12,345,678
  #   number_with_delimiter(12345678.05)    => 12,345,678.05
  #   number_with_delimiter(12345678, ".")  => 12.345.678
  #
  # If anything goes wrong while formatting, +number+ is returned untouched.
  #
  def number_with_delimiter(number, delimiter=",", separator=".")
    whole, *fraction = number.to_s.split('.')
    # insert the delimiter after every digit that is followed by a multiple of three digits
    whole.gsub!(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1#{delimiter}")
    [whole, *fraction].join(separator)
  rescue
    number
  end

  # Pretty output line: item left-justified in 25 columns, a tab, then the
  # delimited count right-justified in 12 columns.
  def formatted_count item, key_count
    "%-25s\t%12s" % [item, number_with_delimiter(key_count.to_i)]
  end
end
|
|
37
|
+
|
|
38
|
+
#
|
|
39
|
+
# Script whose map phase is a plain shell command rather than a Ruby mapper.
class CountKeysScript < Wukong::Script
  # The map command: `cut` keeps only the first tab-delimited field of each line.
  def map_command
    " cut -d\"\t\" -f1 "
  end

  #
  # Only one field survives the map command, so request a single sort field
  # on top of the inherited defaults.
  #
  def default_options
    super.merge(:sort_fields => 1)
  end
end
|
|
52
|
+
|
|
53
|
+
# Executes the script when run from command line
|
|
54
|
+
# No mapper class is passed (nil); CountKeysReducer is the reducer.
CountKeysScript.new(nil, CountKeysReducer).run if $0 == __FILE__
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
#
|
|
7
|
+
module CountKeys
|
|
8
|
+
#
|
|
9
|
+
# Mapper that tallies keys in memory and emits the tallies after the input
# has been streamed.
class Mapper < Wukong::Streamer::Base
  attr_accessor :keys_count

  # Start with an empty key => count table. Arguments are accepted but unused.
  def initialize *args
    @keys_count = {}
  end

  # Strip everything from the first '-' onward (the slug) off +key+, in place,
  # then bump that key's tally.
  def process key, *args
    key.gsub!(/-.*/, '') # kill off the slug
    keys_count[key] = keys_count.fetch(key, 0) + 1
  end

  # Run the inherited stream, then emit one flattened [key, count] record per key.
  def stream *args
    super(*args)
    keys_count.each { |key_and_count| emit key_and_count.to_flat }
  end
end
|
|
26
|
+
# Reducer: adds up the per-mapper counts for each key and yields one
# formatted line per key.
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :key_count
  require 'active_support'
  require 'action_view/helpers/number_helper'
  include ActionView::Helpers::NumberHelper

  # Pretty output line: item left-justified in 25 columns, a tab, then the
  # comma-delimited count right-justified in 12 columns.
  def formatted_count item, key_count
    "%-25s\t%12s" % [item, number_with_delimiter(key_count.to_i, :delimiter => ',')]
  end

  # Reset the running total.
  def start! *args
    @key_count = 0
  end

  # Add this record's count (coerced with to_i) to the running total.
  def accumulate key, count
    self.key_count = key_count + count.to_i
  end

  # Hand the formatted total to the caller's block.
  def finalize
    yield formatted_count(key, key_count)
  end
end
|
|
47
|
+
|
|
48
|
+
#
|
|
49
|
+
class Script < Wukong::Script
  # Inherited defaults plus: one sort field (there is only the key) and a
  # single reduce task.
  def default_options
    overrides = { :sort_fields => 1, :reduce_tasks => 1 }
    super.merge(overrides)
  end
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Build the script from this file's Mapper and Reducer and call #run.
# Unlike a `__FILE__ == $0` guard, this executes whenever the file is loaded.
CountKeys::Script.new(CountKeys::Mapper, CountKeys::Reducer).run
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# Put the wukong library on the load path: honor WUKONG_PATH when it is set,
# otherwise fall back to the lib directory relative to this script (the same
# layout the sibling graph examples rely on) instead of a developer-local path.
$: << (ENV['WUKONG_PATH'] || File.dirname(__FILE__)+'/../../lib')
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# Given an adjacency-pairs (from \t to) representation of a directed graph:
|
|
7
|
+
#
|
|
8
|
+
# 1 2
|
|
9
|
+
# 1 7
|
|
10
|
+
# 2 7
|
|
11
|
+
# 2 9
|
|
12
|
+
# 7 2
|
|
13
|
+
#
|
|
14
|
+
# It produces an "adjacency list":http://en.wikipedia.org/wiki/Adjacency_list representation:
|
|
15
|
+
#
|
|
16
|
+
# 1 > 2 7
|
|
17
|
+
# 2 > 7 9
|
|
18
|
+
# 7 > 2
|
|
19
|
+
# 9 >
|
|
20
|
+
#
|
|
21
|
+
# and
|
|
22
|
+
#
|
|
23
|
+
# 1 <
|
|
24
|
+
# 2 < 1 7
|
|
25
|
+
# 7 < 1 2
|
|
26
|
+
# 9 < 2
|
|
27
|
+
#
|
|
28
|
+
# (each column is tab-separated in the actual output)
|
|
29
|
+
#
|
|
30
|
+
#
|
|
31
|
+
#
|
|
32
|
+
module Gen1HoodEdges
|
|
33
|
+
class Mapper < Wukong::Streamer::Base
  # For each edge emit two records: one under the source node marked '>'
  # (outbound) and one under the destination node marked '<' (inbound).
  # Node ids are coerced with to_i; fields after +dest+ are ignored.
  def process rsrc, src, dest, *_
    from_id = src.to_i
    to_id   = dest.to_i
    yield [from_id, '>', to_id]
    yield [to_id, '<', from_id]
  end
end
|
|
40
|
+
|
|
41
|
+
#
|
|
42
|
+
# Accumulate links onto single line.
|
|
43
|
+
#
|
|
44
|
+
# The reduce key is the target node and direction; we just stream through all
|
|
45
|
+
# pairs for each target node and output its neighbor nodes on the same line.
|
|
46
|
+
#
|
|
47
|
+
# To control memory usage, we will print directly to the output (and not run
|
|
48
|
+
# through the Emitter)
|
|
49
|
+
#
|
|
50
|
+
class Reducer < Wukong::Streamer::AccumulatingReducer
  # Open a new output line with the target node and the list type.
  def start! target, dir, *args
    line_head = target + "\t" + dir
    print line_head
  end

  # Append one neighbor to the line currently being written.
  def accumulate target, dir, neighbor
    print("\t" + neighbor)
  end

  # Terminate the current output line.
  def finalize
    puts
  end
end
|
|
62
|
+
|
|
63
|
+
class Script < Wukong::Script
  # Inherited defaults plus one sort field and one partition field.
  # NOTE(review): the Reducer's header comment describes the reduce key as
  # target node *and* direction, yet only one sort field is requested here --
  # confirm that '>' and '<' rows for the same node cannot interleave.
  def default_options
    overrides = { :sort_fields => 1, :partition_fields => 1 }
    super.merge(overrides)
  end
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Execute the script
|
|
71
|
+
Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer).run
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# Use WUKONG_PATH when it is set; otherwise fall back to the lib directory
# relative to this script, so that nil is never pushed onto the load path.
$: << (ENV['WUKONG_PATH'] || File.dirname(__FILE__)+'/../../lib')
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# Use this script to do a Breadth-First Search (BFS) of a graph.
|
|
7
|
+
#
|
|
8
|
+
# Usage:
|
|
9
|
+
# ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
|
|
10
|
+
#
|
|
11
|
+
# For example, given an edge list in the file '1path.tsv' that looks like
|
|
12
|
+
# 1path n1 n2
|
|
13
|
+
# 1path n1 n3
|
|
14
|
+
# ... and so forth ...
|
|
15
|
+
# you can run
|
|
16
|
+
# for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
|
|
17
|
+
# to do a 9-deep breadth-first search.
|
|
18
|
+
#
|
|
19
|
+
module Gen1HoodEdges
|
|
20
|
+
class Mapper < Wukong::Streamer::Base
  attr_accessor :head, :tail

  # Reads the resource names for "head" and "tail" paths from +options+.
  def initialize options
    @head = options[:head]
    @tail = options[:tail]
  end

  # Key each path on the node where a join can happen:
  # * head paths are keyed on their last node, tagged 'i', carrying the nodes before it;
  # * tail paths are keyed on their first node, tagged 'o', carrying the nodes after it.
  # A record whose resource matches both head and tail is emitted both ways.
  def process rsrc, *nodes
    if rsrc == head
      yield [nodes.last, 'i', nodes[0...-1]]
    end
    if rsrc == tail
      yield [nodes.first, 'o', nodes[1..-1]]
    end
  end
end
|
|
31
|
+
|
|
32
|
+
#
|
|
33
|
+
# Accumulate ( !!in memory!!) all inbound links onto middle node
|
|
34
|
+
#
|
|
35
|
+
# Then for each outbound link, loop over those inbound links and emit the
|
|
36
|
+
# triple (in, mid,out)
|
|
37
|
+
#
|
|
38
|
+
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :paths_in, :out_rsrc

  # Reads the resource name stamped on combined paths from +options+.
  def initialize options
    @out_rsrc = options[:out_rsrc]
  end

  # Forget the inbound paths held for the previous key.
  def start! *args
    @paths_in = []
  end

  # 'i' rows are stored; each 'o' row is joined with every inbound path
  # stored so far, yielding [out_rsrc, inbound path, mid, *outbound nodes].
  def accumulate mid, dir, *nodes
    if 'i' == dir
      paths_in << nodes
      held = paths_in.length
      # progress report on stderr: every 1000 paths once more than 10000 are held
      if (held > 10000) && (held % 1000 == 0)
        $stderr.puts ["Accumulating:", mid, held].join("\t")
      end
    elsif 'o' == dir
      paths_in.each { |path_in| yield [out_rsrc, path_in, mid, *nodes] }
    end
  end

  # Nothing to flush: output is produced while accumulating.
  def finalize
  end

  # Group on the middle node only, ignoring the remaining fields.
  def get_key mid, *_
    mid
  end
end
|
|
66
|
+
|
|
67
|
+
class Script < Wukong::Script
  # Inherited defaults plus two sort fields and one partition field.
  def default_options
    overrides = { :sort_fields => 2, :partition_fields => 1 }
    super.merge(overrides)
  end
end
|
|
72
|
+
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Execute the script
|
|
76
|
+
Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer).run
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
# A directed edge: source node and destination node.
class Edge < Struct.new(:src, :dest); end
|
|
7
|
+
|
|
8
|
+
# One row per node pair, with a slot for each relation in each direction.
class MultiEdge < Struct.new(:src, :dest, :a_follows_b, :b_follows_a,
                             :a_replies_b, :b_replies_a, :a_favorites_b, :b_favorites_a)
end
|
|
15
|
+
|
|
16
|
+
module Gen1HoodEdges
|
|
17
|
+
class Mapper < Wukong::Streamer::Base
  # Emit every edge twice so both endpoints can act as the middle of a 2-path:
  # keyed on +dest+ and tagged 'i' (inbound), and keyed on +src+ and tagged
  # 'o' (outbound).
  #
  # Fields after +dest+ are accepted and ignored, as the other graph example
  # mappers do, so wider edge rows no longer raise ArgumentError.
  def process rsrc, src, dest, *_
    # next if (src.to_i == 0) || (dest.to_i == 0)
    yield [ dest, 'i', src ]
    yield [ src, 'o', dest]
  end
end
|
|
24
|
+
|
|
25
|
+
#
|
|
26
|
+
# Accumulate ( !!in memory!!) all inbound links onto middle node
|
|
27
|
+
#
|
|
28
|
+
# Then for each outbound link, loop over those inbound links and emit the
|
|
29
|
+
# triple (in, mid,out)
|
|
30
|
+
#
|
|
31
|
+
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :ins

  # Forget the inbound neighbors held for the previous key.
  def start! *args
    @ins = []
  end

  # 'i' rows are stored; each 'o' row is joined with every inbound neighbor
  # stored so far, yielding ['path_2', inbound, mid, outbound].
  def accumulate mid, dir, node
    if 'i' == dir
      ins << node
      held = ins.length
      # progress report on stderr: every 1000 nodes once more than 10000 are held
      if (held > 10000) && (held % 1000 == 0)
        $stderr.puts ["Accumulating:", mid, held].join("\t")
      end
    elsif 'o' == dir
      ins.each { |inn| yield ['path_2', inn, mid, node] }
    end
  end

  # Nothing to flush: output is produced while accumulating.
  def finalize
  end

  # Group on the middle node only, ignoring the remaining fields.
  def get_key mid, *_
    mid
  end
end
|
|
55
|
+
|
|
56
|
+
class Script < Wukong::Script
  # Inherited defaults plus two sort fields and one partition field.
  def default_options
    overrides = { :sort_fields => 2, :partition_fields => 1 }
    super.merge(overrides)
  end
end
|
|
61
|
+
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Execute the script
|
|
65
|
+
Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer).run
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
require 'wukong/models/graph'; include Wukong::Models
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# Takes any number of flavors of directed edge with the form
|
|
8
|
+
#
|
|
9
|
+
# a_relatesto_b src_id dest_id [optional fields]
|
|
10
|
+
#
|
|
11
|
+
# and prepares a combined adjacency list. You need to supply a model named
|
|
12
|
+
# "MultiEdge" with members for each edge type.
|
|
13
|
+
#
|
|
14
|
+
# For instance, suppose you have a social network with edges like
|
|
15
|
+
#
|
|
16
|
+
# a_follows_b user_a_id user_b_id
|
|
17
|
+
# a_messages_b user_a_id user_b_id message_id date
|
|
18
|
+
# a_favorites_b user_a_id user_b_id message_id date
|
|
19
|
+
#
|
|
20
|
+
# Your MultiEdge class might look like
|
|
21
|
+
#
|
|
22
|
+
# class MultiEdge < Struct.new(
|
|
23
|
+
# :src, :dest,
|
|
24
|
+
# :a_follows_b, :b_follows_a,
|
|
25
|
+
# :a_messages_b, :b_messages_a,
|
|
26
|
+
# :a_favorites_b, :b_favorites_a
|
|
27
|
+
# )
|
|
28
|
+
# end
|
|
29
|
+
#
|
|
30
|
+
# The row for a user pair who follows each other; with user_a #24601 messaging b
|
|
31
|
+
# 57 times and favoriting 5 of user_b's messages; and user_b #8675309 messaging
|
|
32
|
+
# 62 times and favoriting none, will emerge as (tab separated, with [blank]
|
|
33
|
+
# indicating there is no text in that slot):
|
|
34
|
+
#
|
|
35
|
+
# ...
|
|
36
|
+
# 24601 8675309 1 1 57 62 5 [blank]
|
|
37
|
+
# ...
|
|
38
|
+
#
|
|
39
|
+
module GenMultiEdge
|
|
40
|
+
#
|
|
41
|
+
# Emit each relation as
|
|
42
|
+
#
|
|
43
|
+
# src dest rel
|
|
44
|
+
#
|
|
45
|
+
# Canonicalizes the src and dest ids to 10-character, zero-padded strings.
|
|
46
|
+
# (Ten chars fits a 32-bit up-to-4-billion-and-change unsigned integer.)
|
|
47
|
+
# Discards all the ancillary crap except +src+, +dest+ and +rel+
|
|
48
|
+
#
|
|
49
|
+
class Mapper < Wukong::Streamer::Base
  # Turn one "a_<rel>_b..." edge into two records, one per direction:
  #
  #   [src, dest, "a_<rel>_b"]   and   [dest, src, "b_<rel>_a"]
  #
  # with both ids rendered as 10-digit zero-padded strings. Rows whose
  # resource name does not match, or whose src or dest id is 0 after to_i,
  # produce nothing. Fields after +dest+ are ignored.
  def process rsrc, src, dest, *_
    # note that a_retweets_b_id matches here
    matched = /^a_([a-z]+)_b.*/.match(rsrc)
    return unless matched
    rel     = matched.captures.first
    src_id  = src.to_i
    dest_id = dest.to_i
    return if src_id == 0 || dest_id == 0
    src_key  = "%010d" % src_id
    dest_key = "%010d" % dest_id
    yield [src_key, dest_key, "a_#{rel}_b"]
    yield [dest_key, src_key, "b_#{rel}_a"]
  end
end
|
|
60
|
+
|
|
61
|
+
#
|
|
62
|
+
# Aggregate all sightings of relations for each pair into
|
|
63
|
+
# a single combined
|
|
64
|
+
#
|
|
65
|
+
# Note that [a,b] and [b,a] /each/ have a listing, with the a->b and b<-a
|
|
66
|
+
# relations repeated for each. That is, if there is an "a_messages_b"
|
|
67
|
+
# relation, you'll have edges
|
|
68
|
+
#
|
|
69
|
+
# x y ... a_messages_b(x,y) b_messages_a(y,x) ...
|
|
70
|
+
# y x ... a_messages_b(y,x) b_messages_a(x,y) ...
|
|
71
|
+
#
|
|
72
|
+
#
|
|
73
|
+
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :multi_edge

  # Records are grouped by the [src, dest] pair.
  def get_key src, dest, rel
    [src, dest]
  end

  # Begin each pair with a blank MultiEdge.
  def start! *args
    @multi_edge = MultiEdge.new
  end

  # Count one more sighting of +rel+ for the current pair.
  def accumulate src, dest, rel
    multi_edge[rel] = (multi_edge[rel] || 0) + 1
  end

  # Fill in the pair's ids from the key and yield the finished record.
  def finalize
    pair_src, pair_dest = key
    multi_edge.src  = pair_src
    multi_edge.dest = pair_dest
    yield multi_edge
  end
end
|
|
90
|
+
|
|
91
|
+
#
|
|
92
|
+
# Sort on the first two keys: each @[src, dest]@ pair winds up at the same
|
|
93
|
+
# reducer.
|
|
94
|
+
#
|
|
95
|
+
class Script < Wukong::Script
  # Inherited defaults plus two sort fields (the src and dest ids).
  def default_options
    super.merge(:sort_fields => 2)
  end
end
|
|
100
|
+
|
|
101
|
+
# Execute the script
|
|
102
|
+
# Still inside the GenMultiEdge module, so the bare constants resolve to
# GenMultiEdge::Script, ::Mapper and ::Reducer.
Script.new(Mapper, Reducer).run
|
|
103
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
# A directed edge: source node and destination node.
class Edge < Struct.new(:src, :dest); end
|
|
7
|
+
|
|
8
|
+
# Edge subtype used by this script for candidate symmetric links; adds no members.
class ASymmetricB < Edge; end
|
|
10
|
+
|
|
11
|
+
module Wukong::Streamer
  # Streamer that turns each input line into a single ASymmetricB edge.
  class EdgeStreamer < Wukong::Streamer::Base
    # Split the line with the inherited recordize, skip the first field,
    # and wrap the next two (coerced with to_i) in a one-element array
    # holding an ASymmetricB. Any further fields are dropped.
    def recordize line
      _rsrc, src, dest = super(line)
      edge = ASymmetricB.new(src.to_i, dest.to_i)
      [edge]
    end
  end
end
|
|
19
|
+
|
|
20
|
+
module FindSymmetricLinks
|
|
21
|
+
|
|
22
|
+
class Mapper < Wukong::Streamer::EdgeStreamer
  # Emit the edge as given and then its mirror image (dest -> src), each
  # flattened with to_flat(false).
  def process edge
    yield edge.to_flat(false)
    mirrored = ASymmetricB.new(edge.dest, edge.src)
    yield mirrored.to_flat(false)
  end
end
|
|
28
|
+
|
|
29
|
+
#
|
|
30
|
+
#
|
|
31
|
+
class Reducer < Wukong::Streamer::Base
  # Pipe the input through `uniq -c`, keep only lines seen exactly twice,
  # and emit each such pair once (the ordering with src < dest).
  #
  # Each `uniq -c` line is split on whitespace into at most four pieces:
  # count, a field that is skipped, src, and "everything else" as dest.
  # NOTE(review): because of the limit of 4, any columns after dest stay
  # glued to dest rather than landing in a field of their own.
  # NOTE(review): this expects a leading resource column after the count --
  # confirm the mapper's to_flat(false) output actually carries one.
  def stream
    uniq_output = %x{/usr/bin/uniq -c}
    uniq_output.split("\n").each do |line|
      key_count, _rsrc, src, dest = line.chomp.strip.split(/\s+/, 4)
      next if key_count.to_i != 2
      next if src.to_i >= dest.to_i
      emit [src, dest].compact
    end
  end
end
|
|
41
|
+
|
|
42
|
+
class Script < Wukong::Script
  # Inherited defaults plus three sort fields.
  def default_options
    super.merge(:sort_fields => 3)
  end
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Execute the script
|
|
50
|
+
# Run through FindSymmetricLinks::Script rather than the bare Wukong::Script,
# so the default_options override defined in this file (:sort_fields => 3)
# actually takes effect, as in the other example scripts.
FindSymmetricLinks::Script.new(
  FindSymmetricLinks::Mapper,
  FindSymmetricLinks::Reducer
).run
|