mrflip-wukong 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +202 -0
- data/README-tutorial.textile +163 -0
- data/README.textile +165 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/UsingWukong-part1.textile +2 -0
- data/doc/UsingWukong-part2.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-setup.textile +21 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +65 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +112 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +40 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +39 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +20 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +173 -0
- metadata +208 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Add a custom Wukong checkout to the load path only when WUKONG_PATH is set.
# Appending unconditionally would push nil onto $: when the variable is
# missing, and a nil load-path entry can make later `require` calls fail.
$: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Use this script to do a Breadth-First Search (BFS) of a graph.
|
7
|
+
#
|
8
|
+
# Usage:
|
9
|
+
# ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
|
10
|
+
#
|
11
|
+
# For example, given an edge list in the file '1path.tsv' that looks like
|
12
|
+
# 1path n1 n2
|
13
|
+
# 1path n1 n3
|
14
|
+
# ... and so forth ...
|
15
|
+
# you can run
|
16
|
+
# for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
|
17
|
+
# to do a 9-deep breadth-first search.
|
18
|
+
#
|
19
|
+
module Gen1HoodEdges
|
20
|
+
# Re-keys each path record by the node on which two paths can be joined.
#
# Records whose resource name equals +head+ are emitted keyed by their last
# node and tagged 'i'; records whose resource name equals +tail+ are emitted
# keyed by their first node and tagged 'o'.
class Mapper < Wukong::Streamer::Base
  attr_accessor :head, :tail

  # options[:head] and options[:tail] hold the resource names to match.
  def initialize(options)
    @head = options[:head]
    @tail = options[:tail]
  end

  # rsrc  -- resource name of the incoming record
  # nodes -- the remaining fields of the record
  def process(rsrc, *nodes)
    if rsrc == head
      leading = nodes[0..-2]
      yield [nodes.last, 'i', leading]
    end
    if rsrc == tail
      trailing = nodes[1..-1]
      yield [nodes.first, 'o', trailing]
    end
  end
end
|
31
|
+
|
32
|
+
#
|
33
|
+
# Accumulate ( !!in memory!!) all inbound links onto middle node
|
34
|
+
#
|
35
|
+
# Then for each outbound link, loop over those inbound links and emit the
|
36
|
+
# triple (in, mid,out)
|
37
|
+
#
|
38
|
+
# Joins inbound and outbound paths that meet at the same middle node.
#
# 'i' records are buffered in +paths_in+; every 'o' record is then combined
# with each buffered inbound path and emitted under the +out_rsrc+ name.
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :paths_in, :out_rsrc

  # options[:out_rsrc] is the resource name placed first in emitted records.
  def initialize(options)
    @out_rsrc = options[:out_rsrc]
  end

  # Reset the buffer of inbound paths.
  def start!(*args)
    @paths_in = []
  end

  # mid   -- the shared middle node (also the grouping key, see get_key)
  # dir   -- 'i' for an inbound path, 'o' for an outbound path
  # nodes -- the remaining fields of the record
  def accumulate(mid, dir, *nodes)
    case dir
    when 'i'
      paths_in << nodes
      buffered = paths_in.length
      # Report progress every 1000 records once the buffer passes 10000.
      $stderr.puts ["Accumulating:", mid, buffered].join("\t") if (buffered > 10000) && (buffered % 1000 == 0)
    when 'o'
      paths_in.each { |path_in| yield [out_rsrc, path_in, mid, *nodes] }
    end
  end

  # Nothing to emit at the end of a group; output happens in accumulate.
  def finalize; end

  # Records are grouped by their first field.
  def get_key(mid, *_)
    mid
  end
end
|
66
|
+
|
67
|
+
# Script wrapper that overrides two of the inherited default options.
class Script < Wukong::Script
  # Inherited defaults with :sort_fields and :partition_fields replaced.
  def default_options
    overrides = { :sort_fields => 2, :partition_fields => 1 }
    super.merge(overrides)
  end
end
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
# Execute the script
|
76
|
+
# Build the script from this file's mapper and reducer, then run it.
script = Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer)
script.run
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
# A directed edge with two members: +src+ and +dest+.
class Edge < Struct.new(:src, :dest); end
|
7
|
+
|
8
|
+
# Record for a (src, dest) pair with one slot per relation and direction:
# a_<rel>_b and b_<rel>_a for each of follows, replies and favorites.
class MultiEdge < Struct.new(
  :src, :dest,
  *%w[follows replies favorites].flat_map { |rel| [:"a_#{rel}_b", :"b_#{rel}_a"] }
)
end
|
15
|
+
|
16
|
+
module Gen1HoodEdges
|
17
|
+
# Emits every edge twice: once keyed by its destination (tagged 'i') and
# once keyed by its source (tagged 'o').
class Mapper < Wukong::Streamer::Base
  # rsrc is accepted but not used; src and dest are the edge endpoints.
  def process(rsrc, src, dest)
    # (disabled) next if (src.to_i == 0) || (dest.to_i == 0)
    inbound  = [dest, 'i', src]
    outbound = [src, 'o', dest]
    yield inbound
    yield outbound
  end
end
|
24
|
+
|
25
|
+
#
|
26
|
+
# Accumulate ( !!in memory!!) all inbound links onto middle node
|
27
|
+
#
|
28
|
+
# Then for each outbound link, loop over those inbound links and emit the
|
29
|
+
# triple (in, mid,out)
|
30
|
+
#
|
31
|
+
# Builds two-step paths through a middle node: 'i' records are buffered in
# +ins+, and each 'o' record is paired with every buffered inbound node.
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :ins

  # Reset the buffer of inbound nodes.
  def start!(*args)
    @ins = []
  end

  # mid  -- the middle node (also the grouping key, see get_key)
  # dir  -- 'i' for an inbound link, 'o' for an outbound link
  # node -- the node at the other end of the link
  def accumulate(mid, dir, node)
    case dir
    when 'i'
      ins << node
      buffered = ins.length
      # Report progress every 1000 records once the buffer passes 10000.
      $stderr.puts ["Accumulating:", mid, buffered].join("\t") if (buffered > 10000) && (buffered % 1000 == 0)
    when 'o'
      ins.each { |inn| yield ['path_2', inn, mid, node] }
    end
  end

  # Nothing to emit at the end of a group; output happens in accumulate.
  def finalize; end

  # Records are grouped by their first field.
  def get_key(mid, *_)
    mid
  end
end
|
55
|
+
|
56
|
+
# Script wrapper that overrides two of the inherited default options.
class Script < Wukong::Script
  # Inherited defaults with :sort_fields and :partition_fields replaced.
  def default_options
    overrides = { :sort_fields => 2, :partition_fields => 1 }
    super.merge(overrides)
  end
end
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
# Execute the script
|
65
|
+
# Build the script from this file's mapper and reducer, then run it.
script = Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer)
script.run
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/models/graph'; include Wukong::Models
|
5
|
+
|
6
|
+
#
|
7
|
+
# Takes any number of flavors of directed edge with the form
|
8
|
+
#
|
9
|
+
# a_relatesto_b src_id dest_id [optional fields]
|
10
|
+
#
|
11
|
+
# and prepares a combined adjacency list. You need to supply a model named
|
12
|
+
# "MultiEdge" with members for each edge type.
|
13
|
+
#
|
14
|
+
# For instance, suppose you have a social network with edges like
|
15
|
+
#
|
16
|
+
# a_follows_b user_a_id user_b_id
|
17
|
+
# a_messages_b user_a_id user_b_id message_id date
|
18
|
+
# a_favorites_b user_a_id user_b_id message_id date
|
19
|
+
#
|
20
|
+
# Your MultiEdge class might look like
|
21
|
+
#
|
22
|
+
# class MultiEdge < Struct.new(
|
23
|
+
# :src, :dest,
|
24
|
+
# :a_follows_b, :b_follows_a,
|
25
|
+
# :a_messages_b, :b_messages_a,
|
26
|
+
# :a_favorites_b, :b_favorites_a
|
27
|
+
# )
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# The row for a user pair who follows each other; with user_a #24601 messaging b
|
31
|
+
# 57 times and favoriting 5 of user_b's messages; and user_b #8675309 messaging
|
32
|
+
# 62 times and favoriting none, will emerge as (tab separated, with [blank]
|
33
|
+
# indicating there is no text in that slot):
|
34
|
+
#
|
35
|
+
# ...
|
36
|
+
# 24601 8675309 1 1 57 62 5 [blank]
|
37
|
+
# ...
|
38
|
+
#
|
39
|
+
module GenMultiEdge
|
40
|
+
#
|
41
|
+
# Emit each relation as
|
42
|
+
#
|
43
|
+
# src dest rel
|
44
|
+
#
|
45
|
+
# Canonicalizes the src and dest ids to 10-character, zero-padded strings.
|
46
|
+
# (Ten chars fits a 32-bit up-to-4-billion-and-change unsigned integer.)
|
47
|
+
# Discards all the ancillary crap except +src+, +dest+ and +rel+
|
48
|
+
#
|
49
|
+
# Turns each "a_<rel>_b" record into two (src, dest, relation) rows, one per
# direction, with both ids rendered as 10-digit zero-padded strings.
class Mapper < Wukong::Streamer::Base
  # rsrc      -- resource name; records not starting with a_<rel>_b are dropped
  # src, dest -- endpoint ids; records where either converts to 0 are dropped
  # Any further fields are ignored.
  def process(rsrc, src, dest, *_)
    # The trailing .* means names such as a_retweets_b_id match as well.
    matched = /^a_([a-z]+)_b.*/.match(rsrc)
    return unless matched
    rel = matched.captures.first

    src_id  = src.to_i
    dest_id = dest.to_i
    return if (src_id == 0) || (dest_id == 0)

    padded_src  = "%010d" % src_id
    padded_dest = "%010d" % dest_id
    yield [padded_src, padded_dest, "a_#{rel}_b"]
    yield [padded_dest, padded_src, "b_#{rel}_a"]
  end
end
|
60
|
+
|
61
|
+
#
|
62
|
+
# Aggregate all sightings of relations for each pair into
|
63
|
+
# a single combined MultiEdge record.
|
64
|
+
#
|
65
|
+
# Note that [a,b] and [b,a] /each/ have a listing, with the a->b and b<-a
|
66
|
+
# relations repeated for each. That is, if there is an "a_messages_b"
|
67
|
+
# relation, you'll have edges
|
68
|
+
#
|
69
|
+
# x y ... a_messages_b(x,y) b_messages_a(y,x) ...
|
70
|
+
# y x ... a_messages_b(y,x) b_messages_a(x,y) ...
|
71
|
+
#
|
72
|
+
#
|
73
|
+
# Counts, per (src, dest) pair, how many times each relation was seen and
# emits the counts as a single MultiEdge.
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :multi_edge

  # Records are grouped by their first two fields.
  def get_key(src, dest, rel)
    [src, dest]
  end

  # Begin each group with an empty MultiEdge.
  def start!(*args)
    @multi_edge = MultiEdge.new
  end

  # Increment the slot named by +rel+, treating an unset slot as zero.
  def accumulate(src, dest, rel)
    seen_so_far = multi_edge[rel] || 0
    multi_edge[rel] = seen_so_far + 1
  end

  # Fill in the endpoints from the group key and emit the record.
  def finalize
    src, dest = key
    multi_edge.src  = src
    multi_edge.dest = dest
    yield multi_edge
  end
end
|
90
|
+
|
91
|
+
#
|
92
|
+
# Sort on the first two keys: each @[src, dest]@ pair winds up at the same
|
93
|
+
# reducer.
|
94
|
+
#
|
95
|
+
# Script wrapper that overrides the inherited :sort_fields default.
class Script < Wukong::Script
  # Inherited defaults with :sort_fields replaced.
  def default_options
    overrides = { :sort_fields => 2 }
    super.merge(overrides)
  end
end

# Execute the script
script = Script.new(Mapper, Reducer)
script.run
|
103
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
# A directed edge with two members: +src+ and +dest+.
class Edge < Struct.new(:src, :dest); end

# Edge subclass used for the records this script streams.
class ASymmetricB < Edge; end
|
10
|
+
|
11
|
+
module Wukong::Streamer
  # Streamer whose records are single ASymmetricB edges.
  class EdgeStreamer < Wukong::Streamer::Base
    # Takes the second and third fields of the parent's record for +line+,
    # converts them to integers, and wraps them in a one-element array
    # holding an ASymmetricB. The first field and any extras are ignored.
    def recordize(line)
      _rsrc, src, dest = super(line)
      edge = ASymmetricB.new(src.to_i, dest.to_i)
      [edge]
    end
  end
end
|
19
|
+
|
20
|
+
module FindSymmetricLinks
|
21
|
+
|
22
|
+
# Emits each edge in both directions, so a pair linked both ways produces
# identical rows.
class Mapper < Wukong::Streamer::EdgeStreamer
  # edge -- an object responding to src, dest and to_flat
  def process(edge)
    yield edge.to_flat(false)
    reversed = ASymmetricB.new(edge.dest, edge.src)
    yield reversed.to_flat(false)
  end
end
|
28
|
+
|
29
|
+
#
|
30
|
+
#
|
31
|
+
# Pipes its input through `uniq -c` and keeps rows that occurred exactly
# twice, emitting only the ordering where src < dest.
class Reducer < Wukong::Streamer::Base
  def stream
    counted_lines = `/usr/bin/uniq -c`.split("\n")
    counted_lines.each do |line|
      # NOTE(review): the split limit of 4 yields at most four fields, so
      # +data+ is always nil here and is removed by compact -- confirm the
      # intended field layout against what the mapper emits.
      key_count, rsrc, src, dest, data = line.chomp.strip.split(/\s+/, 4)
      if key_count.to_i == 2 && src.to_i < dest.to_i
        emit [src, dest, data].compact
      end
    end
  end
end
|
41
|
+
|
42
|
+
# Script wrapper that overrides the inherited :sort_fields default.
class Script < Wukong::Script
  # Inherited defaults with :sort_fields replaced.
  def default_options
    overrides = { :sort_fields => 3 }
    super.merge(overrides)
  end
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Execute the script
|
50
|
+
# Run through FindSymmetricLinks::Script rather than the bare Wukong::Script,
# so that the default_options override defined in this file takes effect.
FindSymmetricLinks::Script.new(
  FindSymmetricLinks::Mapper,
  FindSymmetricLinks::Reducer
).run
|
@@ -0,0 +1,100 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
|
4
|
+
require 'wukong'
|
5
|
+
|
6
|
+
#
|
7
|
+
# This is so very very kludgey
|
8
|
+
#
|
9
|
+
# Input is an 'ls' file, listing files to .bz2 package.
|
10
|
+
#
|
11
|
+
# Reducer takes each in turn and creates, within a parallel directory tree under
|
12
|
+
# ~/pkgd on the HDFS, a .bz2 compressed version of the file.
|
13
|
+
#
|
14
|
+
# So, the file
|
15
|
+
# /user/me/fixd/all-20090103
|
16
|
+
# is packaged onto the DFS as
|
17
|
+
# /user/me/pkgd/user/me/fixd/all-20090103
|
18
|
+
#
|
19
|
+
# listing=tmp/fixd-all-package-listing
|
20
|
+
# hdp-rm $listing
|
21
|
+
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
22
|
+
#
|
23
|
+
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
24
|
+
#
|
25
|
+
module ExportPackager
|
26
|
+
PKGD_DIR = '/workspace/flip/pkgd'
|
27
|
+
|
28
|
+
#
|
29
|
+
#
|
30
|
+
# Packages each listed input into a .bz2 file under PKGD_DIR and rsyncs the
# result to the :lab3 host.
#
# NOTE(review): filenames are interpolated unquoted into shell commands, so
# names containing spaces or shell metacharacters will misbehave.
class Reducer < Wukong::Streamer::Base
  # Writes the given messages to both stdout and stderr.
  def announce(*args)
    $stdout.puts(*args)
    $stderr.puts(*args)
  end

  # Removes +output_filename+ if it already exists. Always returns true, so
  # packaging proceeds; a failed removal is reported, not raised.
  def handle_existing_target(output_filename)
    return true unless File.exist?(output_filename)
    # (disabled alternative: skip existing targets instead of replacing them)
    # announce "Exists! #{output_filename}"
    # return false
    announce "Removing target file #{output_filename}"
    begin
      announce `rm #{output_filename}`
    rescue StandardError => e
      # Best effort: report and continue. StandardError rather than Exception
      # so that signals and SystemExit still stop the job.
      announce e
    end
    true
  end

  # Creates the directory that will hold +output_filename+; failures are
  # reported, not raised.
  def mkdir_target_safely(output_filename)
    output_dir = File.dirname(output_filename)
    announce "Ensuring directory #{output_dir} exists"
    begin
      announce `mkdir -p #{output_dir}`
    rescue StandardError => e
      announce e
    end
  end

  # Streams the input's files whose names do not start with '_' out of the
  # DFS, through bzip2, into +output_filename+.
  def bzip_into_pkgd_file(input_filename, output_filename)
    announce "bzip'ing into #{output_filename}"
    announce `( hadoop dfs -cat #{input_filename}/[^_]\** ) | bzip2 -c > #{output_filename}`
  end

  # Maps an input path to its target under PKGD_DIR: '.tsv' is appended when
  # the name has no dot followed by at least two word characters, the leading
  # '/' is dropped, and '.bz2' is added.
  def gen_output_filename(input_filename)
    input_filename += '.tsv' unless input_filename =~ /.*\.\w{2,}/
    "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(/^\//, '')]
  end

  # Rsyncs +local_path+ to +host+ (same path remotely unless +remote_path+ is
  # given), then pauses for five seconds.
  def rsync(host, local_path, remote_path = nil)
    remote_path ||= local_path
    announce `/usr/bin/rsync -Cuvrtlp #{local_path} #{host}:#{remote_path}`
    sleep 5
  end

  # Packages one input: clear any old target, ensure its directory exists,
  # compress, then push the result to :lab3.
  def process(input_filename)
    output_filename = gen_output_filename(input_filename)
    return unless handle_existing_target(output_filename)
    mkdir_target_safely output_filename
    bzip_into_pkgd_file input_filename, output_filename
    rsync :lab3, output_filename
  end

  # Accepts either an 'ls'-style listing line or a bare filename: the last
  # whitespace-separated field is taken as the path.
  def recordize(line)
    line.split(/\s/).last
  end

  # Runs the inherited stream loop, then rsyncs the whole package directory.
  def stream
    super
    rsync :lab3, PKGD_DIR + '/'
  end
end
|
88
|
+
|
89
|
+
# Script wrapper that overrides three of the inherited default options.
class Script < Wukong::Script
  # Inherited defaults with task counts and timeout replaced.
  def default_options
    overrides = {
      :map_tasks             => 1,
      :max_node_reduce_tasks => 1,              # only one reducer per local filesystem
      :timeout               => 40 * 60 * 1000  # timeout in ms
    }
    super.merge(overrides)
  end
end

# Execute the script
script = Script.new(nil, Reducer)
script.run
|
98
|
+
end
|
99
|
+
|
100
|
+
|
data/examples/package.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
#
|
6
|
+
# This is so very very kludgey
|
7
|
+
#
|
8
|
+
# Input is an 'ls' file, listing files to .bz2 package.
|
9
|
+
#
|
10
|
+
# Reducer takes each in turn and creates, within a parallel directory tree under
|
11
|
+
# ~/pkgd on the HDFS, a .bz2 compressed version of the file.
|
12
|
+
#
|
13
|
+
# So, the file
|
14
|
+
# /user/me/fixd/all-20090103
|
15
|
+
# is packaged onto the DFS as
|
16
|
+
# /user/me/pkgd/user/me/fixd/all-20090103
|
17
|
+
#
|
18
|
+
# listing=tmp/fixd-all-package-listing
|
19
|
+
# hdp-rm $listing
|
20
|
+
# hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
|
21
|
+
#
|
22
|
+
# ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
|
23
|
+
#
|
24
|
+
module ExportPackager
|
25
|
+
PKGD_DIR = 'pkgd'
|
26
|
+
|
27
|
+
#
|
28
|
+
#
|
29
|
+
# Reads a list of DFS paths from stdin and, for each, writes a bzip2'd copy
# under PKGD_DIR on the DFS and prints sha1sums of both sides.
#
# NOTE(review): filenames are interpolated unquoted into shell commands, so
# names containing spaces or shell metacharacters will misbehave.
class Reducer < Wukong::Streamer::Base
  # Writes +str+ to stderr and stdout; blank messages are dropped.
  def announce(str)
    return if str.blank?
    $stderr.puts str
    $stdout.puts str
  end

  # Best-effort removal of an existing target on the DFS; errors are ignored.
  def remove_target_filename(output_filename)
    announce "rm\t#{"%-70s" % output_filename}\t" +
      `( hadoop dfs -rmr #{output_filename} ) 2>&1`
  rescue StandardError
    nil
  end

  # Best-effort creation of the target's directory on the DFS; errors are
  # ignored.
  def mkdir_target_safely(output_filename)
    output_dir = File.dirname(output_filename)
    announce "mkdir\t#{"%-70s" % output_dir}\t" +
      `( hadoop dfs -mkdir #{output_dir} ) 2>&1`
  rescue StandardError
    nil
  end

  # Streams the input's files whose names do not start with '_' through bzip2
  # and back onto the DFS as +output_filename+.
  def bzip_into_pkgd_file(input_filename, output_filename)
    announce "cat|bz\t#{"%-70s" % input_filename}\t" +
      `( hadoop dfs -cat #{input_filename}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{output_filename} ) 2>&1`
  end

  # Prints the sha1sum of the decompressed package and of the original input
  # so they can be compared by eye; nothing is compared programmatically.
  def verify(input_filename, output_filename)
    announce "sha1sum\t#{"%-70s" % output_filename}\t" +
      `( hadoop dfs -cat #{output_filename} | bzcat - | sha1sum ) 2>&1`
    announce "sha1sum\t#{"%-70s" % input_filename}\t" +
      `( hadoop dfs -cat #{input_filename}/[^_]\\* | sha1sum ) 2>&1`
  end

  # Maps an input path to PKGD_DIR/<path without leading slash>.bz2.
  def gen_output_filename(input_filename)
    "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(%r{^/}, "")]
  end

  # Packages one input and prints checksums for it.
  def process(input_filename, output_filename)
    # remove_target_filename output_filename
    # mkdir_target_safely    output_filename
    bzip_into_pkgd_file input_filename, output_filename
    verify input_filename, output_filename
  end

  # Announces the host name, then packages every path listed on stdin.
  def stream
    announce `hostname`
    $stdin.each do |input_filename|
      # handle ls or straight file list, either
      input_filename = input_filename.chomp.strip.split(/\s/).last
      # A blank line has no fields; skip it rather than fail on nil below.
      next if input_filename.nil?
      output_filename = gen_output_filename input_filename
      announce "********************************************************"
      announce "Packing\t#{"%-70s" % input_filename}\t#{output_filename}"
      process input_filename, output_filename
      announce "Done\t#{"%-70s" % input_filename}\t#{output_filename}\n\n"
    end
  end
end
|
85
|
+
|
86
|
+
# Script wrapper that overrides the inherited :timeout default.
class Script < Wukong::Script
  # Inherited defaults with :timeout replaced.
  def default_options
    one_day_in_ms = 24 * 60 * 60 * 1000 # milliseconds in one day
    super.merge(:timeout => one_day_in_ms)
  end
end
|
91
|
+
end
|
92
|
+
|
93
|
+
#
|
94
|
+
# Execute the script
|
95
|
+
#
|
96
|
+
# No mapper class is supplied; the reducer does all the work.
packager = ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000)
packager.run
|