mrflip-wukong 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +202 -0
- data/README-tutorial.textile +163 -0
- data/README.textile +165 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/UsingWukong-part1.textile +2 -0
- data/doc/UsingWukong-part2.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-setup.textile +21 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +65 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +112 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +40 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +39 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +20 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +173 -0
- metadata +208 -0
data/doc/tips.textile
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
http://cluster-fork.info/index.php?title=Main_Page
|
2
|
+
|
3
|
+
|
4
|
+
h3. For Big Data, instead of "ACID" you use "ACID*"
|
5
|
+
|
6
|
+
* A -- Associative
|
7
|
+
* C -- Commutative
|
8
|
+
* I -- Idempotent
|
9
|
+
* D -- Distributed
|
10
|
+
* (*) -- (and where possible, left in sort order)
|
11
|
+
|
12
|
+
|
13
|
+
Finally, where possible leave things in sort order by some appropriate index. Clearly I'm not talking about introducing extra unnecessary sorts on ephemeral data. For things that will be read (and experimented with) much more often than they're written, though, it's worth running a final sort. Now you can
|
14
|
+
|
15
|
+
* Efficiently index into a massive dataset with binary search
|
16
|
+
* Do a direct merge sort on two files with the same sort order
|
17
|
+
* Run a reducer directly across the data
|
18
|
+
* Assign a synthetic key by just serially numbering lines (for example, distribute a unique prefix to each mapper and number the lines each mapper sees)
|
19
|
+
|
20
|
+
Note: for files that will live on the DFS, you should usually *not* do a total sort.
|
21
|
+
|
22
|
+
h3. Encode once, and carefully.
|
23
|
+
|
24
|
+
Encoding violates idempotence.
|
25
|
+
|
26
|
+
Is there a lightweight, mostly-transparent, ASCII-compatible *AND* idempotent encoding scheme lurking in a back closet of some algorithms book?
|
27
|
+
|
28
|
+
|
29
|
+
|
30
|
+
h3. Keys
|
31
|
+
|
32
|
+
Natural keys are right for big data
|
33
|
+
|
34
|
+
Synthetic keys suck. They demand locality or a central keymaster.
|
35
|
+
|
36
|
+
See About Keys
|
37
|
+
|
38
|
+
* Use the natural key
|
39
|
+
* Hash the natural key. This has some drawbacks
|
40
|
+
|
41
|
+
OK, fine. You need a synthetic key:
|
42
|
+
|
43
|
+
* Do a total sort, and use nl
|
44
|
+
* Generate
|
45
|
+
* Use a single reducer to reduce locality. YUCK.
|
46
|
+
* have each mapper generate a unique prefix; number each line as "prefix#{line_number}" or whatever.
|
47
|
+
|
48
|
+
How do you get a unique prefix?
|
49
|
+
|
50
|
+
* Distribute a unique prefix to each mapper out-of-band. People using Streaming are out of luck.
|
51
|
+
|
52
|
+
* Use a UUID -- that's what they're for. Drawback: ridiculously long
|
53
|
+
|
54
|
+
* Hash the machine name, PID and timestamp to something short. Check after the
|
55
|
+
fact that uniqueness was achieved. Use the birthday-problem formula to find out
|
56
|
+
how often this will happen. (In practice, almost never.)
|
57
|
+
|
58
|
+
h3. Epistemology and exception handling
|
59
|
+
|
60
|
+
Something that goes wrong one time in 1,000 will happen.
|
61
|
+
|
62
|
+
|
63
|
+
h3. Real hackers use the command line as an IDE
|
64
|
+
|
65
|
+
|
data/doc/utils.textile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
<something to tab and align table>
|
3
|
+
|
4
|
+
|
5
|
+
* uniq - report or filter out repeated lines in a file
|
6
|
+
** -c produces line<tab>count
|
7
|
+
** --ignore f1,f2,... discards given fields from consideration. field syntax same as for cut, etc.
|
8
|
+
|
9
|
+
* sort - sort lines of text files
|
10
|
+
** columns indexed as tab-separated
|
11
|
+
** can specify any column order, uses same field spec as cut
|
12
|
+
* tsort - topological sort of a directed graph
|
13
|
+
|
14
|
+
* cut - select portions of each line of a file
|
15
|
+
** can reorder columns
|
16
|
+
* nl - line numbering filter
|
17
|
+
** takes prefix, suffix
|
18
|
+
** count \t line -OR- line \t count
|
19
|
+
|
20
|
+
* wc - word, line, character, and byte count
|
21
|
+
** field count (tab-separated fields)
|
22
|
+
* paste - merge corresponding or subsequent lines of files
|
23
|
+
* expand, unexpand - expand tabs to spaces, and vice versa
|
24
|
+
* seq
|
25
|
+
* simple row, column sums
|
26
|
+
* join - relational database operator
|
27
|
+
* tac
|
28
|
+
|
29
|
+
* cat - concatenate and print files
|
30
|
+
* head - display first lines of a file
|
31
|
+
* tail - display the last part of a file
|
32
|
+
* shuf
|
33
|
+
* split - split a file into pieces
|
34
|
+
* csplit - split files based on context
|
35
|
+
* tee - pipe fitting
|
36
|
+
|
37
|
+
* ls - list directory contents.
|
38
|
+
* df - display free disk space
|
39
|
+
* du - display disk usage statistics
|
40
|
+
** tab-delimited, space aligned
|
41
|
+
|
42
|
+
* od - octal, decimal, hex, ASCII dump
|
43
|
+
* printf - formatted output
|
44
|
+
* cksum, sum - display file checksums and block counts
|
45
|
+
* md5sum
|
46
|
+
|
47
|
+
* diff
|
48
|
+
* comm
|
data/examples/README.txt
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
Examples:
|
2
|
+
|
3
|
+
|
4
|
+
* sample_records -- extract a random sample from a collection of data
|
5
|
+
|
6
|
+
* word_count
|
7
|
+
|
8
|
+
* apache_log_parser -- example for parsing standard apache webserver log files.
|
9
|
+
|
10
|
+
* wordchains -- solving a word puzzle using breadth-first search of a graph
|
11
|
+
|
12
|
+
* graph -- some generic graph scripts (adjacency lists, breadth-first search, 2-path and edge generation)
|
13
|
+
|
14
|
+
* pagerank -- use the pagerank algorithm to find the most 'interesting'
|
15
|
+
(central) nodes of a network graph
|
16
|
+
|
17
|
+
|
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__) + '/../../lib'
|
3
|
+
require 'wukong' ; include Wukong
|
4
|
+
require 'wukong/and_pig' ; include Wukong::AndPig
|
5
|
+
|
6
|
+
# PIG_DIR = '/usr/local/share/pig'
|
7
|
+
PIG_DIR = '/public/share/pig'
|
8
|
+
# full pathname to the pig executable
|
9
|
+
# Wukong::AndPig::PIG_EXECUTABLE = "#{PIG_DIR}/bin/pig"
|
10
|
+
Wukong::AndPig::PIG_EXECUTABLE = "/public/bin/pig -x local"
|
11
|
+
|
12
|
+
#
|
13
|
+
HDFS_BASE_DIR = 'foo/meta/lang'
|
14
|
+
Wukong::AndPig::PigVar.working_dir = HDFS_BASE_DIR
|
15
|
+
Wukong::AndPig.comments = false
|
16
|
+
# Wukong::AndPig.emit_dest = :captured
|
17
|
+
|
18
|
+
Wukong::AndPig::PigVar.emit "REGISTER #{PIG_DIR}/contrib/piggybank/java/piggybank.jar"
|
19
|
+
|
20
|
+
#
|
21
|
+
# Load basic types
|
22
|
+
#
|
23
|
+
|
24
|
+
# class Token < Struct.new(:rsrc, :context, :user_id, :token, :usages)
|
25
|
+
# end
|
26
|
+
# :tokens_users_0 << Token.pig_load('meta/datanerds/token_count/users_tokens')
|
27
|
+
# :tokens_users_0 << Token.pig_load('/tmp/users_tokens.tsv')
|
28
|
+
# :tokens_users << :tokens_users_0.generate(:user_id, :token, :usages)
|
29
|
+
# :tokens_users.checkpoint!
|
30
|
+
|
31
|
+
class Token < TypedStruct.new(
|
32
|
+
[:user_id, Integer], [:token, String], [:usages, Integer])
|
33
|
+
end
|
34
|
+
:tokens_users << Token.pig_load('/tmp/users_tokens.tsv')
|
35
|
+
:tokens_users.describe
|
36
|
+
|
37
|
+
pig_comment %Q{
|
38
|
+
# ***************************************************************************
|
39
|
+
#
|
40
|
+
# Global totals
|
41
|
+
#
|
42
|
+
# Each row in Tokens lists a (user, token, usages)
|
43
|
+
# We want
|
44
|
+
# Sum of all usage counts = total tokens seen in tweet stream.
|
45
|
+
# Number of distinct tokens
|
46
|
+
# Number of distinct users <- different than total in twitter_users.tsv
|
47
|
+
# because we want only users that say stuff.
|
48
|
+
}
|
49
|
+
|
50
|
+
# Emits the Pig steps that count the distinct values of +field+ in +relation+.
#
# relation - object answering the AndPig relation DSL (generate, distinct, ...).
# field    - name of the column whose distinct values are counted.
# options  - hash; :as names the result, and the whole hash is also passed
#            through to +distinct+.
#
# Returns whatever `result_name << counted_relation` returns.
def count_distinct relation, field, options={}
  result_name = options[:as]
  result_name ||= "#{relation.name}_#{field}_count".to_sym

  # Project the single column, then de-duplicate it.
  projected = relation.generate(field).set!.describe
  uniques   = projected.distinct(options).set!

  # Collapse everything into one group and count the surviving rows.
  grouped = uniques.group(:all).set!
  counted = grouped.generate(["COUNT(#{uniques.relation}.#{field})", :u_count, Integer]).set!

  result_name << counted
end
|
59
|
+
|
60
|
+
pig_comment "Count Users"
|
61
|
+
tok_users_count = count_distinct(:tokens_users, :user_id).checkpoint!
|
62
|
+
|
63
|
+
pig_comment "Count Tokens"
|
64
|
+
tok_tokens_count = count_distinct(:tokens_users, :token, :parallel => 10).checkpoint!
|
65
|
+
|
66
|
+
|
67
|
+
pig_comment %Q{
|
68
|
+
# ***************************************************************************
|
69
|
+
#
|
70
|
+
# Statistics for each user
|
71
|
+
}
|
72
|
+
|
73
|
+
# Emits the Pig steps that compute, for every (user, token) row, the share of
# that user's total usages accounted for by the token, and that share squared.
#
# users_tokens - object answering the AndPig relation DSL; its rows are expected
#                to carry user_id, token and usages columns.
#
# Returns the result of the final `generate(...).set!`.
#
# Fix: both FLATTEN expressions were missing their closing parenthesis.
#
# NOTE(review): flattening token and usages separately may yield a cross
# product in Pig; an earlier draft of this code flattened the pair together:
#   [ "FLATTEN(#{users_tokens.relation}.(token, usages) )", [:token, :usages], TypedStruct.new([:token, String], [:usages, Integer])]
# Confirm which form is intended.
def user_stats users_tokens
  users_tokens.describe.
    group(:user_id).set!.describe.
    generate(
      [:group, :user_id],
      ["(int)COUNT(#{users_tokens.relation})",      :tot_tokens, Integer],
      ["(int)SUM(#{users_tokens.relation}.usages)", :tot_usages, Integer],
      ["FLATTEN(#{users_tokens.relation}.token)",   :token,      String],
      ["FLATTEN(#{users_tokens.relation}.usages)",  :usages,     Integer]).set!.describe.
    generate(:user_id, :token, :usages,
      ["(float)(1.0*usages / tot_usages)", :usage_pct, Float],
      ["(float)(1.0*usages / tot_usages) * (1.0*(float)usages / tot_usages)", :usage_pct_sq, Float]).set!
end
|
87
|
+
|
88
|
+
:user_stats << user_stats(:tokens_users)
|
89
|
+
:user_stats.describe.checkpoint!
|
90
|
+
puts "UserStats = LOAD 'foo/meta/lang/user_stats' AS (user_id, token, usages, usage_pct, usage_pct_sq) ;"
|
91
|
+
|
92
|
+
UserStats = TypedStruct.new([:user_id, Integer],
|
93
|
+
[:token, String],
|
94
|
+
[:usages, Integer],
|
95
|
+
[:usage_pct, Float],
|
96
|
+
[:usage_pct_sq, Float])
|
97
|
+
:user_stats << UserStats.pig_load('foo/meta/lang/user_stats')
|
98
|
+
|
99
|
+
# Emits the Pig steps that compute per-token range and dispersion statistics
# from a per-(user, token) usage-share relation.
#
# user_stats - object answering the AndPig relation DSL; rows are expected to
#              carry token, usages, usage_pct and usage_pct_sq columns.
# options    - optional hash:
#              :n_users  - total number of users (default 436)
#              :n_tokens - total number of tokens (default 61630)
#
# Returns the result of the final `foreach(...).set!`.
#
# Changes from the previous version:
# * the user count was repeated as a literal 436 inside the expression
#   strings; every occurrence now derives from n_users;
# * pct_range is a count divided by a float, so it is declared Float
#   rather than Integer.
def range_and_dispersion user_stats, options={}
  n_users  = options.fetch(:n_users,  436)
  n_tokens = options.fetch(:n_tokens, 61630)

  token_stats = user_stats.group(:token).set!
  token_stats = token_stats.foreach(
    ["(float)SUM(#{user_stats.relation}.usage_pct) / #{n_users.to_f}", :avg_uspct],
    ["(float)SUM(#{user_stats.relation}.usage_pct_sq)", :sum_uspct_sq],
    # stdev = sqrt( E[x^2] - E[x]^2 ), taken over all n_users
    ["org.apache.pig.piggybank.evaluation.math.SQRT(
        (sum_uspct_sq / #{n_users.to_f}) -
        ( (SUM(#{user_stats.relation}.usage_pct)/#{n_users.to_f}) * (SUM(#{user_stats.relation}.usage_pct)/#{n_users.to_f}) )
      )", :stdev_uspct],
    ["1 - ( ( stdev_uspct / avg_uspct ) / org.apache.pig.piggybank.evaluation.math.SQRT(#{n_users.to_f} - 1.0) )", :dispersion],
    [
      [:group, :token, String],
      ["(int)COUNT(#{user_stats.relation}) ", :range, Integer],
      ["(int)COUNT(#{user_stats.relation}) / #{n_users.to_f}", :pct_range, Float],
      ["(int)SUM( #{user_stats.relation}.usages)", :tot_usages, Integer],
      ["(float)(   1.0e6*SUM(#{user_stats.relation}.usages) / #{n_tokens.to_f})", :ppm_usages, Float],
      [:avg_uspct,   :avg_uspct],
      [:stdev_uspct, :stdev_uspct],
      [:dispersion,  :dispersion]
    ]
  ).set!
end
|
125
|
+
|
126
|
+
range_and_dispersion(:user_stats).checkpoint!
|
127
|
+
|
128
|
+
Wukong::AndPig.finish
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
module ApacheLogParser
|
6
|
+
class Mapper < Wukong::Streamer::LineStreamer
|
7
|
+
|
8
|
+
|
9
|
+
# Splits an HTTP request line ("METHOD path PROTOCOL/version") into its parts.
#
# Returns a four-element array:
# * on a match:    ['', method, path, protocol]
# * on a mismatch: [req, '', '', ''] -- the raw request is kept in slot 0 and
#   the parsed slots are left blank.
def parse_request req
  parsed = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req)
  return [req, '', '', ''] unless parsed
  ['', *parsed.captures]
end
|
17
|
+
|
18
|
+
|
19
|
+
# regular expression to match on apache-style log lines
|
20
|
+
# IP addr - - [07/Jun/2008:20:37:11 +0000] 400 "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1" 173 "-" "-" "-"
|
21
|
+
LOG_RE = %r{\A(\d+\.\d+\.\d+\.\d+) ([^\s]+) ([^\s]+) \[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] (\d+) "([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" (\d+) "([^\"]*)" "([^\"]*)" "([^\"]*)"\z}
|
22
|
+
|
23
|
+
# Parses one apache-style log line and yields a flat record.
#
# line - the raw log line; a trailing newline, if present, is ignored.
#
# Yields either
# * [:logline, method, path, protocol, ip, j1, j2, req_date, resp, req, j3, ref, ua, j4]
#   when the line matches LOG_RE (req is blank when the request itself was
#   parsed into method/path/protocol, otherwise it holds the raw request), or
# * [:unparseable, line] when it does not.
#
# Fix: the previous version called `line.chomp` and discarded the result, so a
# newline-terminated line could never satisfy the `\z`-anchored LOG_RE.
def process line
  line = line.chomp
  m = LOG_RE.match(line)
  if m
    ip, j1, j2, datepart, timepart, tzpart, resp, req, j3, ref, ua, j4 = m.captures
    # to_flat is not a stdlib DateTime method -- presumably a Wukong extension.
    req_date = DateTime.parse("#{datepart} #{timepart} #{tzpart}").to_flat
    req, method, path, protocol = parse_request(req)
    yield [:logline, method, path, protocol, ip, j1, j2, req_date, resp, req, j3, ref, ua, j4]
  else
    yield [:unparseable, line]
  end
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Reducer < Wukong::Streamer::LineStreamer
|
38
|
+
end
|
39
|
+
|
40
|
+
# Execute the script
|
41
|
+
# Job definition for the apache log parser example.
class Script < Wukong::Script
  # The reduce step is delegated to the system `uniq` binary.
  def reduce_command
    '/usr/bin/uniq'
  end

  # Inherited defaults, overridden to use eight sort fields.
  def default_options
    overrides = { :sort_fields => 8 } # , :reduce_tasks => 0
    super.merge(overrides)
  end
end
|
49
|
+
|
50
|
+
Script.new(Mapper,nil).run
|
51
|
+
end
|
52
|
+
|
53
|
+
# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/streamer/count_keys'
|
5
|
+
require 'wukong/streamer/count_lines'
|
6
|
+
|
7
|
+
#
|
8
|
+
#
|
9
|
+
class CountKeysReducer < Wukong::Streamer::CountLines
|
10
|
+
#
|
11
|
+
# Taken from the actionpack Rails component ('action_view/helpers/number_helper')
|
12
|
+
#
|
13
|
+
# Formats a +number+ with grouped thousands using +delimiter+. You
|
14
|
+
# can customize the format using optional <em>delimiter</em> and <em>separator</em> parameters.
|
15
|
+
# * <tt>delimiter</tt> - Sets the thousands delimiter, defaults to ","
|
16
|
+
# * <tt>separator</tt> - Sets the separator between the units, defaults to "."
|
17
|
+
#
|
18
|
+
# number_with_delimiter(12345678) => 12,345,678
|
19
|
+
# number_with_delimiter(12345678.05) => 12,345,678.05
|
20
|
+
# number_with_delimiter(12345678, ".") => 12.345.678
|
21
|
+
# Formats +number+ with grouped thousands.
#
# number    - anything whose to_s looks like a decimal number.
# delimiter - string inserted between groups of three digits (default ",").
# separator - string placed between the integer and fractional parts (default ".").
#
# Returns the formatted String; if anything goes wrong while formatting,
# +number+ is returned untouched.
def number_with_delimiter(number, delimiter=",", separator=".")
  whole, *fraction = number.to_s.split('.')
  # Insert the delimiter after every digit followed by a multiple of three digits.
  whole.gsub!(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1#{delimiter}")
  [whole, *fraction].join(separator)
rescue
  number
end
|
30
|
+
|
31
|
+
# Override to look nice
|
32
|
+
# Renders one output line: the item left-aligned in 25 columns, a tab, then
# the count (with thousands delimiters) right-aligned in 12 columns.
def formatted_count item, key_count
  pretty_count = number_with_delimiter(key_count.to_i)
  Kernel.format("%-25s\t%12s", item, pretty_count)
end
|
36
|
+
end
|
37
|
+
|
38
|
+
#
|
39
|
+
# Job definition for the count-keys example: the map step is a shell one-liner.
class CountKeysScript < Wukong::Script
  # Shell command for the map step: `cut` keeps only the first
  # tab-separated field of each line.
  def map_command
    " cut -d\"\t\" -f1 "
  end

  # Only one field is left after the map step, so sort on one field.
  def default_options
    overrides = { :sort_fields => 1 }
    super.merge(overrides)
  end
end
|
52
|
+
|
53
|
+
# Executes the script when run from command line
|
54
|
+
if __FILE__ == $0
|
55
|
+
CountKeysScript.new(nil, CountKeysReducer).run
|
56
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
#
|
6
|
+
#
|
7
|
+
module CountKeys
|
8
|
+
#
|
9
|
+
# Counts keys inside the mapper and emits the tallies only once the whole
# input stream has been consumed.
class Mapper < Wukong::Streamer::Base
  # Hash of key => number of times it has been seen so far.
  attr_accessor :keys_count

  def initialize *args
    self.keys_count = {}
  end

  # Tally one record. Everything from the first '-' onward is removed from
  # the key (in place) before counting.
  def process key, *args
    key.gsub!(/-.*/, '') # kill off the slug
    keys_count[key] ||= 0
    keys_count[key] = keys_count[key] + 1
  end

  # Run the inherited stream loop, then emit one [key, count] record per key.
  def stream *args
    super(*args)
    keys_count.each { |key, count| emit [key, count].to_flat }
  end
end
|
26
|
+
# Sums the partial counts emitted for each key
|
27
|
+
# Adds up the counts received for a key and yields one formatted line for it.
class Reducer < Wukong::Streamer::AccumulatingReducer
  require 'active_support'
  require 'action_view/helpers/number_helper'
  include ActionView::Helpers::NumberHelper

  # Running total for the key currently being reduced.
  attr_accessor :key_count

  # Override to look nice: item left-aligned in 25 columns, a tab, then the
  # comma-delimited count right-aligned in 12 columns.
  def formatted_count item, key_count
    pretty_count = number_with_delimiter(key_count.to_i, :delimiter => ',')
    "%-25s\t%12s" % [item, pretty_count]
  end

  # Reset the running total.
  def start! *args
    self.key_count = 0
  end

  # Fold one partial count into the running total.
  def accumulate key, count
    self.key_count = key_count + count.to_i
  end

  # Yield the formatted line for the accumulated total.
  # `key` is not defined in this class -- presumably supplied by the superclass.
  def finalize
    yield formatted_count(key, key_count)
  end
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Job definition for the in-mapper key counting example.
class Script < Wukong::Script
  # There's just the one field; the job is also set to use one reduce task.
  def default_options
    overrides = { :sort_fields => 1, :reduce_tasks => 1 }
    super.merge(overrides)
  end
end
|
55
|
+
end
|
56
|
+
|
57
|
+
CountKeys::Script.new(CountKeys::Mapper, CountKeys::Reducer).run
|
@@ -0,0 +1,74 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Locate the bundled wukong lib relative to this script (examples/graph/ is two
# levels below the directory holding lib/) instead of a developer-specific
# absolute path.
$: << File.dirname(__FILE__) + '/../../lib'
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Given an adjacency-pairs (from \t to) representation of a directed graph:
|
7
|
+
#
|
8
|
+
# 1 2
|
9
|
+
# 1 7
|
10
|
+
# 2 7
|
11
|
+
# 2 9
|
12
|
+
# 7 2
|
13
|
+
#
|
14
|
+
# It produces an "adjacency list":http://en.wikipedia.org/wiki/Adjacency_list representation:
|
15
|
+
#
|
16
|
+
# 1 > 2 7
|
17
|
+
# 2 > 7 9
|
18
|
+
# 7 > 2
|
19
|
+
# 9 >
|
20
|
+
#
|
21
|
+
# and
|
22
|
+
#
|
23
|
+
# 1 <
|
24
|
+
# 2 < 1 7
|
25
|
+
# 7 < 1 2
|
26
|
+
# 9 < 2
|
27
|
+
#
|
28
|
+
# (each column is tab-separated in the actual output)
|
29
|
+
#
|
30
|
+
#
|
31
|
+
#
|
32
|
+
module Gen1HoodEdges
|
33
|
+
class Mapper < Wukong::Streamer::Base
|
34
|
+
# Emits each edge twice: once keyed by its source with a '>' marker, and once
# keyed by its destination with a '<' marker.
#
# rsrc - first field of the record; not used here.
# src  - source node id (converted with to_i).
# dest - destination node id (converted with to_i).
# Any further fields are ignored.
def process rsrc, src, dest, *_
  from, to = [src, dest].map(&:to_i)
  yield [from, '>', to]
  yield [to, '<', from]
end
|
39
|
+
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# Accumulate links onto single line.
|
43
|
+
#
|
44
|
+
# The reduce key is the target node and direction; we just stream through all
|
45
|
+
# pairs for each target node and output its neighbor nodes on the same line.
|
46
|
+
#
|
47
|
+
# To control memory usage, we will print directly to the output (and not run
|
48
|
+
# through the Emitter)
|
49
|
+
#
|
50
|
+
# Writes one output line per (node, direction) group, printing each piece
# straight to standard output as it arrives.
class Reducer < Wukong::Streamer::AccumulatingReducer
  # Open the line with the node and the list type, tab-separated.
  def start! target, dir, *args
    $stdout.print(target + "\t" + dir)
  end

  # Append one neighbor to the current line.
  def accumulate target, dir, neighbor
    $stdout.print("\t" + neighbor)
  end

  # Terminate the current line.
  def finalize
    $stdout.puts('')
  end
end
|
62
|
+
|
63
|
+
# Job definition for the adjacency-list example.
class Script < Wukong::Script
  # Inherited defaults, overridden to sort and partition on one field each.
  def default_options
    overrides = { :sort_fields => 1, :partition_fields => 1 }
    super.merge(overrides)
  end
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# Execute the script
|
71
|
+
Gen1HoodEdges::Script.new(
|
72
|
+
Gen1HoodEdges::Mapper,
|
73
|
+
Gen1HoodEdges::Reducer
|
74
|
+
).run
|