wukong 1.4.9 → 1.4.10
- data/TODO.textile +13 -0
- data/bin/hdp-bzip +3 -3
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-mkdir +0 -1
- data/bin/hdp-put +1 -1
- data/bin/hdp-sort +5 -17
- data/bin/hdp-stream +5 -17
- data/bin/hdp-stream-flat +5 -5
- data/bin/wu-sum +1 -0
- data/docpages/README-performance.textile +90 -0
- data/examples/binning_percentile_estimator.rb +142 -0
- data/examples/corpus/words_to_bigrams.rb +52 -0
- data/examples/keystore/cassandra_batch_test.rb +41 -0
- data/examples/network_graph/gen_multi_edge.rb +3 -2
- data/examples/sample_records.rb +1 -0
- data/lib/wukong/extensions/date_time.rb +4 -3
- data/lib/wukong/extensions/enumerable.rb +79 -0
- data/lib/wukong/extensions.rb +1 -0
- data/lib/wukong/keystore/redis_db.rb +24 -0
- data/lib/wukong/keystore/tyrant_db.rb +124 -0
- data/lib/wukong/keystore/tyrant_notes.textile +145 -0
- data/lib/wukong/periodic_monitor.rb +57 -0
- data/lib/wukong/script/hadoop_command.rb +3 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +1 -0
- data/lib/wukong/streamer/cassandra_streamer.rb +61 -0
- data/lib/wukong/streamer.rb +12 -10
- data/wukong.gemspec +34 -16
- metadata +60 -16
data/TODO.textile
ADDED
@@ -0,0 +1,13 @@
+
+
+
+* add GEM_PATH to hadoop_recycle_env
+
+* Hadoop_command function received an array for the input_path parameter
+
+** We should be able to specify comma *or* space separated paths; the last
+space-separated path in Settings.rest becomes the output file, the others are
+used as the input_file list.
+
+* Make configliere Settings and streamer_instance.options() be the same
+thing. (instead of almost-but-confusingly-not-always the same thing).
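Note: the comma-or-space path item above could be closed with a small splitting helper. The sketch below is hypothetical (split_io_paths is not part of this release) and uses a plain array in place of configliere's Settings.rest, just to illustrate the behavior the TODO describes:

    # Hypothetical helper: the last space-separated argument is the output
    # path; every other argument may hold one or more comma-separated inputs.
    def split_io_paths rest
      output = rest.last
      inputs = rest[0..-2].map{|arg| arg.split(',') }.flatten
      [inputs, output]
    end

    inputs, output = split_io_paths(%w[/data/a,/data/b /data/c /data/out])
    # inputs => ["/data/a", "/data/b", "/data/c"]; output => "/data/out"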
data/bin/hdp-bzip
CHANGED
@@ -14,15 +14,15 @@ echo "Removing output directory $OUTPUT"
 hadoop fs -rmr $OUTPUT
 
 cmd="${HADOOP_HOME}/bin/hadoop \
-  jar ${HADOOP_HOME}/contrib/streaming/hadoop
+  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
   -jobconf mapred.output.compress=true \
   -jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
   -jobconf mapred.reduce.tasks=1 \
   -mapper \"/bin/cat\" \
-  -reducer \"/
+  -reducer \"/bin/cat\" \
   $INPUTS
   -output $OUTPUT \
 "
 echo $cmd
-$cmd
+$cmd
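Note: with the jar glob and the truncated -reducer line restored, hdp-bzip assembles a one-reducer identity streaming job whose only effect is to rewrite its inputs BZip2-compressed. A hypothetical invocation might look like `hdp-bzip /data/logs/2009 /data/logs/2009-bz2`; the argument handling that builds $INPUTS and $OUTPUT lives outside this hunk, so the exact calling convention is not shown here.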
data/bin/hdp-kill-task
ADDED
data/bin/hdp-mkdir
CHANGED
data/bin/hdp-put
CHANGED
data/bin/hdp-sort
CHANGED
@@ -8,7 +8,7 @@ reduce_script=${1-/usr/bin/uniq} ; shift
 partfields=${1-2} ; shift
 sortfields=${1-2} ; shift
 
-if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [
+if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
 
 HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
@@ -16,7 +16,9 @@ cmd="${HADOOP_HOME}/bin/hadoop \
   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
   -jobconf num.key.fields.for.partition=\"$partfields\"
-  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.map.output.field.separator=\"'/t'\"
+  -jobconf mapred.text.key.partitioner.options=\"-k1,$partfields\"
   -mapper \"$map_script\"
   -reducer \"$reduce_script\"
   -input \"$input_file\"
@@ -28,22 +30,8 @@ echo "$cmd"
 
 $cmd
 
-#
-# -jobconf stream.map.output.field.separator='\t' \
-# -jobconf map.output.key.field.separator='\t' \
-# -jobconf mapred.map.tasks=3 \
-# -jobconf mapred.reduce.tasks=3 \
-
+# Maybe?
 #
-# TODO:
-# http://issues.apache.org/jira/browse/MAPREDUCE-594
-# http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/KeyValueTextInputFormat.html
-# Instead of /bin/cat, Identity can be (I think)
 # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
 # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-# ...
-#
-# TODO
 #
-# New-style secondary sort:
-# http://hadoop.apache.org/common/docs/r0.20.0/streaming.html
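Note: the two added -jobconf lines make the sort/partition split explicit. stream.num.map.output.key.fields sets how many leading fields form the map output key (the sort key), while mapred.text.key.partitioner.options (-k1,$partfields) tells KeyFieldBasedPartitioner to partition on only the first $partfields fields; together they give a secondary sort. For example, a hypothetical `hdp-sort input_dir output_dir /bin/cat /usr/bin/uniq 1 2` would group rows by field 1 but deliver them to each reducer sorted on fields 1-2.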
data/bin/hdp-stream
CHANGED
@@ -8,7 +8,7 @@ reduce_script=${1-/usr/bin/uniq} ; shift
 partfields=${1-2} ; shift
 sortfields=${1-2} ; shift
 
-if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [
+if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
 
 HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
@@ -16,7 +16,9 @@ cmd="${HADOOP_HOME}/bin/hadoop \
   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
   -jobconf num.key.fields.for.partition=\"$partfields\"
-  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.map.output.field.separator=\"'/t'\"
+  -jobconf mapred.text.key.partitioner.options=\"-k1,$partfields\"
   -mapper \"$map_script\"
   -reducer \"$reduce_script\"
   -input \"$input_file\"
@@ -28,22 +30,8 @@ echo "$cmd"
 
 $cmd
 
-#
-# -jobconf stream.map.output.field.separator='\t' \
-# -jobconf map.output.key.field.separator='\t' \
-# -jobconf mapred.map.tasks=3 \
-# -jobconf mapred.reduce.tasks=3 \
-
+# Maybe?
 #
-# TODO:
-# http://issues.apache.org/jira/browse/MAPREDUCE-594
-# http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/KeyValueTextInputFormat.html
-# Instead of /bin/cat, Identity can be (I think)
 # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
 # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-# ...
-#
-# TODO
 #
-# New-style secondary sort:
-# http://hadoop.apache.org/common/docs/r0.20.0/streaming.html
data/bin/hdp-stream-flat
CHANGED
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
 
-input_file
-output_file
-map_script
-reduce_script
+input_file="${1}" ; shift
+output_file="${1}" ; shift
+map_script="${1-/bin/cat}" ; shift
+reduce_script="${1-/usr/bin/uniq}" ; shift
 
 if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
 
@@ -14,7 +14,7 @@ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 # -jobconf mapred.reduce.tasks=3 \
 
 exec ${HADOOP_HOME}/bin/hadoop \
-  jar ${HADOOP_HOME}/contrib/streaming/hadoop
+  jar ${HADOOP_HOME}/contrib/streaming/hadoop*streaming*.jar \
   "$@" \
   -jobconf "mapred.job.name=`basename $0`-$map_script-$input_file-$output_file" \
   -mapper "$map_script" \
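Note: because the rewritten header shifts each positional parameter off as it is read, anything remaining in "$@" is spliced straight into the hadoop invocation ahead of the -jobconf flags. A hypothetical call overriding the reducer count: `hdp-stream-flat input_dir output_dir /bin/cat /usr/bin/uniq -jobconf mapred.reduce.tasks=10`.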
data/bin/wu-sum
CHANGED
data/docpages/README-performance.textile
ADDED
@@ -0,0 +1,90 @@
+job_201006200508_0002 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b 100.00%
+s3 => hdfs bz2 parser, cond_em empty (?)
+201006200508_0002 35mins, 34sec 1 1812031232 0 12495736645 7240978546 8180472 388863907 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
+
+job_201006200508_0003 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes 100.00%
+s3 => hdfs bz2 parser, cond_em duplicate
+201006200508_0003 15mins, 50sec 1 1812031232 0 11877866580 7240978546 8180472 383928615 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
+
+job_201006200508_0004 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2 100.00%
+hdfs => hdfs bz2 parser, cond_em empty
+201006200508_0004 36mins, 56sec 1 1812031232 13334645497 7240978546 8180472 395564272 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
+
+job_201006200508_0005 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em 100.00%
+hdfs => hdfs bz2 parser, no_cond_em --
+201006200508_0005 35mins, 23sec 1 1812031232 13479823318 7240978546 8180472 396757046 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
+
+job_201006200508_0006 NORMAL flip hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111 100.00%
+hdfs => hdfs bz2 `which cat`
+201006200508_0006 1mins, 10sec 1 1812031232 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
+
+job_201006200508_0007 NORMAL flip hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n 100.00%
+s3 => hdfs bz2 `which cat`
+201006200508_0007 1mins, 55sec 1 1812031232 0 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
+
+job_201006200508_0008 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db 100.00%
+hdfs => hdfs flat parser no cond_em no db
+201006200508_0008 10mins, 59sec 1 7240978549 13545881166 7240978549 8180472 397172723 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
+
+job_201006200508_0015 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db 100.00%
+hdfs => hdfs flat parser cond_em on users only no DB
+201006200508_0015 23mins, 48sec 1 7240978549 13415414554 7240978549 8180472 396101235 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
+
+job_201006200508_0016 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-nodupes 100.00%
+hdfs => hdfs flat parser cond_em on users only - vanished saving id/sn to DB
+201006200508_0016 28mins, 7sec 1 0 7240978549 13414285504 7240978549 8180472 396091251 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+
+job_201006200508_0017 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes 100.00%
+hdfs => hdfs flat parser cond_em on users only - duped saving id/sn to DB
+201006200508_0017 11mins, 51sec 1 0 7240978549 12221205449 7240978549 8180472 386114331 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+
+===========================================================================
+== Parse
+==
+
+job_201006200508_0018 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056 100.00%
+201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 141729936525 128606199040 14198839 3918844056 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
+
+for foo in 0016 0017 0018 ; do echo $foo ; ~/ics/hadoop/chimpmark/bin/elephantscat.rb job_201006200508_$foo ; done
+cat ~/timings/job/201006200508/*/*.tsv | wu-lign
+
+job_id scraped_at run_time succ? s3n_in hdfs_in file_in hdfs_out file_out map_in map_out map_recs_in map_recs_out red_recs_in red_recs_out job_name
+201006200508_0002 35mins, 34sec 1 1812031232 0 0 12495736645 0 7240978546 0 8180472 388863907 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
+201006200508_0003 15mins, 50sec 1 1812031232 0 0 11877866580 0 7240978546 0 8180472 383928615 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
+201006200508_0004 36mins, 56sec 1 1812031232 0 13334645497 0 7240978546 0 8180472 395564272 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
+201006200508_0005 35mins, 23sec 1 1812031232 0 13479823318 0 7240978546 0 8180472 396757046 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
+201006200508_0006 1mins, 10sec 1 1812031232 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
+201006200508_0007 1mins, 55sec 1 1812031232 0 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
+201006200508_0008 10mins, 59sec 1 7240978549 0 13545881166 0 7240978549 0 8180472 397172723 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
+201006200508_0015 23mins, 48sec 1 7240978549 0 13415414554 0 7240978549 0 8180472 396101235 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
+201006200508_0016 28mins, 7sec 1 7240978549 0 13414285504 0 7240978549 0 8180472 396091251 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+201006200508_0017 11mins, 51sec 1 7240978549 0 12221205449 0 7240978549 0 8180472 386114331 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 0 0 141729936525 0 128606199040 0 14198839 3918844056 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
+201006200508_0021 8hrs, 50mins, 52sec 1 141779023755 62208536220 24722859867 73825391771 141729936525 189098533358 3918844056 3918844056 155139258 155139258 Unsplicer
+201006200508_0029 1mins, 20sec 1 1763173995 0 1762322014 0 1762322014 0 22764940 22764940 0 0 hdp-stream-flat-/bin/cat-/data/sn/tw/rawd/unspliced/twitter_user-/tmp/foo
+201006200508_0031 3hrs, 48mins, 6sec 1 14930014182 0 0 48106164389 0 113092707367 0 8408164 753481311 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/201004---/data/sn/tw/rawd/parsed/api/201004
+201006200508_0034 30mins, 46sec 1 7170990599 2203578261 8389754083 5031160348 7170990599 7170990510 143461243 143461241 143461241 67443309 bulk_load_conversation.rb---/data/sn/tw/fixd/objects/a_replies_b---/data/sn/tw/fixd/apeyeye/conversation/a_replies_b_json
+
+Identity mapper Wukong `which cat` pig
+Identity reducer wukong `which cat` pig
+* no skew
+* data/reducer > ram
+
+Do a sort|uniq on 150GB
+
+
+* 1.8 GB bz2, S3 => HDFS 1m55s
+* 1.8 GB bz2, HDFS => HDFS 1m10s
+
+TokyoTyrant, 1 node => 4 m1.large (Balancer) 15_000 inserts/sec
+TokyoTyrant, 20 tasks => 4 m1.large (Balancer) 2_000 inserts/sec
+
+===========================================================================
+
+Parse:
+
+hdp-du s3n://monkeyshines.infochimps.org/data/ripd/com.tw/\*/ > /mnt/tmp/ripd_com.tw-du.tsv
+
+
+
+1050 entries 448483502374 417.7 GB
data/examples/binning_percentile_estimator.rb
ADDED
@@ -0,0 +1,142 @@
+#!/usr/bin/env ruby
+
+require 'rubygems'
+require 'wukong'
+require 'wukong/streamer/count_keys'
+
+
+#
+# Ch3ck out dis moist azz code bitches!!
+#
+#
+
+#
+# Do nothing more than bin users here, arbitrary and probably bad
+#
+class Mapper < Wukong::Streamer::RecordStreamer
+  def process rank, followers
+    followers = followers.to_i
+    if followers > 100
+      yield [9,rank]
+    elsif followers > 75
+      yield [8,rank]
+    elsif followers > 50
+      yield [7,rank]
+    elsif followers > 25
+      yield [6,rank]
+    elsif followers > 15
+      yield [5,rank]
+    elsif followers > 10
+      yield [4,rank]
+    elsif followers > 5
+      yield [3,rank]
+    elsif followers > 4
+      yield [2,rank]
+    elsif followers > 1
+      yield [1,rank]
+    else
+      yield [0,rank]
+    end
+  end
+end
+
+
+#
+# Calculate percentile rank for every pr value in a given follower bracket
+#
+class Reducer < Wukong::Streamer::AccumulatingReducer
+  attr_accessor :count_bin
+  def start! bin, rank
+    self.count_bin ||= {}
+    self.count_bin[bin] ||= {}
+  end
+
+  def accumulate bin, rank
+    rank = (rank.to_f*10.0).round.to_f/10.0
+    self.count_bin[bin][rank] ||= 0
+    self.count_bin[bin][rank] += 1
+  end
+
+  def finalize
+    count_bin[key] = generate_all_pairs(key).inject({}){|h,pair| h[pair.first] = pair.last; h}
+    yield [key, count_bin[key].values.sort.join(",")]
+  end
+
+  #
+  # Write the final table to disk as a ruby hash
+  #
+  def after_stream
+    table = File.open("trstrank_table.rb", 'w')
+    table << "TRSTRANK_TABLE = " << count_bin.inspect
+    table.close
+  end
+
+  #
+  # Return percentile of a given trstrank for a given follower bracket
+  #
+  def percentile bin, rank
+    ((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
+  end
+
+  #
+  # Return the count of values less than rank
+  #
+  def count_less_than bin, rank
+    count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f < rank; count}
+  end
+
+  #
+  # Return the count of rank
+  #
+  def frequency_of bin, rank
+    count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f == rank; count}
+  end
+
+  #
+  # Return the total number in sample
+  #
+  def total_num bin
+    count_bin[bin].values.inject(0){|count,v| count += v; count}
+  end
+
+  #
+  # Generate a list of all pairs {trstrank => percentile}, interpolate when necessary
+  #
+  def generate_all_pairs bin
+    h = {}
+    count_bin[bin].keys.each do |rank|
+      h[rank.to_f] = percentile(bin, rank.to_f)
+    end
+    h[0.0] ||= 0.0
+    h[10.0] ||= 100.0
+    arr = h.to_a.sort!{|x,y| x.first <=> y.first}
+    list = arr.zip(arr[1..-1])
+    big_list = []
+    big_list << [0.0,0.0]
+    list.each do |pairs|
+      interpolate(pairs.first, pairs.last, 0.1).each{|pair| big_list << pair}
+    end
+    big_list.uniq.sort{|x,y| x.first <=> y.first}
+  end
+
+
+  #
+  # Nothing to see here, move along
+  #
+  def interpolate pair1, pair2, dx
+    return [pair1] if pair2.blank?
+    m = (pair2.last - pair1.last)/(pair2.first - pair1.first) # slope
+    b = pair2.last - m*pair2.first # y intercept
+    num = ((pair2.first - pair1.first)/dx).abs.round # number of points to interpolate
+    points = []
+    num.times do |i|
+      x = pair1.first + (i+1).to_f*dx
+      y = m*x + b
+      points << [x,y]
+    end
+    points # return an array of pairs
+  end
+
+end
+
+Wukong::Script.new(Mapper,Reducer).run
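Note: Reducer#percentile above implements the midpoint (half-credit-for-ties) percentile-rank formula, percentile = (count_below + 0.5 * count_equal) / total * 100. A minimal standalone check of that arithmetic on a toy histogram, independent of the streamer machinery (the values are illustrative only):

    # Toy {rank => count} histogram, mirroring one bin of Reducer#count_bin.
    hist  = { 1.0 => 2, 2.0 => 3, 3.0 => 5 }
    total = hist.values.inject(0){|s,v| s + v }                       # => 10
    rank  = 2.0
    below = hist.keys.inject(0){|c,k| c += hist[k] if k < rank; c }   # => 2
    equal = hist[rank] || 0                                           # => 3
    pct   = ((below + 0.5*equal) / total) * 100.0                     # => 35.0

So a rank of 2.0 sits at the 35th percentile of this sample.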
data/examples/corpus/words_to_bigrams.rb
ADDED
@@ -0,0 +1,52 @@
+#!/usr/bin/env ruby
+require 'wukong'
+
+#
+# Bigram counts
+#
+# head -n 100 /usr/share/dict/words | ./examples/corpus/words_to_bigrams.rb | sort | /tmp/words_to_bigrams.rb
+#
+
+
+#
+# Kludge to work in Elastic map reduce:
+#
+# If your script is ./examples/corpus/words_to_bigrams.rb, make symlinks
+# to it from ./examples/corpus/words_to_bigrams__map.rb and
+# ./examples/corpus/words_to_bigrams__reduce.rb
+#
+if $0 =~ /__(map|reduce)\.rb$/
+  Settings[$1.to_sym] = true
+end
+
+
+#
+# given one word per line
+# emits all successive pairs of characters in that word
+# eg 'boooo-urns' yields
+#   bo oo oo oo o- -u ur rn ns
+#
+class WordNGrams < Wukong::Streamer::Base
+  def process word
+    word[0..-2].chars.zip(word[1..-1].chars).each do |ngram_2|
+      yield ngram_2.join('')
+    end
+  end
+end
+
+#
+# number of unique keys in a row
+#
+class KeyCountStreamer < Wukong::Streamer::AccumulatingReducer
+  def start! *args
+    @count = 0
+  end
+  def accumulate *args
+    @count += 1
+  end
+  def finalize
+    yield [key, @count]
+  end
+end
+
+Wukong::Script.new(WordNGrams, KeyCountStreamer).run
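Note: WordNGrams#process leans on a chars/zip pairing; dropping the last character and zipping against the string shifted by one yields each adjacent pair. A standalone check of the 'boooo-urns' example from the comments (plain Ruby, no wukong required):

    word    = 'boooo-urns'
    bigrams = word[0..-2].chars.zip(word[1..-1].chars).map{|pair| pair.join('') }
    # => ["bo", "oo", "oo", "oo", "o-", "-u", "ur", "rn", "ns"]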
data/examples/keystore/cassandra_batch_test.rb
ADDED
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'wukong'
+
+# An example (and test) of streaming batches of data into distributed cassandra db
+# Stream in whatever you like that has a key and value. Notice that you must
+# have already defined a column space called 'Cruft' in storage-conf.xml as well
+# as a column family called 'OhBaby'
+
+class Mapper < Wukong::Streamer::CassandraStreamer
+
+  # you must redefine the column space, batch size, and db-seeds or they will
+  # be defaults. For testing on local machine simply seed db with 127.0.0.1:9160
+
+  def initialize *args
+    self.column_space = 'Cruft'
+    self.batch_size = 100
+    self.db_seeds = "127.0.0.1:9160"
+    super(*args)
+    @iter = 0
+  end
+
+  def process key, value, *_, &blk
+    insert_into_db(key, value)
+    yield [key, value] if (@iter %10 == 0)
+  end
+
+  # you must specify the column family, key, and value here
+  def insert_into_db key, value
+    @iter += 1
+    cassandra_db.insert(:OhBaby, key, {"value" => value}, :consistency => Cassandra::Consistency::ANY) unless key.blank?
+  end
+end
+
+#
+# Executes the script
+#
+Wukong::Script.new(
+  Mapper,
+  nil
+).run
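Note: nothing in the example creates the Cassandra schema; as the comments say, the 'Cruft' keyspace and 'OhBaby' column family must already exist in storage-conf.xml. Assuming a single local node seeded at 127.0.0.1:9160 and wukong's usual map-phase flag, a hypothetical smoke test would stream tab-separated key/value pairs through the mapper: `cat pairs.tsv | ./examples/keystore/cassandra_batch_test.rb --map`.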
data/examples/network_graph/gen_multi_edge.rb
CHANGED
@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+require 'rubygems'
 $: << File.dirname(__FILE__)+'/../../lib'
 require 'wukong'
 require 'wukong/models/graph'; include Wukong::Models
@@ -53,8 +54,8 @@ module GenMultiEdge
       rel = m.captures.first
       src = src.to_i ; dest = dest.to_i
       return if ((src == 0) || (dest == 0))
-      yield [
-      yield [
+      yield [src, dest, "a_#{rel}_b"]
+      yield [dest, src, "b_#{rel}_a"]
     end
   end
 
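Note: the restored yields emit each parsed edge in both directions. For a matched rel of "follows" with src 1 and dest 2, the mapper now produces [1, 2, "a_follows_b"] and [2, 1, "b_follows_a"], so downstream jobs can group on either endpoint.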
data/examples/sample_records.rb
CHANGED
data/lib/wukong/extensions/date_time.rb
CHANGED
@@ -2,6 +2,7 @@ require 'time'
 DateTime.class_eval do
   #
   # Parses the time but never fails.
+  # Return value is always in the UTC time zone.
   #
   # A flattened time -- a 12-digit YYYYmmddHHMMMSS -- is treated as a UTC
   # datetime.
@@ -9,11 +10,11 @@ DateTime.class_eval do
   def self.parse_safely dt
     begin
       if dt.to_s =~ /\A\d{12}Z?\z/
-        parse(dt+'Z', true)
+        parse(dt+'Z', true).utc
       else
-        parse(dt, true)
+        parse(dt, true).utc
      end
-    rescue
+    rescue StandardError
      nil
    end
  end
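Note: the net effect of the two hunks is that every successful parse is normalized to UTC, and only StandardError is swallowed, so signals and other Exception subclasses still propagate. A behavior sketch, assuming these extensions are loaded (.utc on DateTime is not core Ruby; it is assumed here to come from the support libraries wukong loads):

    DateTime.parse_safely('200907041234')   # 12-digit flat time, parsed as UTC
    DateTime.parse_safely('not a date')     # => nil instead of raising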
data/lib/wukong/extensions/enumerable.rb
ADDED
@@ -0,0 +1,79 @@
+module Enumerable
+
+  # Calculates a sum from the elements. Examples:
+  #
+  #   payments.sum { |p| p.price * p.tax_rate }
+  #   payments.sum(&:price)
+  #
+  # The latter is a shortcut for:
+  #
+  #   payments.inject { |sum, p| sum + p.price }
+  #
+  # It can also calculate the sum without the use of a block.
+  #
+  #   [5, 15, 10].sum # => 30
+  #   ["foo", "bar"].sum # => "foobar"
+  #   [[1, 2], [3, 1, 5]].sum => [1, 2, 3, 1, 5]
+  #
+  # The default sum of an empty list is zero. You can override this default:
+  #
+  #   [].sum(Payment.new(0)) { |i| i.amount } # => Payment.new(0)
+  #
+  def sum(identity = 0, &block)
+    if block_given?
+      map(&block).sum(identity)
+    else
+      inject { |sum, element| sum + element } || identity
+    end
+  end
+
+  # Iterates over a collection, passing the current element *and* the
+  # +memo+ to the block. Handy for building up hashes or
+  # reducing collections down to one object. Examples:
+  #
+  #   %w(foo bar).each_with_object({}) { |str, hsh| hsh[str] = str.upcase } #=> {'foo' => 'FOO', 'bar' => 'BAR'}
+  #
+  # *Note* that you can't use immutable objects like numbers, true or false as
+  # the memo. You would think the following returns 120, but since the memo is
+  # never changed, it does not.
+  #
+  #   (1..5).each_with_object(1) { |value, memo| memo *= value } # => 1
+  #
+  def each_with_object(memo, &block)
+    returning memo do |m|
+      each do |element|
+        block.call(element, m)
+      end
+    end
+  end unless [].respond_to?(:each_with_object)
+
+  # Convert an enumerable to a hash. Examples:
+  #
+  #   people.index_by(&:login)
+  #     => { "nextangle" => <Person ...>, "chade-" => <Person ...>, ...}
+  #   people.index_by { |person| "#{person.first_name} #{person.last_name}" }
+  #     => { "Chade- Fowlersburg-e" => <Person ...>, "David Heinemeier Hansson" => <Person ...>, ...}
+  #
+  def index_by
+    inject({}) do |accum, elem|
+      accum[yield(elem)] = elem
+      accum
+    end
+  end
+
+  # Returns true if the collection has more than 1 element. Functionally equivalent to collection.size > 1.
+  # Works with a block too ala any?, so people.many? { |p| p.age > 26 } # => returns true if more than 1 person is over 26.
+  def many?(&block)
+    size = block_given? ? select(&block).size : self.size
+    size > 1
+  end
+
+  # Returns true if none of the elements match the given block.
+  #
+  #   success = responses.none? {|r| r.status / 100 == 5 }
+  #
+  # This is a builtin method in Ruby 1.8.7 and later.
+  def none?(&block)
+    !any?(&block)
+  end unless [].respond_to?(:none?)
+end
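Note: a quick composite check of the new helpers, runnable in irb once this file is required (expected results follow from the doc comments above):

    [5, 15, 10].sum                       # => 30
    %w(a bb ccc).index_by{|s| s.length }  # => {1=>"a", 2=>"bb", 3=>"ccc"}
    [1, 2].many?                          # => true
    [1, 2].none?{|n| n > 5 }              # => true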