wukong 1.4.9 → 1.4.10

data/TODO.textile ADDED
@@ -0,0 +1,13 @@
+
+
+
+ * add GEM_PATH to hadoop_recycle_env
+
+ * Hadoop_command function received an array for the input_path parameter
+
+ ** We should be able to specify comma *or* space separated paths; the last
+ space-separated path in Settings.rest becomes the output file, the others are
+ used as the input_file list.
+
+ * Make configliere Settings and streamer_instance.options() be the same
+ thing. (instead of almost-but-confusingly-not-always the same thing).
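
The comma- or space-separated path handling described in the TODO above could look roughly like the sketch below; the method name and details are illustrative assumptions, not the gem's actual implementation.

    # Hypothetical helper: take Settings.rest, treat the last space-separated
    # argument as the output path, and expand comma-separated inputs.
    def split_io_paths(rest)
      output = rest.last
      inputs = rest[0..-2].map{|arg| arg.split(',') }.flatten
      [inputs, output]
    end

    # e.g. `my_script.rb in_a,in_b in_c out_dir` would give
    # inputs = ["in_a", "in_b", "in_c"], output = "out_dir"
    inputs, output = split_io_paths(%w[in_a,in_b in_c out_dir])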
data/bin/hdp-bzip CHANGED
@@ -14,15 +14,15 @@ echo "Removing output directory $OUTPUT"
  hadoop fs -rmr $OUTPUT
 
  cmd="${HADOOP_HOME}/bin/hadoop \
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
  -jobconf mapred.output.compress=true \
  -jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
  -jobconf mapred.reduce.tasks=1 \
  -mapper \"/bin/cat\" \
- -reducer \"/usr/bin/uniq\" \
+ -reducer \"/bin/cat\" \
  $INPUTS
  -output $OUTPUT \
  "
  echo $cmd
- $cmd
+ $cmd
data/bin/hdp-kill-task ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+
+ exec hadoop fs -kill-task "$1"
data/bin/hdp-mkdir CHANGED
@@ -1,3 +1,2 @@
  #!/usr/bin/env bash
-
  exec hadoop fs -mkdir "$@"
data/bin/hdp-put CHANGED
@@ -1,3 +1,3 @@
  #!/usr/bin/env bash
 
- exec hadoop dfs -put "$1" "$2"
+ exec hadoop dfs -put "$@"
data/bin/hdp-sort CHANGED
@@ -8,7 +8,7 @@ reduce_script=${1-/usr/bin/uniq} ; shift
  partfields=${1-2} ; shift
  sortfields=${1-2} ; shift
 
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [sortfields=2] [partfields=1] [extra_args]" ; exit ; fi
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
 
  HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
@@ -16,7 +16,9 @@ cmd="${HADOOP_HOME}/bin/hadoop \
  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
  -jobconf num.key.fields.for.partition=\"$partfields\"
- -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+ -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+ -jobconf stream.map.output.field.separator=\"'/t'\"
+ -jobconf mapred.text.key.partitioner.options=\"-k1,$partfields\"
  -mapper \"$map_script\"
  -reducer \"$reduce_script\"
  -input \"$input_file\"
@@ -28,22 +30,8 @@ echo "$cmd"
 
  $cmd
 
- # -jobconf mapred.text.key.partitioner.options="-k1,$partfields" \
- # -jobconf stream.map.output.field.separator='\t' \
- # -jobconf map.output.key.field.separator='\t' \
- # -jobconf mapred.map.tasks=3 \
- # -jobconf mapred.reduce.tasks=3 \
-
+ # Maybe?
  #
- # TODO:
- # http://issues.apache.org/jira/browse/MAPREDUCE-594
- # http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/KeyValueTextInputFormat.html
- # Instead of /bin/cat, Identity can be (I think)
  # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
  # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
- # ...
- #
- # TODO
  #
- # New-style secondary sort:
- # http://hadoop.apache.org/common/docs/r0.20.0/streaming.html
data/bin/hdp-stream CHANGED
@@ -8,7 +8,7 @@ reduce_script=${1-/usr/bin/uniq} ; shift
  partfields=${1-2} ; shift
  sortfields=${1-2} ; shift
 
- if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [sortfields=2] [partfields=1] [extra_args]" ; exit ; fi
+ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
 
  HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
@@ -16,7 +16,9 @@ cmd="${HADOOP_HOME}/bin/hadoop \
  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
  -jobconf num.key.fields.for.partition=\"$partfields\"
- -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+ -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+ -jobconf stream.map.output.field.separator=\"'/t'\"
+ -jobconf mapred.text.key.partitioner.options=\"-k1,$partfields\"
  -mapper \"$map_script\"
  -reducer \"$reduce_script\"
  -input \"$input_file\"
@@ -28,22 +30,8 @@ echo "$cmd"
 
  $cmd
 
- # -jobconf mapred.text.key.partitioner.options="-k1,$partfields" \
- # -jobconf stream.map.output.field.separator='\t' \
- # -jobconf map.output.key.field.separator='\t' \
- # -jobconf mapred.map.tasks=3 \
- # -jobconf mapred.reduce.tasks=3 \
-
+ # Maybe?
  #
- # TODO:
- # http://issues.apache.org/jira/browse/MAPREDUCE-594
- # http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/KeyValueTextInputFormat.html
- # Instead of /bin/cat, Identity can be (I think)
  # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
  # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
- # ...
- #
- # TODO
  #
- # New-style secondary sort:
- # http://hadoop.apache.org/common/docs/r0.20.0/streaming.html
data/bin/hdp-stream-flat CHANGED
@@ -1,9 +1,9 @@
  #!/usr/bin/env bash
 
- input_file=${1} ; shift
- output_file=${1} ; shift
- map_script=${1-/bin/cat} ; shift
- reduce_script=${1-/usr/bin/uniq} ; shift
+ input_file="${1}" ; shift
+ output_file="${1}" ; shift
+ map_script="${1-/bin/cat}" ; shift
+ reduce_script="${1-/usr/bin/uniq}" ; shift
 
  if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
 
@@ -14,7 +14,7 @@ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
  # -jobconf mapred.reduce.tasks=3 \
 
  exec ${HADOOP_HOME}/bin/hadoop \
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar \
+ jar ${HADOOP_HOME}/contrib/streaming/hadoop*streaming*.jar \
  "$@" \
  -jobconf "mapred.job.name=`basename $0`-$map_script-$input_file-$output_file" \
  -mapper "$map_script" \
data/bin/wu-sum CHANGED
@@ -1,4 +1,5 @@
  #!/usr/bin/env ruby
+ require 'rubygems'
  require 'wukong'
  require 'wukong/streamer/summing_reducer'
 
@@ -0,0 +1,90 @@
+ job_201006200508_0002 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b 100.00%
+ s3 => hdfs bz2 parser, cond_em empty (?)
+ 201006200508_0002 35mins, 34sec 1 1812031232 0 12495736645 7240978546 8180472 388863907 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
+
+ job_201006200508_0003 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes 100.00%
+ s3 => hdfs bz2 parser, cond_em duplicate
+ 201006200508_0003 15mins, 50sec 1 1812031232 0 11877866580 7240978546 8180472 383928615 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
+
+ job_201006200508_0004 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2 100.00%
+ hdfs => hdfs bz2 parser, cond_em empty
+ 201006200508_0004 36mins, 56sec 1 1812031232 13334645497 7240978546 8180472 395564272 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
+
+ job_201006200508_0005 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em 100.00%
+ hdfs => hdfs bz2 parser, no_cond_em --
+ 201006200508_0005 35mins, 23sec 1 1812031232 13479823318 7240978546 8180472 396757046 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
+
+ job_201006200508_0006 NORMAL flip hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111 100.00%
+ hdfs => hdfs bz2 `which cat`
+ 201006200508_0006 1mins, 10sec 1 1812031232 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
+
+ job_201006200508_0007 NORMAL flip hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n 100.00%
+ s3 => hdfs bz2 `which cat`
+ 201006200508_0007 1mins, 55sec 1 1812031232 0 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
+
+ job_201006200508_0008 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db 100.00%
+ hdfs => hdfs flat parser no cond_em no db
+ 201006200508_0008 10mins, 59sec 1 7240978549 13545881166 7240978549 8180472 397172723 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
+
+ job_201006200508_0015 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db 100.00%
+ hdfs => hdfs flat parser cond_em on users only no DB
+ 201006200508_0015 23mins, 48sec 1 7240978549 13415414554 7240978549 8180472 396101235 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
+
+ job_201006200508_0016 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-nodupes 100.00%
+ hdfs => hdfs flat parser cond_em on users only - vanished saving id/sn to DB
+ 201006200508_0016 28mins, 7sec 1 0 7240978549 13414285504 7240978549 8180472 396091251 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+
+ job_201006200508_0017 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes 100.00%
+ hdfs => hdfs flat parser cond_em on users only - duped saving id/sn to DB
+ 201006200508_0017 11mins, 51sec 1 0 7240978549 12221205449 7240978549 8180472 386114331 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+
+ ===========================================================================
+ == Parse
+ ==
+
+ job_201006200508_0018 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056 100.00%
+ 201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 141729936525 128606199040 14198839 3918844056 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
+
+ for foo in 0016 0017 0018 ; do echo $foo ; ~/ics/hadoop/chimpmark/bin/elephantscat.rb job_201006200508_$foo ; done
+ cat ~/timings/job/201006200508/*/*.tsv | wu-lign
+
+ job_id scraped_at run_time succ? s3n_in hdfs_in file_in hdfs_out file_out map_in map_out map_recs_in map_recs_out red_recs_in red_recs_out job_name
+ 201006200508_0002 35mins, 34sec 1 1812031232 0 0 12495736645 0 7240978546 0 8180472 388863907 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
+ 201006200508_0003 15mins, 50sec 1 1812031232 0 0 11877866580 0 7240978546 0 8180472 383928615 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
+ 201006200508_0004 36mins, 56sec 1 1812031232 0 13334645497 0 7240978546 0 8180472 395564272 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
+ 201006200508_0005 35mins, 23sec 1 1812031232 0 13479823318 0 7240978546 0 8180472 396757046 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
+ 201006200508_0006 1mins, 10sec 1 1812031232 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
+ 201006200508_0007 1mins, 55sec 1 1812031232 0 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
+ 201006200508_0008 10mins, 59sec 1 7240978549 0 13545881166 0 7240978549 0 8180472 397172723 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
+ 201006200508_0015 23mins, 48sec 1 7240978549 0 13415414554 0 7240978549 0 8180472 396101235 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
+ 201006200508_0016 28mins, 7sec 1 7240978549 0 13414285504 0 7240978549 0 8180472 396091251 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+ 201006200508_0017 11mins, 51sec 1 7240978549 0 12221205449 0 7240978549 0 8180472 386114331 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+ 201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 0 0 141729936525 0 128606199040 0 14198839 3918844056 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
+ 201006200508_0021 8hrs, 50mins, 52sec 1 141779023755 62208536220 24722859867 73825391771 141729936525 189098533358 3918844056 3918844056 155139258 155139258 Unsplicer
+ 201006200508_0029 1mins, 20sec 1 1763173995 0 1762322014 0 1762322014 0 22764940 22764940 0 0 hdp-stream-flat-/bin/cat-/data/sn/tw/rawd/unspliced/twitter_user-/tmp/foo
+ 201006200508_0031 3hrs, 48mins, 6sec 1 14930014182 0 0 48106164389 0 113092707367 0 8408164 753481311 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/201004---/data/sn/tw/rawd/parsed/api/201004
+ 201006200508_0034 30mins, 46sec 1 7170990599 2203578261 8389754083 5031160348 7170990599 7170990510 143461243 143461241 143461241 67443309 bulk_load_conversation.rb---/data/sn/tw/fixd/objects/a_replies_b---/data/sn/tw/fixd/apeyeye/conversation/a_replies_b_json
+
+ Identity mapper Wukong `which cat` pig
+ Identity reducer wukong `which cat` pig
+ * no skew
+ * data/reducer > ram
+
+ Do a sort|uniq on 150GB
+
+
+ * 1.8 GB bz2, S3 => HDFS 1m55s
+ * 1.8 GB bz2, HDFS => HDFS 1m10s
+
+ TokyoTyrant, 1 node => 4 m1.large (Balancer) 15_000 inserts/sec
+ TokyoTyrant, 20 tasks => 4 m1.large (Balancer) 2_000 inserts/sec
+
+ ===========================================================================
+
+ Parse:
+
+ hdp-du s3n://monkeyshines.infochimps.org/data/ripd/com.tw/\*/ > /mnt/tmp/ripd_com.tw-du.tsv
+
+
+
+ 1050 entries 448483502374 417.7 GB
@@ -0,0 +1,142 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'wukong'
+ require 'wukong/streamer/count_keys'
+
+
+ #
+ # Ch3ck out dis moist azz code bitches!!
+ #
+ #
+
+ #
+ # Do nothing more than bin users here, arbitrary and probably bad
+ #
+ class Mapper < Wukong::Streamer::RecordStreamer
+ def process rank, followers
+ followers = followers.to_i
+ if followers > 100
+ yield [9,rank]
+ elsif followers > 75
+ yield [8,rank]
+ elsif followers > 50
+ yield [7,rank]
+ elsif followers > 25
+ yield [6,rank]
+ elsif followers > 15
+ yield [5,rank]
+ elsif followers > 10
+ yield [4,rank]
+ elsif followers > 5
+ yield [3,rank]
+ elsif followers > 4
+ yield [2,rank]
+ elsif followers > 1
+ yield [1,rank]
+ else
+ yield [0,rank]
+ end
+ end
+ end
+
+
+ #
+ # Calculate percentile rank for every pr value in a given follower bracket
+ #
+ class Reducer < Wukong::Streamer::AccumulatingReducer
+ attr_accessor :count_bin
+ def start! bin, rank
+ self.count_bin ||= {}
+ self.count_bin[bin] ||= {}
+ end
+
+ def accumulate bin, rank
+ rank = (rank.to_f*10.0).round.to_f/10.0
+ self.count_bin[bin][rank] ||= 0
+ self.count_bin[bin][rank] += 1
+ end
+
+ def finalize
+ count_bin[key] = generate_all_pairs(key).inject({}){|h,pair| h[pair.first] = pair.last; h}
+ yield [key, count_bin[key].values.sort.join(",")]
+ end
+
+ #
+ # Write the final table to disk as a ruby hash
+ #
+ def after_stream
+ table = File.open("trstrank_table.rb", 'w')
+ table << "TRSTRANK_TABLE = " << count_bin.inspect
+ table.close
+ end
+
+ #
+ # Return percentile of a given trstrank for a given follower bracket
+ #
+ def percentile bin, rank
+ ((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
+ end
+
+ #
+ # Return the count of values less than rank
+ #
+ def count_less_than bin, rank
+ count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f < rank; count}
+ end
+
+ #
+ # Return the count of rank
+ #
+ def frequency_of bin, rank
+ count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f == rank; count}
+ end
+
+ #
+ # Return the total number in sample
+ #
+ def total_num bin
+ count_bin[bin].values.inject(0){|count,v| count += v; count}
+ end
+
+ #
+ # Generate a list of all pairs {trstrank => percentile}, interpolate when necessary
+ #
+ def generate_all_pairs bin
+ h = {}
+ count_bin[bin].keys.each do |rank|
+ h[rank.to_f] = percentile(bin, rank.to_f)
+ end
+ h[0.0] ||= 0.0
+ h[10.0] ||= 100.0
+ arr = h.to_a.sort!{|x,y| x.first <=> y.first}
+ list = arr.zip(arr[1..-1])
+ big_list = []
+ big_list << [0.0,0.0]
+ list.each do |pairs|
+ interpolate(pairs.first, pairs.last, 0.1).each{|pair| big_list << pair}
+ end
+ big_list.uniq.sort{|x,y| x.first <=> y.first}
+ end
+
+
+ #
+ # Nothing to see here, move along
+ #
+ def interpolate pair1, pair2, dx
+ return [pair1] if pair2.blank?
+ m = (pair2.last - pair1.last)/(pair2.first - pair1.first) # slope
+ b = pair2.last - m*pair2.first # y intercept
+ num = ((pair2.first - pair1.first)/dx).abs.round # number of points to interpolate
+ points = []
+ num.times do |i|
+ x = pair1.first + (i+1).to_f*dx
+ y = m*x + b
+ points << [x,y]
+ end
+ points # return an array of pairs
+ end
+
+ end
+
+ Wukong::Script.new(Mapper,Reducer).run
@@ -0,0 +1,52 @@
+ #!/usr/bin/env ruby
+ require 'wukong'
+
+ #
+ # Bigram counts
+ #
+ # head -n 100 /usr/share/dict/words | ./examples/corpus/words_to_bigrams.rb | sort | /tmp/words_to_bigrams.rb
+ #
+
+
+ #
+ # Kludge to work in Elastic map reduce:
+ #
+ # If your script is ./examples/corpus/words_to_bigrams.rb, make symlinks
+ # to it from ./examples/corpus/words_to_bigrams__map.rb and
+ # ./examples/corpus/words_to_bigrams__reduce.rb
+ #
+ if $0 =~ /__(map|reduce)\.rb$/
+ Settings[$1.to_sym] = true
+ end
+
+
+ #
+ # given one word per line
+ # emits all successive pairs of characters in that word
+ # eg 'boooo-urns' yields
+ # bo oo oo oo o- -u ur rn ns
+ #
+ class WordNGrams < Wukong::Streamer::Base
+ def process word
+ word[0..-2].chars.zip(word[1..-1].chars).each do |ngram_2|
+ yield ngram_2.join('')
+ end
+ end
+ end
+
+ #
+ # number of unique keys in a row
+ #
+ class KeyCountStreamer < Wukong::Streamer::AccumulatingReducer
+ def start! *args
+ @count = 0
+ end
+ def accumulate *args
+ @count += 1
+ end
+ def finalize
+ yield [key, @count]
+ end
+ end
+
+ Wukong::Script.new(WordNGrams, KeyCountStreamer).run
@@ -0,0 +1,41 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'wukong'
+
+ # An example (and test) of streaming batches of data into distributed cassandra db
+ # Stream in whatever you like that has a key and value. Notice that you must
+ # have already defined a column space called 'Cruft' in storage-conf.xml as well
+ # as a column family called 'OhBaby'
+
+ class Mapper < Wukong::Streamer::CassandraStreamer
+
+ # you must redefine the column space, batch size, and db-seeds or they will
+ # be defaults. For testing on local machine simply seed db with 127.0.0.1:9160
+
+ def initialize *args
+ self.column_space = 'Cruft'
+ self.batch_size = 100
+ self.db_seeds = "127.0.0.1:9160"
+ super(*args)
+ @iter = 0
+ end
+
+ def process key, value, *_, &blk
+ insert_into_db(key, value)
+ yield [key, value] if (@iter %10 == 0)
+ end
+
+ # you must specify the column family, key, and value here
+ def insert_into_db key, value
+ @iter += 1
+ cassandra_db.insert(:OhBaby, key, {"value" => value}, :consistency => Cassandra::Consistency::ANY) unless key.blank?
+ end
+ end
+
+ #
+ # Executes the script
+ #
+ Wukong::Script.new(
+ Mapper,
+ nil
+ ).run
@@ -1,4 +1,5 @@
  #!/usr/bin/env ruby
+ require 'rubygems'
  $: << File.dirname(__FILE__)+'/../../lib'
  require 'wukong'
  require 'wukong/models/graph'; include Wukong::Models
@@ -53,8 +54,8 @@ module GenMultiEdge
  rel = m.captures.first
  src = src.to_i ; dest = dest.to_i
  return if ((src == 0) || (dest == 0))
- yield ["%010d"%src, "%010d"%dest, "a_#{rel}_b"]
- yield ["%010d"%dest, "%010d"%src, "b_#{rel}_a"]
+ yield [src, dest, "a_#{rel}_b"]
+ yield [dest, src, "b_#{rel}_a"]
  end
  end
 
@@ -1,5 +1,6 @@
  #!/usr/bin/env ruby
  $: << File.dirname(__FILE__)+'/../lib'
+ require 'rubygems'
  require 'wukong'
 
  #
@@ -2,6 +2,7 @@ require 'time'
  DateTime.class_eval do
  #
  # Parses the time but never fails.
+ # Return value is always in the UTC time zone.
  #
  # A flattened time -- a 12-digit YYYYmmddHHMMMSS -- is treated as a UTC
  # datetime.
@@ -9,11 +10,11 @@ DateTime.class_eval do
  def self.parse_safely dt
  begin
  if dt.to_s =~ /\A\d{12}Z?\z/
- parse(dt+'Z', true)
+ parse(dt+'Z', true).utc
  else
- parse(dt, true)
+ parse(dt, true).utc
  end
- rescue
+ rescue StandardError
  nil
  end
  end
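
A quick spot-check of the patched parse_safely, shown as hypothetical calls (the .utc conversion assumes the ActiveSupport-style DateTime extensions the gem loads):

    require 'wukong'   # assumption: pulls in the DateTime.parse_safely patch above

    DateTime.parse_safely('201006200508')              # 12-digit flattened time, read as UTC
    DateTime.parse_safely('2010-06-20T05:08:00-06:00') # parsed, then normalized to UTC
    DateTime.parse_safely('not a timestamp')           # => nil -- parse errors are swallowed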
@@ -0,0 +1,79 @@
+ module Enumerable
+
+ # Calculates a sum from the elements. Examples:
+ #
+ # payments.sum { |p| p.price * p.tax_rate }
+ # payments.sum(&:price)
+ #
+ # The latter is a shortcut for:
+ #
+ # payments.inject { |sum, p| sum + p.price }
+ #
+ # It can also calculate the sum without the use of a block.
+ #
+ # [5, 15, 10].sum # => 30
+ # ["foo", "bar"].sum # => "foobar"
+ # [[1, 2], [3, 1, 5]].sum => [1, 2, 3, 1, 5]
+ #
+ # The default sum of an empty list is zero. You can override this default:
+ #
+ # [].sum(Payment.new(0)) { |i| i.amount } # => Payment.new(0)
+ #
+ def sum(identity = 0, &block)
+ if block_given?
+ map(&block).sum(identity)
+ else
+ inject { |sum, element| sum + element } || identity
+ end
+ end
+
+ # Iterates over a collection, passing the current element *and* the
+ # +memo+ to the block. Handy for building up hashes or
+ # reducing collections down to one object. Examples:
+ #
+ # %w(foo bar).each_with_object({}) { |str, hsh| hsh[str] = str.upcase } #=> {'foo' => 'FOO', 'bar' => 'BAR'}
+ #
+ # *Note* that you can't use immutable objects like numbers, true or false as
+ # the memo. You would think the following returns 120, but since the memo is
+ # never changed, it does not.
+ #
+ # (1..5).each_with_object(1) { |value, memo| memo *= value } # => 1
+ #
+ def each_with_object(memo, &block)
+ returning memo do |m|
+ each do |element|
+ block.call(element, m)
+ end
+ end
+ end unless [].respond_to?(:each_with_object)
+
+ # Convert an enumerable to a hash. Examples:
+ #
+ # people.index_by(&:login)
+ # => { "nextangle" => <Person ...>, "chade-" => <Person ...>, ...}
+ # people.index_by { |person| "#{person.first_name} #{person.last_name}" }
+ # => { "Chade- Fowlersburg-e" => <Person ...>, "David Heinemeier Hansson" => <Person ...>, ...}
+ #
+ def index_by
+ inject({}) do |accum, elem|
+ accum[yield(elem)] = elem
+ accum
+ end
+ end
+
+ # Returns true if the collection has more than 1 element. Functionally equivalent to collection.size > 1.
+ # Works with a block too ala any?, so people.many? { |p| p.age > 26 } # => returns true if more than 1 person is over 26.
+ def many?(&block)
+ size = block_given? ? select(&block).size : self.size
+ size > 1
+ end
+
+ # Returns true if none of the elements match the given block.
+ #
+ # success = responses.none? {|r| r.status / 100 == 5 }
+ #
+ # This is a builtin method in Ruby 1.8.7 and later.
+ def none?(&block)
+ !any?(&block)
+ end unless [].respond_to?(:none?)
+ end
@@ -4,6 +4,7 @@
  #
  require 'wukong/extensions/blank'
  require 'wukong/extensions/class'
+ require 'wukong/extensions/enumerable'
  require 'wukong/extensions/symbol'
  require 'wukong/extensions/hash'
  require 'wukong/extensions/hash_like'