wukong 1.4.9 → 1.4.10
- data/TODO.textile +13 -0
- data/bin/hdp-bzip +3 -3
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-mkdir +0 -1
- data/bin/hdp-put +1 -1
- data/bin/hdp-sort +5 -17
- data/bin/hdp-stream +5 -17
- data/bin/hdp-stream-flat +5 -5
- data/bin/wu-sum +1 -0
- data/docpages/README-performance.textile +90 -0
- data/examples/binning_percentile_estimator.rb +142 -0
- data/examples/corpus/words_to_bigrams.rb +52 -0
- data/examples/keystore/cassandra_batch_test.rb +41 -0
- data/examples/network_graph/gen_multi_edge.rb +3 -2
- data/examples/sample_records.rb +1 -0
- data/lib/wukong/extensions/date_time.rb +4 -3
- data/lib/wukong/extensions/enumerable.rb +79 -0
- data/lib/wukong/extensions.rb +1 -0
- data/lib/wukong/keystore/redis_db.rb +24 -0
- data/lib/wukong/keystore/tyrant_db.rb +124 -0
- data/lib/wukong/keystore/tyrant_notes.textile +145 -0
- data/lib/wukong/periodic_monitor.rb +57 -0
- data/lib/wukong/script/hadoop_command.rb +3 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +1 -0
- data/lib/wukong/streamer/cassandra_streamer.rb +61 -0
- data/lib/wukong/streamer.rb +12 -10
- data/wukong.gemspec +34 -16
- metadata +60 -16
data/TODO.textile
ADDED
@@ -0,0 +1,13 @@
+
+
+
+* add GEM_PATH to hadoop_recycle_env
+
+* Hadoop_command function received an array for the input_path parameter
+
+** We should be able to specify comma *or* space separated paths; the last
+space-separated path in Settings.rest becomes the output file, the others are
+used as the input_file list.
+
+* Make configliere Settings and streamer_instance.options() be the same
+thing. (instead of almost-but-confusingly-not-always the same thing).
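Note: the comma-or-space path item above could be closed with a small splitting helper. The sketch below is hypothetical (split_io_paths is not part of this release) and uses a plain array in place of configliere's Settings.rest, just to illustrate the behavior the TODO describes:

    # Hypothetical helper: the last space-separated argument is the output
    # path; every other argument may hold one or more comma-separated inputs.
    def split_io_paths rest
      output = rest.last
      inputs = rest[0..-2].map{|arg| arg.split(',') }.flatten
      [inputs, output]
    end

    inputs, output = split_io_paths(%w[/data/a,/data/b /data/c /data/out])
    # inputs => ["/data/a", "/data/b", "/data/c"]; output => "/data/out"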
data/bin/hdp-bzip
CHANGED
@@ -14,15 +14,15 @@ echo "Removing output directory $OUTPUT"
 hadoop fs -rmr $OUTPUT
 
 cmd="${HADOOP_HOME}/bin/hadoop \
-  jar ${HADOOP_HOME}/contrib/streaming/hadoop
+  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
   -jobconf mapred.output.compress=true \
   -jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \
   -jobconf mapred.reduce.tasks=1 \
   -mapper \"/bin/cat\" \
-  -reducer \"/
+  -reducer \"/bin/cat\" \
   $INPUTS
   -output $OUTPUT \
 "
 echo $cmd
-$cmd
+$cmd
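Note: with the jar glob and the truncated -reducer line restored, hdp-bzip assembles a one-reducer identity streaming job whose only effect is to rewrite its inputs BZip2-compressed. A hypothetical invocation might look like `hdp-bzip /data/logs/2009 /data/logs/2009-bz2`; the argument handling that builds $INPUTS and $OUTPUT lives outside this hunk, so the exact calling convention is not shown here.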
data/bin/hdp-kill-task
ADDED
data/bin/hdp-mkdir
CHANGED
data/bin/hdp-put
CHANGED
data/bin/hdp-sort
CHANGED
@@ -8,7 +8,7 @@ reduce_script=${1-/usr/bin/uniq} ; shift
 partfields=${1-2} ; shift
 sortfields=${1-2} ; shift
 
-if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [
+if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
 
 HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
@@ -16,7 +16,9 @@ cmd="${HADOOP_HOME}/bin/hadoop \
   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
   -jobconf num.key.fields.for.partition=\"$partfields\"
-  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.map.output.field.separator=\"'/t'\"
+  -jobconf mapred.text.key.partitioner.options=\"-k1,$partfields\"
   -mapper \"$map_script\"
   -reducer \"$reduce_script\"
   -input \"$input_file\"
@@ -28,22 +30,8 @@ echo "$cmd"
 
 $cmd
 
-#
-# -jobconf stream.map.output.field.separator='\t' \
-# -jobconf map.output.key.field.separator='\t' \
-# -jobconf mapred.map.tasks=3 \
-# -jobconf mapred.reduce.tasks=3 \
-
+# Maybe?
 #
-# TODO:
-# http://issues.apache.org/jira/browse/MAPREDUCE-594
-# http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/KeyValueTextInputFormat.html
-# Instead of /bin/cat, Identity can be (I think)
 # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
 # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-# ...
-#
-# TODO
 #
-# New-style secondary sort:
-# http://hadoop.apache.org/common/docs/r0.20.0/streaming.html
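Note: the two added -jobconf lines make the sort/partition split explicit. stream.num.map.output.key.fields sets how many leading fields form the map output key (the sort key), while mapred.text.key.partitioner.options (-k1,$partfields) tells KeyFieldBasedPartitioner to partition on only the first $partfields fields; together they give a secondary sort. For example, a hypothetical `hdp-sort input_dir output_dir /bin/cat /usr/bin/uniq 1 2` would group rows by field 1 but deliver them to each reducer sorted on fields 1-2.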
data/bin/hdp-stream
CHANGED
@@ -8,7 +8,7 @@ reduce_script=${1-/usr/bin/uniq} ; shift
 partfields=${1-2} ; shift
 sortfields=${1-2} ; shift
 
-if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [
+if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" ; exit ; fi
 
 HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
@@ -16,7 +16,9 @@ cmd="${HADOOP_HOME}/bin/hadoop \
   jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
   -jobconf num.key.fields.for.partition=\"$partfields\"
-  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.num.map.output.key.fields=\"$sortfields\"
+  -jobconf stream.map.output.field.separator=\"'/t'\"
+  -jobconf mapred.text.key.partitioner.options=\"-k1,$partfields\"
   -mapper \"$map_script\"
   -reducer \"$reduce_script\"
   -input \"$input_file\"
@@ -28,22 +30,8 @@ echo "$cmd"
 
 $cmd
 
-#
-# -jobconf stream.map.output.field.separator='\t' \
-# -jobconf map.output.key.field.separator='\t' \
-# -jobconf mapred.map.tasks=3 \
-# -jobconf mapred.reduce.tasks=3 \
-
+# Maybe?
 #
-# TODO:
-# http://issues.apache.org/jira/browse/MAPREDUCE-594
-# http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/KeyValueTextInputFormat.html
-# Instead of /bin/cat, Identity can be (I think)
 # -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
 # -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-# ...
-#
-# TODO
 #
-# New-style secondary sort:
-# http://hadoop.apache.org/common/docs/r0.20.0/streaming.html
data/bin/hdp-stream-flat
CHANGED
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
 
-input_file
-output_file
-map_script
-reduce_script
+input_file="${1}" ; shift
+output_file="${1}" ; shift
+map_script="${1-/bin/cat}" ; shift
+reduce_script="${1-/usr/bin/uniq}" ; shift
 
 if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [extra_args]" ; exit ; fi
 
@@ -14,7 +14,7 @@ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 # -jobconf mapred.reduce.tasks=3 \
 
 exec ${HADOOP_HOME}/bin/hadoop \
-  jar ${HADOOP_HOME}/contrib/streaming/hadoop
+  jar ${HADOOP_HOME}/contrib/streaming/hadoop*streaming*.jar \
   "$@" \
   -jobconf "mapred.job.name=`basename $0`-$map_script-$input_file-$output_file" \
   -mapper "$map_script" \
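Note: because the rewritten header shifts each positional parameter off as it is read, anything remaining in "$@" is spliced straight into the hadoop invocation ahead of the -jobconf flags. A hypothetical call overriding the reducer count: `hdp-stream-flat input_dir output_dir /bin/cat /usr/bin/uniq -jobconf mapred.reduce.tasks=10`.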
data/bin/wu-sum
CHANGED
data/docpages/README-performance.textile
ADDED
@@ -0,0 +1,90 @@
+job_201006200508_0002 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b 100.00%
+s3 => hdfs bz2 parser, cond_em empty (?)
+201006200508_0002 35mins, 34sec 1 1812031232 0 12495736645 7240978546 8180472 388863907 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
+
+job_201006200508_0003 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes 100.00%
+s3 => hdfs bz2 parser, cond_em duplicate
+201006200508_0003 15mins, 50sec 1 1812031232 0 11877866580 7240978546 8180472 383928615 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
+
+job_201006200508_0004 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2 100.00%
+hdfs => hdfs bz2 parser, cond_em empty
+201006200508_0004 36mins, 56sec 1 1812031232 13334645497 7240978546 8180472 395564272 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
+
+job_201006200508_0005 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em 100.00%
+hdfs => hdfs bz2 parser, no_cond_em --
+201006200508_0005 35mins, 23sec 1 1812031232 13479823318 7240978546 8180472 396757046 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
+
+job_201006200508_0006 NORMAL flip hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111 100.00%
+hdfs => hdfs bz2 `which cat`
+201006200508_0006 1mins, 10sec 1 1812031232 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
+
+job_201006200508_0007 NORMAL flip hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n 100.00%
+s3 => hdfs bz2 `which cat`
+201006200508_0007 1mins, 55sec 1 1812031232 0 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
+
+job_201006200508_0008 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db 100.00%
+hdfs => hdfs flat parser no cond_em no db
+201006200508_0008 10mins, 59sec 1 7240978549 13545881166 7240978549 8180472 397172723 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
+
+job_201006200508_0015 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db 100.00%
+hdfs => hdfs flat parser cond_em on users only no DB
+201006200508_0015 23mins, 48sec 1 7240978549 13415414554 7240978549 8180472 396101235 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
+
+job_201006200508_0016 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-nodupes 100.00%
+hdfs => hdfs flat parser cond_em on users only - vanished saving id/sn to DB
+201006200508_0016 28mins, 7sec 1 0 7240978549 13414285504 7240978549 8180472 396091251 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+
+job_201006200508_0017 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes 100.00%
+hdfs => hdfs flat parser cond_em on users only - duped saving id/sn to DB
+201006200508_0017 11mins, 51sec 1 0 7240978549 12221205449 7240978549 8180472 386114331 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+
+===========================================================================
+== Parse
+==
+
+job_201006200508_0018 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056 100.00%
+201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 141729936525 128606199040 14198839 3918844056 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
+
+for foo in 0016 0017 0018 ; do echo $foo ; ~/ics/hadoop/chimpmark/bin/elephantscat.rb job_201006200508_$foo ; done
+cat ~/timings/job/201006200508/*/*.tsv | wu-lign
+
+job_id scraped_at run_time succ? s3n_in hdfs_in file_in hdfs_out file_out map_in map_out map_recs_in map_recs_out red_recs_in red_recs_out job_name
+201006200508_0002 35mins, 34sec 1 1812031232 0 0 12495736645 0 7240978546 0 8180472 388863907 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
+201006200508_0003 15mins, 50sec 1 1812031232 0 0 11877866580 0 7240978546 0 8180472 383928615 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
+201006200508_0004 36mins, 56sec 1 1812031232 0 13334645497 0 7240978546 0 8180472 395564272 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
+201006200508_0005 35mins, 23sec 1 1812031232 0 13479823318 0 7240978546 0 8180472 396757046 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
+201006200508_0006 1mins, 10sec 1 1812031232 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
+201006200508_0007 1mins, 55sec 1 1812031232 0 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
+201006200508_0008 10mins, 59sec 1 7240978549 0 13545881166 0 7240978549 0 8180472 397172723 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
+201006200508_0015 23mins, 48sec 1 7240978549 0 13415414554 0 7240978549 0 8180472 396101235 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
+201006200508_0016 28mins, 7sec 1 7240978549 0 13414285504 0 7240978549 0 8180472 396091251 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+201006200508_0017 11mins, 51sec 1 7240978549 0 12221205449 0 7240978549 0 8180472 386114331 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
+201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 0 0 141729936525 0 128606199040 0 14198839 3918844056 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
+201006200508_0021 8hrs, 50mins, 52sec 1 141779023755 62208536220 24722859867 73825391771 141729936525 189098533358 3918844056 3918844056 155139258 155139258 Unsplicer
+201006200508_0029 1mins, 20sec 1 1763173995 0 1762322014 0 1762322014 0 22764940 22764940 0 0 hdp-stream-flat-/bin/cat-/data/sn/tw/rawd/unspliced/twitter_user-/tmp/foo
+201006200508_0031 3hrs, 48mins, 6sec 1 14930014182 0 0 48106164389 0 113092707367 0 8408164 753481311 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/201004---/data/sn/tw/rawd/parsed/api/201004
+201006200508_0034 30mins, 46sec 1 7170990599 2203578261 8389754083 5031160348 7170990599 7170990510 143461243 143461241 143461241 67443309 bulk_load_conversation.rb---/data/sn/tw/fixd/objects/a_replies_b---/data/sn/tw/fixd/apeyeye/conversation/a_replies_b_json
+
+Identity mapper Wukong `which cat` pig
+Identity reducer wukong `which cat` pig
+* no skew
+* data/reducer > ram
+
+Do a sort|uniq on 150GB
+
+
+* 1.8 GB bz2, S3 => HDFS 1m55s
+* 1.8 GB bz2, HDFS => HDFS 1m10s
+
+TokyoTyrant, 1 node => 4 m1.large (Balancer) 15_000 inserts/sec
+TokyoTyrant, 20 tasks => 4 m1.large (Balancer) 2_000 inserts/sec
+
+===========================================================================
+
+Parse:
+
+hdp-du s3n://monkeyshines.infochimps.org/data/ripd/com.tw/\*/ > /mnt/tmp/ripd_com.tw-du.tsv
+
+
+
+1050 entries 448483502374 417.7 GB
data/examples/binning_percentile_estimator.rb
ADDED
@@ -0,0 +1,142 @@
+#!/usr/bin/env ruby
+
+require 'rubygems'
+require 'wukong'
+require 'wukong/streamer/count_keys'
+
+
+#
+# Ch3ck out dis moist azz code bitches!!
+#
+#
+
+#
+# Do nothing more than bin users here, arbitrary and probably bad
+#
+class Mapper < Wukong::Streamer::RecordStreamer
+  def process rank, followers
+    followers = followers.to_i
+    if followers > 100
+      yield [9,rank]
+    elsif followers > 75
+      yield [8,rank]
+    elsif followers > 50
+      yield [7,rank]
+    elsif followers > 25
+      yield [6,rank]
+    elsif followers > 15
+      yield [5,rank]
+    elsif followers > 10
+      yield [4,rank]
+    elsif followers > 5
+      yield [3,rank]
+    elsif followers > 4
+      yield [2,rank]
+    elsif followers > 1
+      yield [1,rank]
+    else
+      yield [0,rank]
+    end
+  end
+end
+
+
+#
+# Calculate percentile rank for every pr value in a given follower bracket
+#
+class Reducer < Wukong::Streamer::AccumulatingReducer
+  attr_accessor :count_bin
+  def start! bin, rank
+    self.count_bin ||= {}
+    self.count_bin[bin] ||= {}
+  end
+
+  def accumulate bin, rank
+    rank = (rank.to_f*10.0).round.to_f/10.0
+    self.count_bin[bin][rank] ||= 0
+    self.count_bin[bin][rank] += 1
+  end
+
+  def finalize
+    count_bin[key] = generate_all_pairs(key).inject({}){|h,pair| h[pair.first] = pair.last; h}
+    yield [key, count_bin[key].values.sort.join(",")]
+  end
+
+  #
+  # Write the final table to disk as a ruby hash
+  #
+  def after_stream
+    table = File.open("trstrank_table.rb", 'w')
+    table << "TRSTRANK_TABLE = " << count_bin.inspect
+    table.close
+  end
+
+  #
+  # Return percentile of a given trstrank for a given follower bracket
+  #
+  def percentile bin, rank
+    ((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
+  end
+
+  #
+  # Return the count of values less than rank
+  #
+  def count_less_than bin, rank
+    count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f < rank; count}
+  end
+
+  #
+  # Return the count of rank
+  #
+  def frequency_of bin, rank
+    count_bin[bin].keys.inject(0){|count,key| count += count_bin[bin][key] if key.to_f == rank; count}
+  end
+
+  #
+  # Return the total number in sample
+  #
+  def total_num bin
+    count_bin[bin].values.inject(0){|count,v| count += v; count}
+  end
+
+  #
+  # Generate a list of all pairs {trstrank => percentile}, interpolate when necessary
+  #
+  def generate_all_pairs bin
+    h = {}
+    count_bin[bin].keys.each do |rank|
+      h[rank.to_f] = percentile(bin, rank.to_f)
+    end
+    h[0.0] ||= 0.0
+    h[10.0] ||= 100.0
+    arr = h.to_a.sort!{|x,y| x.first <=> y.first}
+    list = arr.zip(arr[1..-1])
+    big_list = []
+    big_list << [0.0,0.0]
+    list.each do |pairs|
+      interpolate(pairs.first, pairs.last, 0.1).each{|pair| big_list << pair}
+    end
+    big_list.uniq.sort{|x,y| x.first <=> y.first}
+  end
+
+
+  #
+  # Nothing to see here, move along
+  #
+  def interpolate pair1, pair2, dx
+    return [pair1] if pair2.blank?
+    m = (pair2.last - pair1.last)/(pair2.first - pair1.first) # slope
+    b = pair2.last - m*pair2.first # y intercept
+    num = ((pair2.first - pair1.first)/dx).abs.round # number of points to interpolate
+    points = []
+    num.times do |i|
+      x = pair1.first + (i+1).to_f*dx
+      y = m*x + b
+      points << [x,y]
+    end
+    points # return an array of pairs
+  end
+
+end
+
+Wukong::Script.new(Mapper,Reducer).run
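Note: Reducer#percentile above implements the midpoint (half-credit-for-ties) percentile-rank formula, percentile = (count_below + 0.5 * count_equal) / total * 100. A minimal standalone check of that arithmetic on a toy histogram, independent of the streamer machinery (the values are illustrative only):

    # Toy {rank => count} histogram, mirroring one bin of Reducer#count_bin.
    hist  = { 1.0 => 2, 2.0 => 3, 3.0 => 5 }
    total = hist.values.inject(0){|s,v| s + v }                       # => 10
    rank  = 2.0
    below = hist.keys.inject(0){|c,k| c += hist[k] if k < rank; c }   # => 2
    equal = hist[rank] || 0                                           # => 3
    pct   = ((below + 0.5*equal) / total) * 100.0                     # => 35.0

So a rank of 2.0 sits at the 35th percentile of this sample.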
data/examples/corpus/words_to_bigrams.rb
ADDED
@@ -0,0 +1,52 @@
+#!/usr/bin/env ruby
+require 'wukong'
+
+#
+# Bigram counts
+#
+# head -n 100 /usr/share/dict/words | ./examples/corpus/words_to_bigrams.rb | sort | /tmp/words_to_bigrams.rb
+#
+
+
+#
+# Kludge to work in Elastic map reduce:
+#
+# If your script is ./examples/corpus/words_to_bigrams.rb, make symlinks
+# to it from ./examples/corpus/words_to_bigrams__map.rb and
+# ./examples/corpus/words_to_bigrams__reduce.rb
+#
+if $0 =~ /__(map|reduce)\.rb$/
+  Settings[$1.to_sym] = true
+end
+
+
+#
+# given one word per line
+# emits all successive pairs of characters in that word
+# eg 'boooo-urns' yields
+#   bo oo oo oo o- -u ur rn ns
+#
+class WordNGrams < Wukong::Streamer::Base
+  def process word
+    word[0..-2].chars.zip(word[1..-1].chars).each do |ngram_2|
+      yield ngram_2.join('')
+    end
+  end
+end
+
+#
+# number of unique keys in a row
+#
+class KeyCountStreamer < Wukong::Streamer::AccumulatingReducer
+  def start! *args
+    @count = 0
+  end
+  def accumulate *args
+    @count += 1
+  end
+  def finalize
+    yield [key, @count]
+  end
+end
+
+Wukong::Script.new(WordNGrams, KeyCountStreamer).run
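Note: WordNGrams#process leans on a chars/zip pairing; dropping the last character and zipping against the string shifted by one yields each adjacent pair. A standalone check of the 'boooo-urns' example from the comments (plain Ruby, no wukong required):

    word    = 'boooo-urns'
    bigrams = word[0..-2].chars.zip(word[1..-1].chars).map{|pair| pair.join('') }
    # => ["bo", "oo", "oo", "oo", "o-", "-u", "ur", "rn", "ns"]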
data/examples/keystore/cassandra_batch_test.rb
ADDED
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'wukong'
+
+# An example (and test) of streaming batches of data into distributed cassandra db
+# Stream in whatever you like that has a key and value. Notice that you must
+# have already defined a column space called 'Cruft' in storage-conf.xml as well
+# as a column family called 'OhBaby'
+
+class Mapper < Wukong::Streamer::CassandraStreamer
+
+  # you must redefine the column space, batch size, and db-seeds or they will
+  # be defaults. For testing on local machine simply seed db with 127.0.0.1:9160
+
+  def initialize *args
+    self.column_space = 'Cruft'
+    self.batch_size = 100
+    self.db_seeds = "127.0.0.1:9160"
+    super(*args)
+    @iter = 0
+  end
+
+  def process key, value, *_, &blk
+    insert_into_db(key, value)
+    yield [key, value] if (@iter %10 == 0)
+  end
+
+  # you must specify the column family, key, and value here
+  def insert_into_db key, value
+    @iter += 1
+    cassandra_db.insert(:OhBaby, key, {"value" => value}, :consistency => Cassandra::Consistency::ANY) unless key.blank?
+  end
+end
+
+#
+# Executes the script
+#
+Wukong::Script.new(
+  Mapper,
+  nil
+).run
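Note: nothing in the example creates the Cassandra schema; as the comments say, the 'Cruft' keyspace and 'OhBaby' column family must already exist in storage-conf.xml. Assuming a single local node seeded at 127.0.0.1:9160 and wukong's usual map-phase flag, a hypothetical smoke test would stream tab-separated key/value pairs through the mapper: `cat pairs.tsv | ./examples/keystore/cassandra_batch_test.rb --map`.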
data/examples/network_graph/gen_multi_edge.rb
CHANGED
@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+require 'rubygems'
 $: << File.dirname(__FILE__)+'/../../lib'
 require 'wukong'
 require 'wukong/models/graph'; include Wukong::Models
@@ -53,8 +54,8 @@ module GenMultiEdge
       rel = m.captures.first
       src = src.to_i ; dest = dest.to_i
       return if ((src == 0) || (dest == 0))
-      yield [
-      yield [
+      yield [src, dest, "a_#{rel}_b"]
+      yield [dest, src, "b_#{rel}_a"]
     end
   end
 
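Note: the restored yields emit each parsed edge in both directions. For a matched rel of "follows" with src 1 and dest 2, the mapper now produces [1, 2, "a_follows_b"] and [2, 1, "b_follows_a"], so downstream jobs can group on either endpoint.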
data/examples/sample_records.rb
CHANGED
data/lib/wukong/extensions/date_time.rb
CHANGED
@@ -2,6 +2,7 @@ require 'time'
 DateTime.class_eval do
   #
   # Parses the time but never fails.
+  # Return value is always in the UTC time zone.
   #
   # A flattened time -- a 12-digit YYYYmmddHHMMMSS -- is treated as a UTC
   # datetime.
@@ -9,11 +10,11 @@ DateTime.class_eval do
   def self.parse_safely dt
     begin
       if dt.to_s =~ /\A\d{12}Z?\z/
-        parse(dt+'Z', true)
+        parse(dt+'Z', true).utc
       else
-        parse(dt, true)
+        parse(dt, true).utc
      end
-    rescue
+    rescue StandardError
      nil
    end
  end
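Note: the net effect of the two hunks is that every successful parse is normalized to UTC, and only StandardError is swallowed, so signals and other Exception subclasses still propagate. A behavior sketch, assuming these extensions are loaded (.utc on DateTime is not core Ruby; it is assumed here to come from the support libraries wukong loads):

    DateTime.parse_safely('200907041234')   # 12-digit flat time, parsed as UTC
    DateTime.parse_safely('not a date')     # => nil instead of raising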
data/lib/wukong/extensions/enumerable.rb
ADDED
@@ -0,0 +1,79 @@
+module Enumerable
+
+  # Calculates a sum from the elements. Examples:
+  #
+  #   payments.sum { |p| p.price * p.tax_rate }
+  #   payments.sum(&:price)
+  #
+  # The latter is a shortcut for:
+  #
+  #   payments.inject { |sum, p| sum + p.price }
+  #
+  # It can also calculate the sum without the use of a block.
+  #
+  #   [5, 15, 10].sum # => 30
+  #   ["foo", "bar"].sum # => "foobar"
+  #   [[1, 2], [3, 1, 5]].sum => [1, 2, 3, 1, 5]
+  #
+  # The default sum of an empty list is zero. You can override this default:
+  #
+  #   [].sum(Payment.new(0)) { |i| i.amount } # => Payment.new(0)
+  #
+  def sum(identity = 0, &block)
+    if block_given?
+      map(&block).sum(identity)
+    else
+      inject { |sum, element| sum + element } || identity
+    end
+  end
+
+  # Iterates over a collection, passing the current element *and* the
+  # +memo+ to the block. Handy for building up hashes or
+  # reducing collections down to one object. Examples:
+  #
+  #   %w(foo bar).each_with_object({}) { |str, hsh| hsh[str] = str.upcase } #=> {'foo' => 'FOO', 'bar' => 'BAR'}
+  #
+  # *Note* that you can't use immutable objects like numbers, true or false as
+  # the memo. You would think the following returns 120, but since the memo is
+  # never changed, it does not.
+  #
+  #   (1..5).each_with_object(1) { |value, memo| memo *= value } # => 1
+  #
+  def each_with_object(memo, &block)
+    returning memo do |m|
+      each do |element|
+        block.call(element, m)
+      end
+    end
+  end unless [].respond_to?(:each_with_object)
+
+  # Convert an enumerable to a hash. Examples:
+  #
+  #   people.index_by(&:login)
+  #     => { "nextangle" => <Person ...>, "chade-" => <Person ...>, ...}
+  #   people.index_by { |person| "#{person.first_name} #{person.last_name}" }
+  #     => { "Chade- Fowlersburg-e" => <Person ...>, "David Heinemeier Hansson" => <Person ...>, ...}
+  #
+  def index_by
+    inject({}) do |accum, elem|
+      accum[yield(elem)] = elem
+      accum
+    end
+  end
+
+  # Returns true if the collection has more than 1 element. Functionally equivalent to collection.size > 1.
+  # Works with a block too ala any?, so people.many? { |p| p.age > 26 } # => returns true if more than 1 person is over 26.
+  def many?(&block)
+    size = block_given? ? select(&block).size : self.size
+    size > 1
+  end
+
+  # Returns true if none of the elements match the given block.
+  #
+  #   success = responses.none? {|r| r.status / 100 == 5 }
+  #
+  # This is a builtin method in Ruby 1.8.7 and later.
+  def none?(&block)
+    !any?(&block)
+  end unless [].respond_to?(:none?)
+end
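Note: a quick composite check of the new helpers, runnable in irb once this file is required (expected results follow from the doc comments above):

    [5, 15, 10].sum                       # => 30
    %w(a bb ccc).index_by{|s| s.length }  # => {1=>"a", 2=>"bb", 3=>"ccc"}
    [1, 2].many?                          # => true
    [1, 2].none?{|n| n > 5 }              # => true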