wukong 1.5.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (87) hide show
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
@@ -1,468 +0,0 @@
1
- {
2
- "protocol" : "Cassandra",
3
- "namespace" : "org.apache.cassandra.avro",
4
- "types" : [ {
5
- "type" : "enum",
6
- "name" : "AccessLevel",
7
- "symbols" : [ "NONE", "READONLY", "READWRITE", "FALL" ]
8
- }, {
9
- "type" : "record",
10
- "name" : "ColumnPath",
11
- "fields" : [ {
12
- "name" : "column_family",
13
- "type" : "string"
14
- }, {
15
- "name" : "super_column",
16
- "type" : [ "bytes", "null" ]
17
- }, {
18
- "name" : "column",
19
- "type" : [ "bytes", "null" ]
20
- } ]
21
- }, {
22
- "type" : "record",
23
- "name" : "ColumnParent",
24
- "fields" : [ {
25
- "name" : "column_family",
26
- "type" : "string"
27
- }, {
28
- "name" : "super_column",
29
- "type" : [ "bytes", "null" ]
30
- } ]
31
- }, {
32
- "type" : "record",
33
- "name" : "Clock",
34
- "fields" : [ {
35
- "name" : "timestamp",
36
- "type" : "long"
37
- } ]
38
- }, {
39
- "type" : "record",
40
- "name" : "Column",
41
- "fields" : [ {
42
- "name" : "name",
43
- "type" : "bytes"
44
- }, {
45
- "name" : "value",
46
- "type" : "bytes"
47
- }, {
48
- "name" : "clock",
49
- "type" : "Clock"
50
- }, {
51
- "name" : "ttl",
52
- "type" : [ "int", "null" ]
53
- } ]
54
- }, {
55
- "type" : "record",
56
- "name" : "SuperColumn",
57
- "fields" : [ {
58
- "name" : "name",
59
- "type" : "bytes"
60
- }, {
61
- "name" : "columns",
62
- "type" : {
63
- "type" : "array",
64
- "items" : "Column"
65
- }
66
- } ]
67
- }, {
68
- "type" : "record",
69
- "name" : "ColumnOrSuperColumn",
70
- "fields" : [ {
71
- "name" : "column",
72
- "type" : [ "Column", "null" ]
73
- }, {
74
- "name" : "super_column",
75
- "type" : [ "SuperColumn", "null" ]
76
- } ]
77
- }, {
78
- "type" : "record",
79
- "name" : "SliceRange",
80
- "fields" : [ {
81
- "name" : "start",
82
- "type" : "bytes"
83
- }, {
84
- "name" : "finish",
85
- "type" : "bytes"
86
- }, {
87
- "name" : "reversed",
88
- "type" : "boolean"
89
- }, {
90
- "name" : "count",
91
- "type" : "int"
92
- }, {
93
- "name" : "bitmasks",
94
- "type" : [ {
95
- "type" : "array",
96
- "items" : "bytes"
97
- }, "null" ]
98
- } ]
99
- }, {
100
- "type" : "record",
101
- "name" : "SlicePredicate",
102
- "fields" : [ {
103
- "name" : "column_names",
104
- "type" : [ {
105
- "type" : "array",
106
- "items" : "bytes"
107
- }, "null" ]
108
- }, {
109
- "name" : "slice_range",
110
- "type" : [ "SliceRange", "null" ]
111
- } ]
112
- }, {
113
- "type" : "record",
114
- "name" : "Deletion",
115
- "fields" : [ {
116
- "name" : "clock",
117
- "type" : "Clock"
118
- }, {
119
- "name" : "super_column",
120
- "type" : [ "bytes", "null" ]
121
- }, {
122
- "name" : "predicate",
123
- "type" : [ "SlicePredicate", "null" ]
124
- } ]
125
- }, {
126
- "type" : "record",
127
- "name" : "Mutation",
128
- "fields" : [ {
129
- "name" : "column_or_supercolumn",
130
- "type" : [ "ColumnOrSuperColumn", "null" ]
131
- }, {
132
- "name" : "deletion",
133
- "type" : [ "Deletion", "null" ]
134
- } ]
135
- }, {
136
- "type" : "enum",
137
- "name" : "IndexType",
138
- "symbols" : [ "KEYS" ]
139
- }, {
140
- "type" : "record",
141
- "name" : "ColumnDef",
142
- "fields" : [ {
143
- "name" : "name",
144
- "type" : "bytes"
145
- }, {
146
- "name" : "validation_class",
147
- "type" : "string"
148
- }, {
149
- "name" : "index_type",
150
- "type" : [ "IndexType", "null" ]
151
- }, {
152
- "name" : "index_name",
153
- "type" : [ "string", "null" ]
154
- } ]
155
- }, {
156
- "type" : "record",
157
- "name" : "CfDef",
158
- "fields" : [ {
159
- "name" : "keyspace",
160
- "type" : "string"
161
- }, {
162
- "name" : "name",
163
- "type" : "string"
164
- }, {
165
- "name" : "column_type",
166
- "type" : [ "string", "null" ]
167
- }, {
168
- "name" : "clock_type",
169
- "type" : [ "string", "null" ]
170
- }, {
171
- "name" : "comparator_type",
172
- "type" : [ "string", "null" ]
173
- }, {
174
- "name" : "subcomparator_type",
175
- "type" : [ "string", "null" ]
176
- }, {
177
- "name" : "reconciler",
178
- "type" : [ "string", "null" ]
179
- }, {
180
- "name" : "comment",
181
- "type" : [ "string", "null" ]
182
- }, {
183
- "name" : "row_cache_size",
184
- "type" : [ "double", "null" ]
185
- }, {
186
- "name" : "preload_row_cache",
187
- "type" : [ "boolean", "null" ]
188
- }, {
189
- "name" : "key_cache_size",
190
- "type" : [ "double", "null" ]
191
- }, {
192
- "name" : "read_repair_chance",
193
- "type" : [ "double", "null" ]
194
- }, {
195
- "name" : "gc_grace_seconds",
196
- "type" : [ "int", "null" ]
197
- }, {
198
- "name" : "column_metadata",
199
- "type" : [ {
200
- "type" : "array",
201
- "items" : "ColumnDef"
202
- }, "null" ]
203
- }, {
204
- "name" : "id",
205
- "type" : [ "int", "null" ]
206
- } ]
207
- }, {
208
- "type" : "record",
209
- "name" : "KsDef",
210
- "fields" : [ {
211
- "name" : "name",
212
- "type" : "string"
213
- }, {
214
- "name" : "strategy_class",
215
- "type" : "string"
216
- }, {
217
- "name" : "strategy_options",
218
- "type" : [ {
219
- "type" : "map",
220
- "values" : "string"
221
- }, "null" ]
222
- }, {
223
- "name" : "replication_factor",
224
- "type" : "int"
225
- }, {
226
- "name" : "cf_defs",
227
- "type" : {
228
- "type" : "array",
229
- "items" : "CfDef"
230
- }
231
- } ]
232
- }, {
233
- "type" : "record",
234
- "name" : "StreamingMutation",
235
- "fields" : [ {
236
- "name" : "key",
237
- "type" : "bytes"
238
- }, {
239
- "name" : "name",
240
- "type" : "bytes"
241
- }, {
242
- "name" : "value",
243
- "type" : "bytes"
244
- }, {
245
- "name" : "timestamp",
246
- "type" : "long"
247
- }, {
248
- "name" : "ttl",
249
- "type" : "int"
250
- } ]
251
- }, {
252
- "type" : "record",
253
- "name" : "MutationsMapEntry",
254
- "fields" : [ {
255
- "name" : "key",
256
- "type" : "bytes"
257
- }, {
258
- "name" : "mutations",
259
- "type" : {
260
- "type" : "map",
261
- "values" : {
262
- "type" : "array",
263
- "items" : "Mutation"
264
- }
265
- }
266
- } ]
267
- }, {
268
- "type" : "record",
269
- "name" : "CoscsMapEntry",
270
- "fields" : [ {
271
- "name" : "key",
272
- "type" : "bytes"
273
- }, {
274
- "name" : "columns",
275
- "type" : {
276
- "type" : "array",
277
- "items" : "ColumnOrSuperColumn"
278
- }
279
- } ]
280
- }, {
281
- "type" : "enum",
282
- "name" : "ConsistencyLevel",
283
- "symbols" : [ "ZERO", "ONE", "QUORUM", "DCQUORUM", "DCQUORUMSYNC", "ALL" ]
284
- }, {
285
- "type" : "error",
286
- "name" : "InvalidRequestException",
287
- "fields" : [ {
288
- "name" : "why",
289
- "type" : [ "string", "null" ]
290
- } ]
291
- }, {
292
- "type" : "error",
293
- "name" : "NotFoundException",
294
- "fields" : [ {
295
- "name" : "why",
296
- "type" : [ "string", "null" ]
297
- } ]
298
- }, {
299
- "type" : "error",
300
- "name" : "UnavailableException",
301
- "fields" : [ {
302
- "name" : "why",
303
- "type" : [ "string", "null" ]
304
- } ]
305
- }, {
306
- "type" : "error",
307
- "name" : "TimedOutException",
308
- "fields" : [ {
309
- "name" : "why",
310
- "type" : [ "string", "null" ]
311
- } ]
312
- } ],
313
- "messages" : {
314
- "get" : {
315
- "request" : [ {
316
- "name" : "key",
317
- "type" : "bytes"
318
- }, {
319
- "name" : "column_path",
320
- "type" : "ColumnPath"
321
- }, {
322
- "name" : "consistency_level",
323
- "type" : "ConsistencyLevel"
324
- } ],
325
- "response" : "ColumnOrSuperColumn",
326
- "errors" : [ "InvalidRequestException", "NotFoundException", "UnavailableException", "TimedOutException" ]
327
- },
328
- "get_slice" : {
329
- "request" : [ {
330
- "name" : "key",
331
- "type" : "bytes"
332
- }, {
333
- "name" : "column_parent",
334
- "type" : "ColumnParent"
335
- }, {
336
- "name" : "predicate",
337
- "type" : "SlicePredicate"
338
- }, {
339
- "name" : "consistency_level",
340
- "type" : "ConsistencyLevel"
341
- } ],
342
- "response" : {
343
- "type" : "array",
344
- "items" : "ColumnOrSuperColumn"
345
- },
346
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
347
- },
348
- "multiget_slice" : {
349
- "request" : [ {
350
- "name" : "keys",
351
- "type" : {
352
- "type" : "array",
353
- "items" : "bytes"
354
- }
355
- }, {
356
- "name" : "column_parent",
357
- "type" : "ColumnParent"
358
- }, {
359
- "name" : "predicate",
360
- "type" : "SlicePredicate"
361
- }, {
362
- "name" : "consistency_level",
363
- "type" : "ConsistencyLevel"
364
- } ],
365
- "response" : {
366
- "type" : "array",
367
- "items" : "CoscsMapEntry"
368
- },
369
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
370
- },
371
- "get_count" : {
372
- "request" : [ {
373
- "name" : "key",
374
- "type" : "bytes"
375
- }, {
376
- "name" : "column_parent",
377
- "type" : "ColumnParent"
378
- }, {
379
- "name" : "predicate",
380
- "type" : "SlicePredicate"
381
- }, {
382
- "name" : "consistency_level",
383
- "type" : "ConsistencyLevel"
384
- } ],
385
- "response" : "int",
386
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
387
- },
388
- "insert" : {
389
- "request" : [ {
390
- "name" : "key",
391
- "type" : "bytes"
392
- }, {
393
- "name" : "column_parent",
394
- "type" : "ColumnParent"
395
- }, {
396
- "name" : "column",
397
- "type" : "Column"
398
- }, {
399
- "name" : "consistency_level",
400
- "type" : "ConsistencyLevel"
401
- } ],
402
- "response" : "null",
403
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
404
- },
405
- "remove" : {
406
- "request" : [ {
407
- "name" : "key",
408
- "type" : "bytes"
409
- }, {
410
- "name" : "column_path",
411
- "type" : "ColumnPath"
412
- }, {
413
- "name" : "clock",
414
- "type" : "Clock"
415
- }, {
416
- "name" : "consistency_level",
417
- "type" : "ConsistencyLevel"
418
- } ],
419
- "response" : "null",
420
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
421
- },
422
- "batch_mutate" : {
423
- "request" : [ {
424
- "name" : "mutation_map",
425
- "type" : {
426
- "type" : "array",
427
- "items" : "MutationsMapEntry"
428
- }
429
- }, {
430
- "name" : "consistency_level",
431
- "type" : "ConsistencyLevel"
432
- } ],
433
- "response" : "null",
434
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
435
- },
436
- "system_add_keyspace" : {
437
- "request" : [ {
438
- "name" : "ks_def",
439
- "type" : "KsDef"
440
- } ],
441
- "response" : "null",
442
- "errors" : [ "InvalidRequestException" ]
443
- },
444
- "set_keyspace" : {
445
- "request" : [ {
446
- "name" : "keyspace",
447
- "type" : "string"
448
- } ],
449
- "response" : "null",
450
- "errors" : [ "InvalidRequestException" ]
451
- },
452
- "describe_keyspaces" : {
453
- "request" : [ ],
454
- "response" : {
455
- "type" : "array",
456
- "items" : "string"
457
- }
458
- },
459
- "describe_cluster_name" : {
460
- "request" : [ ],
461
- "response" : "string"
462
- },
463
- "describe_version" : {
464
- "request" : [ ],
465
- "response" : "string"
466
- }
467
- }
468
- }
@@ -1,62 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'avro'
4
- require 'wukong'
5
- require 'wukong/periodic_monitor'
6
- Settings.define :log_interval, :default => 10_000
7
-
8
- require 'digest/md5'
9
- Settings.define :ring_nodes
10
-
11
- MAX_HASH = 2**127
12
- RING_NODES = 72
13
- RING_WIDTH = MAX_HASH / RING_NODES
14
- OUT_DIR = '/mnt/tmp/partitioned_words'
15
-
16
- # for foo in pw0/part-000* ; do echo $foo ; time cat $foo | ~/ics/wukong/examples/cassandra_streaming/cassandra_random_partitioner.rb --map 2>/tmp/split-`basename $foo`.log & done
17
-
18
- module CassandraRandomPartitioner
19
- def partition_hash key
20
- uval = Digest::MD5.hexdigest(key).to_i(16)
21
- (uval > 2**127) ? (2**128 - uval) : uval
22
- end
23
-
24
- def partition key
25
- partition_hash(key) / RING_WIDTH
26
- end
27
-
28
- def files
29
- @files ||= Hash.new{|h,part| h[part] = File.open(OUT_DIR+"/chunk-#{"%03d" % part}", 'w') }
30
- end
31
-
32
- end
33
-
34
- module PeriodicLog
35
- def log
36
- @log ||= PeriodicMonitor.new
37
- end
38
- end
39
-
40
- class HashingStreamer < Wukong::Streamer::RecordStreamer
41
- include CassandraRandomPartitioner
42
- include PeriodicLog
43
-
44
- def process word, count, *_
45
- log.periodically( word, count )
46
- part = partition(word)
47
- # yield [part, word, count]
48
- files[part] << [word, count].join("\t") << "\n"
49
- end
50
- end
51
-
52
- class HashingReducer < Wukong::Streamer::RecordStreamer
53
- include CassandraRandomPartitioner
54
- include PeriodicLog
55
-
56
- def process part, word, count, *_
57
- log.periodically( word, count )
58
- yield [word, count]
59
- end
60
- end
61
-
62
- Wukong::Script.new(HashingStreamer, HashingReducer, :map_speculative => false).run
@@ -1,45 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- #
4
- # Cat a binary-encoded avro file into the bulk loader
5
- #
6
-
7
- input_file="$1" ; shift
8
- output_file="$1" ; shift
9
- map_script=${1-/bin/cat} ; shift
10
- reduce_script=${1-/usr/bin/uniq} ; shift
11
-
12
- dest_keyspace=${dest_keyspace-soc_net_tw}
13
- dest_col_family=${dest_col_family-Wordbag}
14
-
15
- hostname=`hostname`
16
-
17
- # Path to cassandra and hadoop dirs
18
- script_dir=$(readlink -f `dirname $0`)
19
- CASSANDRA_HOME=${CASSANDRA_HOME-/usr/local/share/cassandra}
20
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
21
- avro_file=${avro_file-$CASSANDRA_HOME/interface/avro/cassandra.avpr}
22
-
23
- ARCHIVES=`/bin/ls -1 $CASSANDRA_HOME/build/apache-cassandra*.jar`
24
- for jar in `/bin/ls -1 $CASSANDRA_HOME/build/lib/jars/*.jar $CASSANDRA_HOME/lib/*.jar`; do
25
- ARCHIVES=$ARCHIVES,$jar
26
- done
27
-
28
- ${HADOOP_HOME}/bin/hadoop \
29
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
30
- -D stream.map.output=cassandra_avro_output \
31
- -D stream.io.identifier.resolver.class=org.apache.cassandra.hadoop.streaming.AvroResolver \
32
- -D cassandra.output.keyspace="$dest_keyspace" \
33
- -D cassandra.output.columnfamily="$dest_col_family" \
34
- -D cassandra.partitioner.class=org.apache.cassandra.dht.RandomPartitioner \
35
- -D cassandra.thrift.address="10.104.9.68" \
36
- -D cassandra.thrift.port=9160 \
37
- -D mapred.reduce.tasks=0 \
38
- -libjars $ARCHIVES \
39
- -file $avro_file \
40
- -outputformat org.apache.cassandra.hadoop.ColumnFamilyOutputFormat \
41
- -mapper `which cat` \
42
- -input "$input_file" \
43
- -output "$output_file" \
44
- "$@"
45
-