wukong 1.5.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
@@ -1,468 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"protocol" : "Cassandra",
|
3
|
-
"namespace" : "org.apache.cassandra.avro",
|
4
|
-
"types" : [ {
|
5
|
-
"type" : "enum",
|
6
|
-
"name" : "AccessLevel",
|
7
|
-
"symbols" : [ "NONE", "READONLY", "READWRITE", "FALL" ]
|
8
|
-
}, {
|
9
|
-
"type" : "record",
|
10
|
-
"name" : "ColumnPath",
|
11
|
-
"fields" : [ {
|
12
|
-
"name" : "column_family",
|
13
|
-
"type" : "string"
|
14
|
-
}, {
|
15
|
-
"name" : "super_column",
|
16
|
-
"type" : [ "bytes", "null" ]
|
17
|
-
}, {
|
18
|
-
"name" : "column",
|
19
|
-
"type" : [ "bytes", "null" ]
|
20
|
-
} ]
|
21
|
-
}, {
|
22
|
-
"type" : "record",
|
23
|
-
"name" : "ColumnParent",
|
24
|
-
"fields" : [ {
|
25
|
-
"name" : "column_family",
|
26
|
-
"type" : "string"
|
27
|
-
}, {
|
28
|
-
"name" : "super_column",
|
29
|
-
"type" : [ "bytes", "null" ]
|
30
|
-
} ]
|
31
|
-
}, {
|
32
|
-
"type" : "record",
|
33
|
-
"name" : "Clock",
|
34
|
-
"fields" : [ {
|
35
|
-
"name" : "timestamp",
|
36
|
-
"type" : "long"
|
37
|
-
} ]
|
38
|
-
}, {
|
39
|
-
"type" : "record",
|
40
|
-
"name" : "Column",
|
41
|
-
"fields" : [ {
|
42
|
-
"name" : "name",
|
43
|
-
"type" : "bytes"
|
44
|
-
}, {
|
45
|
-
"name" : "value",
|
46
|
-
"type" : "bytes"
|
47
|
-
}, {
|
48
|
-
"name" : "clock",
|
49
|
-
"type" : "Clock"
|
50
|
-
}, {
|
51
|
-
"name" : "ttl",
|
52
|
-
"type" : [ "int", "null" ]
|
53
|
-
} ]
|
54
|
-
}, {
|
55
|
-
"type" : "record",
|
56
|
-
"name" : "SuperColumn",
|
57
|
-
"fields" : [ {
|
58
|
-
"name" : "name",
|
59
|
-
"type" : "bytes"
|
60
|
-
}, {
|
61
|
-
"name" : "columns",
|
62
|
-
"type" : {
|
63
|
-
"type" : "array",
|
64
|
-
"items" : "Column"
|
65
|
-
}
|
66
|
-
} ]
|
67
|
-
}, {
|
68
|
-
"type" : "record",
|
69
|
-
"name" : "ColumnOrSuperColumn",
|
70
|
-
"fields" : [ {
|
71
|
-
"name" : "column",
|
72
|
-
"type" : [ "Column", "null" ]
|
73
|
-
}, {
|
74
|
-
"name" : "super_column",
|
75
|
-
"type" : [ "SuperColumn", "null" ]
|
76
|
-
} ]
|
77
|
-
}, {
|
78
|
-
"type" : "record",
|
79
|
-
"name" : "SliceRange",
|
80
|
-
"fields" : [ {
|
81
|
-
"name" : "start",
|
82
|
-
"type" : "bytes"
|
83
|
-
}, {
|
84
|
-
"name" : "finish",
|
85
|
-
"type" : "bytes"
|
86
|
-
}, {
|
87
|
-
"name" : "reversed",
|
88
|
-
"type" : "boolean"
|
89
|
-
}, {
|
90
|
-
"name" : "count",
|
91
|
-
"type" : "int"
|
92
|
-
}, {
|
93
|
-
"name" : "bitmasks",
|
94
|
-
"type" : [ {
|
95
|
-
"type" : "array",
|
96
|
-
"items" : "bytes"
|
97
|
-
}, "null" ]
|
98
|
-
} ]
|
99
|
-
}, {
|
100
|
-
"type" : "record",
|
101
|
-
"name" : "SlicePredicate",
|
102
|
-
"fields" : [ {
|
103
|
-
"name" : "column_names",
|
104
|
-
"type" : [ {
|
105
|
-
"type" : "array",
|
106
|
-
"items" : "bytes"
|
107
|
-
}, "null" ]
|
108
|
-
}, {
|
109
|
-
"name" : "slice_range",
|
110
|
-
"type" : [ "SliceRange", "null" ]
|
111
|
-
} ]
|
112
|
-
}, {
|
113
|
-
"type" : "record",
|
114
|
-
"name" : "Deletion",
|
115
|
-
"fields" : [ {
|
116
|
-
"name" : "clock",
|
117
|
-
"type" : "Clock"
|
118
|
-
}, {
|
119
|
-
"name" : "super_column",
|
120
|
-
"type" : [ "bytes", "null" ]
|
121
|
-
}, {
|
122
|
-
"name" : "predicate",
|
123
|
-
"type" : [ "SlicePredicate", "null" ]
|
124
|
-
} ]
|
125
|
-
}, {
|
126
|
-
"type" : "record",
|
127
|
-
"name" : "Mutation",
|
128
|
-
"fields" : [ {
|
129
|
-
"name" : "column_or_supercolumn",
|
130
|
-
"type" : [ "ColumnOrSuperColumn", "null" ]
|
131
|
-
}, {
|
132
|
-
"name" : "deletion",
|
133
|
-
"type" : [ "Deletion", "null" ]
|
134
|
-
} ]
|
135
|
-
}, {
|
136
|
-
"type" : "enum",
|
137
|
-
"name" : "IndexType",
|
138
|
-
"symbols" : [ "KEYS" ]
|
139
|
-
}, {
|
140
|
-
"type" : "record",
|
141
|
-
"name" : "ColumnDef",
|
142
|
-
"fields" : [ {
|
143
|
-
"name" : "name",
|
144
|
-
"type" : "bytes"
|
145
|
-
}, {
|
146
|
-
"name" : "validation_class",
|
147
|
-
"type" : "string"
|
148
|
-
}, {
|
149
|
-
"name" : "index_type",
|
150
|
-
"type" : [ "IndexType", "null" ]
|
151
|
-
}, {
|
152
|
-
"name" : "index_name",
|
153
|
-
"type" : [ "string", "null" ]
|
154
|
-
} ]
|
155
|
-
}, {
|
156
|
-
"type" : "record",
|
157
|
-
"name" : "CfDef",
|
158
|
-
"fields" : [ {
|
159
|
-
"name" : "keyspace",
|
160
|
-
"type" : "string"
|
161
|
-
}, {
|
162
|
-
"name" : "name",
|
163
|
-
"type" : "string"
|
164
|
-
}, {
|
165
|
-
"name" : "column_type",
|
166
|
-
"type" : [ "string", "null" ]
|
167
|
-
}, {
|
168
|
-
"name" : "clock_type",
|
169
|
-
"type" : [ "string", "null" ]
|
170
|
-
}, {
|
171
|
-
"name" : "comparator_type",
|
172
|
-
"type" : [ "string", "null" ]
|
173
|
-
}, {
|
174
|
-
"name" : "subcomparator_type",
|
175
|
-
"type" : [ "string", "null" ]
|
176
|
-
}, {
|
177
|
-
"name" : "reconciler",
|
178
|
-
"type" : [ "string", "null" ]
|
179
|
-
}, {
|
180
|
-
"name" : "comment",
|
181
|
-
"type" : [ "string", "null" ]
|
182
|
-
}, {
|
183
|
-
"name" : "row_cache_size",
|
184
|
-
"type" : [ "double", "null" ]
|
185
|
-
}, {
|
186
|
-
"name" : "preload_row_cache",
|
187
|
-
"type" : [ "boolean", "null" ]
|
188
|
-
}, {
|
189
|
-
"name" : "key_cache_size",
|
190
|
-
"type" : [ "double", "null" ]
|
191
|
-
}, {
|
192
|
-
"name" : "read_repair_chance",
|
193
|
-
"type" : [ "double", "null" ]
|
194
|
-
}, {
|
195
|
-
"name" : "gc_grace_seconds",
|
196
|
-
"type" : [ "int", "null" ]
|
197
|
-
}, {
|
198
|
-
"name" : "column_metadata",
|
199
|
-
"type" : [ {
|
200
|
-
"type" : "array",
|
201
|
-
"items" : "ColumnDef"
|
202
|
-
}, "null" ]
|
203
|
-
}, {
|
204
|
-
"name" : "id",
|
205
|
-
"type" : [ "int", "null" ]
|
206
|
-
} ]
|
207
|
-
}, {
|
208
|
-
"type" : "record",
|
209
|
-
"name" : "KsDef",
|
210
|
-
"fields" : [ {
|
211
|
-
"name" : "name",
|
212
|
-
"type" : "string"
|
213
|
-
}, {
|
214
|
-
"name" : "strategy_class",
|
215
|
-
"type" : "string"
|
216
|
-
}, {
|
217
|
-
"name" : "strategy_options",
|
218
|
-
"type" : [ {
|
219
|
-
"type" : "map",
|
220
|
-
"values" : "string"
|
221
|
-
}, "null" ]
|
222
|
-
}, {
|
223
|
-
"name" : "replication_factor",
|
224
|
-
"type" : "int"
|
225
|
-
}, {
|
226
|
-
"name" : "cf_defs",
|
227
|
-
"type" : {
|
228
|
-
"type" : "array",
|
229
|
-
"items" : "CfDef"
|
230
|
-
}
|
231
|
-
} ]
|
232
|
-
}, {
|
233
|
-
"type" : "record",
|
234
|
-
"name" : "StreamingMutation",
|
235
|
-
"fields" : [ {
|
236
|
-
"name" : "key",
|
237
|
-
"type" : "bytes"
|
238
|
-
}, {
|
239
|
-
"name" : "name",
|
240
|
-
"type" : "bytes"
|
241
|
-
}, {
|
242
|
-
"name" : "value",
|
243
|
-
"type" : "bytes"
|
244
|
-
}, {
|
245
|
-
"name" : "timestamp",
|
246
|
-
"type" : "long"
|
247
|
-
}, {
|
248
|
-
"name" : "ttl",
|
249
|
-
"type" : "int"
|
250
|
-
} ]
|
251
|
-
}, {
|
252
|
-
"type" : "record",
|
253
|
-
"name" : "MutationsMapEntry",
|
254
|
-
"fields" : [ {
|
255
|
-
"name" : "key",
|
256
|
-
"type" : "bytes"
|
257
|
-
}, {
|
258
|
-
"name" : "mutations",
|
259
|
-
"type" : {
|
260
|
-
"type" : "map",
|
261
|
-
"values" : {
|
262
|
-
"type" : "array",
|
263
|
-
"items" : "Mutation"
|
264
|
-
}
|
265
|
-
}
|
266
|
-
} ]
|
267
|
-
}, {
|
268
|
-
"type" : "record",
|
269
|
-
"name" : "CoscsMapEntry",
|
270
|
-
"fields" : [ {
|
271
|
-
"name" : "key",
|
272
|
-
"type" : "bytes"
|
273
|
-
}, {
|
274
|
-
"name" : "columns",
|
275
|
-
"type" : {
|
276
|
-
"type" : "array",
|
277
|
-
"items" : "ColumnOrSuperColumn"
|
278
|
-
}
|
279
|
-
} ]
|
280
|
-
}, {
|
281
|
-
"type" : "enum",
|
282
|
-
"name" : "ConsistencyLevel",
|
283
|
-
"symbols" : [ "ZERO", "ONE", "QUORUM", "DCQUORUM", "DCQUORUMSYNC", "ALL" ]
|
284
|
-
}, {
|
285
|
-
"type" : "error",
|
286
|
-
"name" : "InvalidRequestException",
|
287
|
-
"fields" : [ {
|
288
|
-
"name" : "why",
|
289
|
-
"type" : [ "string", "null" ]
|
290
|
-
} ]
|
291
|
-
}, {
|
292
|
-
"type" : "error",
|
293
|
-
"name" : "NotFoundException",
|
294
|
-
"fields" : [ {
|
295
|
-
"name" : "why",
|
296
|
-
"type" : [ "string", "null" ]
|
297
|
-
} ]
|
298
|
-
}, {
|
299
|
-
"type" : "error",
|
300
|
-
"name" : "UnavailableException",
|
301
|
-
"fields" : [ {
|
302
|
-
"name" : "why",
|
303
|
-
"type" : [ "string", "null" ]
|
304
|
-
} ]
|
305
|
-
}, {
|
306
|
-
"type" : "error",
|
307
|
-
"name" : "TimedOutException",
|
308
|
-
"fields" : [ {
|
309
|
-
"name" : "why",
|
310
|
-
"type" : [ "string", "null" ]
|
311
|
-
} ]
|
312
|
-
} ],
|
313
|
-
"messages" : {
|
314
|
-
"get" : {
|
315
|
-
"request" : [ {
|
316
|
-
"name" : "key",
|
317
|
-
"type" : "bytes"
|
318
|
-
}, {
|
319
|
-
"name" : "column_path",
|
320
|
-
"type" : "ColumnPath"
|
321
|
-
}, {
|
322
|
-
"name" : "consistency_level",
|
323
|
-
"type" : "ConsistencyLevel"
|
324
|
-
} ],
|
325
|
-
"response" : "ColumnOrSuperColumn",
|
326
|
-
"errors" : [ "InvalidRequestException", "NotFoundException", "UnavailableException", "TimedOutException" ]
|
327
|
-
},
|
328
|
-
"get_slice" : {
|
329
|
-
"request" : [ {
|
330
|
-
"name" : "key",
|
331
|
-
"type" : "bytes"
|
332
|
-
}, {
|
333
|
-
"name" : "column_parent",
|
334
|
-
"type" : "ColumnParent"
|
335
|
-
}, {
|
336
|
-
"name" : "predicate",
|
337
|
-
"type" : "SlicePredicate"
|
338
|
-
}, {
|
339
|
-
"name" : "consistency_level",
|
340
|
-
"type" : "ConsistencyLevel"
|
341
|
-
} ],
|
342
|
-
"response" : {
|
343
|
-
"type" : "array",
|
344
|
-
"items" : "ColumnOrSuperColumn"
|
345
|
-
},
|
346
|
-
"errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
|
347
|
-
},
|
348
|
-
"multiget_slice" : {
|
349
|
-
"request" : [ {
|
350
|
-
"name" : "keys",
|
351
|
-
"type" : {
|
352
|
-
"type" : "array",
|
353
|
-
"items" : "bytes"
|
354
|
-
}
|
355
|
-
}, {
|
356
|
-
"name" : "column_parent",
|
357
|
-
"type" : "ColumnParent"
|
358
|
-
}, {
|
359
|
-
"name" : "predicate",
|
360
|
-
"type" : "SlicePredicate"
|
361
|
-
}, {
|
362
|
-
"name" : "consistency_level",
|
363
|
-
"type" : "ConsistencyLevel"
|
364
|
-
} ],
|
365
|
-
"response" : {
|
366
|
-
"type" : "array",
|
367
|
-
"items" : "CoscsMapEntry"
|
368
|
-
},
|
369
|
-
"errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
|
370
|
-
},
|
371
|
-
"get_count" : {
|
372
|
-
"request" : [ {
|
373
|
-
"name" : "key",
|
374
|
-
"type" : "bytes"
|
375
|
-
}, {
|
376
|
-
"name" : "column_parent",
|
377
|
-
"type" : "ColumnParent"
|
378
|
-
}, {
|
379
|
-
"name" : "predicate",
|
380
|
-
"type" : "SlicePredicate"
|
381
|
-
}, {
|
382
|
-
"name" : "consistency_level",
|
383
|
-
"type" : "ConsistencyLevel"
|
384
|
-
} ],
|
385
|
-
"response" : "int",
|
386
|
-
"errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
|
387
|
-
},
|
388
|
-
"insert" : {
|
389
|
-
"request" : [ {
|
390
|
-
"name" : "key",
|
391
|
-
"type" : "bytes"
|
392
|
-
}, {
|
393
|
-
"name" : "column_parent",
|
394
|
-
"type" : "ColumnParent"
|
395
|
-
}, {
|
396
|
-
"name" : "column",
|
397
|
-
"type" : "Column"
|
398
|
-
}, {
|
399
|
-
"name" : "consistency_level",
|
400
|
-
"type" : "ConsistencyLevel"
|
401
|
-
} ],
|
402
|
-
"response" : "null",
|
403
|
-
"errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
|
404
|
-
},
|
405
|
-
"remove" : {
|
406
|
-
"request" : [ {
|
407
|
-
"name" : "key",
|
408
|
-
"type" : "bytes"
|
409
|
-
}, {
|
410
|
-
"name" : "column_path",
|
411
|
-
"type" : "ColumnPath"
|
412
|
-
}, {
|
413
|
-
"name" : "clock",
|
414
|
-
"type" : "Clock"
|
415
|
-
}, {
|
416
|
-
"name" : "consistency_level",
|
417
|
-
"type" : "ConsistencyLevel"
|
418
|
-
} ],
|
419
|
-
"response" : "null",
|
420
|
-
"errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
|
421
|
-
},
|
422
|
-
"batch_mutate" : {
|
423
|
-
"request" : [ {
|
424
|
-
"name" : "mutation_map",
|
425
|
-
"type" : {
|
426
|
-
"type" : "array",
|
427
|
-
"items" : "MutationsMapEntry"
|
428
|
-
}
|
429
|
-
}, {
|
430
|
-
"name" : "consistency_level",
|
431
|
-
"type" : "ConsistencyLevel"
|
432
|
-
} ],
|
433
|
-
"response" : "null",
|
434
|
-
"errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
|
435
|
-
},
|
436
|
-
"system_add_keyspace" : {
|
437
|
-
"request" : [ {
|
438
|
-
"name" : "ks_def",
|
439
|
-
"type" : "KsDef"
|
440
|
-
} ],
|
441
|
-
"response" : "null",
|
442
|
-
"errors" : [ "InvalidRequestException" ]
|
443
|
-
},
|
444
|
-
"set_keyspace" : {
|
445
|
-
"request" : [ {
|
446
|
-
"name" : "keyspace",
|
447
|
-
"type" : "string"
|
448
|
-
} ],
|
449
|
-
"response" : "null",
|
450
|
-
"errors" : [ "InvalidRequestException" ]
|
451
|
-
},
|
452
|
-
"describe_keyspaces" : {
|
453
|
-
"request" : [ ],
|
454
|
-
"response" : {
|
455
|
-
"type" : "array",
|
456
|
-
"items" : "string"
|
457
|
-
}
|
458
|
-
},
|
459
|
-
"describe_cluster_name" : {
|
460
|
-
"request" : [ ],
|
461
|
-
"response" : "string"
|
462
|
-
},
|
463
|
-
"describe_version" : {
|
464
|
-
"request" : [ ],
|
465
|
-
"response" : "string"
|
466
|
-
}
|
467
|
-
}
|
468
|
-
}
|
@@ -1,62 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'avro'
|
4
|
-
require 'wukong'
|
5
|
-
require 'wukong/periodic_monitor'
|
6
|
-
Settings.define :log_interval, :default => 10_000
|
7
|
-
|
8
|
-
require 'digest/md5'
|
9
|
-
Settings.define :ring_nodes
|
10
|
-
|
11
|
-
MAX_HASH = 2**127
|
12
|
-
RING_NODES = 72
|
13
|
-
RING_WIDTH = MAX_HASH / RING_NODES
|
14
|
-
OUT_DIR = '/mnt/tmp/partitioned_words'
|
15
|
-
|
16
|
-
# for foo in pw0/part-000* ; do echo $foo ; time cat $foo | ~/ics/wukong/examples/cassandra_streaming/cassandra_random_partitioner.rb --map 2>/tmp/split-`basename $foo`.log & done
|
17
|
-
|
18
|
-
module CassandraRandomPartitioner
|
19
|
-
def partition_hash key
|
20
|
-
uval = Digest::MD5.hexdigest(key).to_i(16)
|
21
|
-
(uval > 2**127) ? (2**128 - uval) : uval
|
22
|
-
end
|
23
|
-
|
24
|
-
def partition key
|
25
|
-
partition_hash(key) / RING_WIDTH
|
26
|
-
end
|
27
|
-
|
28
|
-
def files
|
29
|
-
@files ||= Hash.new{|h,part| h[part] = File.open(OUT_DIR+"/chunk-#{"%03d" % part}", 'w') }
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
module PeriodicLog
|
35
|
-
def log
|
36
|
-
@log ||= PeriodicMonitor.new
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
class HashingStreamer < Wukong::Streamer::RecordStreamer
|
41
|
-
include CassandraRandomPartitioner
|
42
|
-
include PeriodicLog
|
43
|
-
|
44
|
-
def process word, count, *_
|
45
|
-
log.periodically( word, count )
|
46
|
-
part = partition(word)
|
47
|
-
# yield [part, word, count]
|
48
|
-
files[part] << [word, count].join("\t") << "\n"
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
class HashingReducer < Wukong::Streamer::RecordStreamer
|
53
|
-
include CassandraRandomPartitioner
|
54
|
-
include PeriodicLog
|
55
|
-
|
56
|
-
def process part, word, count, *_
|
57
|
-
log.periodically( word, count )
|
58
|
-
yield [word, count]
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
Wukong::Script.new(HashingStreamer, HashingReducer, :map_speculative => false).run
|
@@ -1,45 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
|
3
|
-
#
|
4
|
-
# Cat a binary-encoded avro file into the bulk loader
|
5
|
-
#
|
6
|
-
|
7
|
-
input_file="$1" ; shift
|
8
|
-
output_file="$1" ; shift
|
9
|
-
map_script=${1-/bin/cat} ; shift
|
10
|
-
reduce_script=${1-/usr/bin/uniq} ; shift
|
11
|
-
|
12
|
-
dest_keyspace=${dest_keyspace-soc_net_tw}
|
13
|
-
dest_col_family=${dest_col_family-Wordbag}
|
14
|
-
|
15
|
-
hostname=`hostname`
|
16
|
-
|
17
|
-
# Path to cassandra and hadoop dirs
|
18
|
-
script_dir=$(readlink -f `dirname $0`)
|
19
|
-
CASSANDRA_HOME=${CASSANDRA_HOME-/usr/local/share/cassandra}
|
20
|
-
HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
|
21
|
-
avro_file=${avro_file-$CASSANDRA_HOME/interface/avro/cassandra.avpr}
|
22
|
-
|
23
|
-
ARCHIVES=`/bin/ls -1 $CASSANDRA_HOME/build/apache-cassandra*.jar`
|
24
|
-
for jar in `/bin/ls -1 $CASSANDRA_HOME/build/lib/jars/*.jar $CASSANDRA_HOME/lib/*.jar`; do
|
25
|
-
ARCHIVES=$ARCHIVES,$jar
|
26
|
-
done
|
27
|
-
|
28
|
-
${HADOOP_HOME}/bin/hadoop \
|
29
|
-
jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
|
30
|
-
-D stream.map.output=cassandra_avro_output \
|
31
|
-
-D stream.io.identifier.resolver.class=org.apache.cassandra.hadoop.streaming.AvroResolver \
|
32
|
-
-D cassandra.output.keyspace="$dest_keyspace" \
|
33
|
-
-D cassandra.output.columnfamily="$dest_col_family" \
|
34
|
-
-D cassandra.partitioner.class=org.apache.cassandra.dht.RandomPartitioner \
|
35
|
-
-D cassandra.thrift.address="10.104.9.68" \
|
36
|
-
-D cassandra.thrift.port=9160 \
|
37
|
-
-D mapred.reduce.tasks=0 \
|
38
|
-
-libjars $ARCHIVES \
|
39
|
-
-file $avro_file \
|
40
|
-
-outputformat org.apache.cassandra.hadoop.ColumnFamilyOutputFormat \
|
41
|
-
-mapper `which cat` \
|
42
|
-
-input "$input_file" \
|
43
|
-
-output "$output_file" \
|
44
|
-
"$@"
|
45
|
-
|