wukong 1.5.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (87) hide show
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
4
+ hash: 15
5
+ prerelease:
5
6
  segments:
6
- - 1
7
- - 5
8
- - 4
9
- version: 1.5.4
7
+ - 2
8
+ - 0
9
+ - 0
10
+ version: 2.0.0
10
11
  platform: ruby
11
12
  authors:
12
13
  - Philip (flip) Kromer
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-11-02 00:00:00 -05:00
18
+ date: 2011-01-29 00:00:00 -06:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
@@ -25,6 +26,7 @@ dependencies:
25
26
  requirements:
26
27
  - - ">="
27
28
  - !ruby/object:Gem::Version
29
+ hash: 13
28
30
  segments:
29
31
  - 1
30
32
  - 2
@@ -40,6 +42,7 @@ dependencies:
40
42
  requirements:
41
43
  - - ">="
42
44
  - !ruby/object:Gem::Version
45
+ hash: 3
43
46
  segments:
44
47
  - 0
45
48
  version: "0"
@@ -53,6 +56,7 @@ dependencies:
53
56
  requirements:
54
57
  - - ">="
55
58
  - !ruby/object:Gem::Version
59
+ hash: 3
56
60
  segments:
57
61
  - 0
58
62
  version: "0"
@@ -66,6 +70,7 @@ dependencies:
66
70
  requirements:
67
71
  - - ">="
68
72
  - !ruby/object:Gem::Version
73
+ hash: 3
69
74
  segments:
70
75
  - 0
71
76
  version: "0"
@@ -79,6 +84,7 @@ dependencies:
79
84
  requirements:
80
85
  - - ">="
81
86
  - !ruby/object:Gem::Version
87
+ hash: 3
82
88
  segments:
83
89
  - 0
84
90
  version: "0"
@@ -92,6 +98,7 @@ dependencies:
92
98
  requirements:
93
99
  - - ">="
94
100
  - !ruby/object:Gem::Version
101
+ hash: 3
95
102
  segments:
96
103
  - 0
97
104
  version: "0"
@@ -144,7 +151,9 @@ files:
144
151
  - bin/hdp-sync
145
152
  - bin/hdp-wc
146
153
  - bin/md5sort
154
+ - bin/setcat
147
155
  - bin/tabchar
156
+ - bin/uniq-ord
148
157
  - bin/uniqc
149
158
  - bin/wu-date
150
159
  - bin/wu-datetime
@@ -216,38 +225,23 @@ files:
216
225
  - docpages/usage.textile
217
226
  - docpages/wutils.textile
218
227
  - examples/README.txt
219
- - examples/binning_percentile_estimator.rb
220
- - examples/cassandra_streaming/avromapper.rb
221
- - examples/cassandra_streaming/berlitz_for_cassandra.textile
222
- - examples/cassandra_streaming/cassandra.avpr
223
- - examples/cassandra_streaming/cassandra_random_partitioner.rb
224
- - examples/cassandra_streaming/catter.sh
225
- - examples/cassandra_streaming/client_interface_notes.textile
226
- - examples/cassandra_streaming/client_schema.avpr
227
- - examples/cassandra_streaming/client_schema.textile
228
- - examples/cassandra_streaming/foofile.avr
229
- - examples/cassandra_streaming/pymap.sh
230
- - examples/cassandra_streaming/pyreduce.sh
231
- - examples/cassandra_streaming/smutation.avpr
232
- - examples/cassandra_streaming/streamer.sh
233
- - examples/cassandra_streaming/struct_loader.rb
234
- - examples/cassandra_streaming/tuning.textile
235
228
  - examples/contrib/jeans/README.markdown
236
229
  - examples/contrib/jeans/data/normalized_sizes
237
230
  - examples/contrib/jeans/data/orders.tsv
238
231
  - examples/contrib/jeans/data/sizes
239
232
  - examples/contrib/jeans/normalize.rb
240
233
  - examples/contrib/jeans/sizes.rb
234
+ - examples/corpus/bucket_counter.rb
235
+ - examples/corpus/dbpedia_abstract_to_sentences.rb
236
+ - examples/corpus/sentence_coocurrence.rb
241
237
  - examples/corpus/words_to_bigrams.rb
242
- - examples/count_keys.rb
243
- - examples/count_keys_at_mapper.rb
244
- - examples/emr/README-elastic_map_reduce.textile
238
+ - examples/emr/README.textile
245
239
  - examples/emr/dot_wukong_dir/credentials.json
246
240
  - examples/emr/dot_wukong_dir/emr.yaml
247
241
  - examples/emr/dot_wukong_dir/emr_bootstrap.sh
248
242
  - examples/emr/elastic_mapreduce_example.rb
249
- - examples/keystore/cassandra_batch_test.rb
250
- - examples/keystore/conditional_outputter_example.rb
243
+ - examples/ignore_me/counting.rb
244
+ - examples/ignore_me/grouper.rb
251
245
  - examples/network_graph/adjacency_list.rb
252
246
  - examples/network_graph/breadth_first_search.rb
253
247
  - examples/network_graph/gen_2paths.rb
@@ -258,15 +252,16 @@ files:
258
252
  - examples/pagerank/pagerank.rb
259
253
  - examples/pagerank/pagerank_initialize.rb
260
254
  - examples/pagerank/run_pagerank.sh
261
- - examples/rank_and_bin.rb
262
255
  - examples/sample_records.rb
263
256
  - examples/server_logs/apache_log_parser.rb
264
257
  - examples/server_logs/breadcrumbs.rb
258
+ - examples/server_logs/logline.rb
265
259
  - examples/server_logs/user_agent.rb
266
260
  - examples/size.rb
267
261
  - examples/stats/avg_value_frequency.rb
262
+ - examples/stats/binning_percentile_estimator.rb
268
263
  - examples/stats/data/avg_value_frequency.tsv
269
- - examples/store/chunked_store_example.rb
264
+ - examples/stats/rank_and_bin.rb
270
265
  - examples/stupidly_simple_filter.rb
271
266
  - examples/word_count.rb
272
267
  - lib/wukong.rb
@@ -275,7 +270,6 @@ files:
275
270
  - lib/wukong/datatypes.rb
276
271
  - lib/wukong/datatypes/enum.rb
277
272
  - lib/wukong/datatypes/fake_types.rb
278
- - lib/wukong/dfs.rb
279
273
  - lib/wukong/encoding.rb
280
274
  - lib/wukong/encoding/asciize.rb
281
275
  - lib/wukong/extensions.rb
@@ -295,18 +289,8 @@ files:
295
289
  - lib/wukong/extensions/struct.rb
296
290
  - lib/wukong/extensions/symbol.rb
297
291
  - lib/wukong/filename_pattern.rb
298
- - lib/wukong/keystore/cassandra_conditional_outputter.rb
299
- - lib/wukong/keystore/redis_db.rb
300
- - lib/wukong/keystore/tyrant_db.rb
301
- - lib/wukong/keystore/tyrant_notes.textile
302
292
  - lib/wukong/logger.rb
303
- - lib/wukong/models/graph.rb
304
- - lib/wukong/monitor.rb
305
- - lib/wukong/monitor/chunked_store.rb
306
- - lib/wukong/monitor/periodic_logger.rb
307
- - lib/wukong/monitor/periodic_monitor.rb
308
293
  - lib/wukong/periodic_monitor.rb
309
- - lib/wukong/rdf.rb
310
294
  - lib/wukong/schema.rb
311
295
  - lib/wukong/script.rb
312
296
  - lib/wukong/script/avro_command.rb
@@ -334,23 +318,22 @@ files:
334
318
  - lib/wukong/streamer.rb
335
319
  - lib/wukong/streamer/accumulating_reducer.rb
336
320
  - lib/wukong/streamer/base.rb
337
- - lib/wukong/streamer/cassandra_streamer.rb
338
- - lib/wukong/streamer/count_keys.rb
339
- - lib/wukong/streamer/count_lines.rb
340
321
  - lib/wukong/streamer/counting_reducer.rb
341
- - lib/wukong/streamer/em_streamer.rb
342
322
  - lib/wukong/streamer/filter.rb
343
323
  - lib/wukong/streamer/line_streamer.rb
344
324
  - lib/wukong/streamer/list_reducer.rb
345
- - lib/wukong/streamer/preprocess_with_pipe_streamer.rb
346
325
  - lib/wukong/streamer/rank_and_bin_reducer.rb
347
326
  - lib/wukong/streamer/record_streamer.rb
327
+ - lib/wukong/streamer/reducer.rb
348
328
  - lib/wukong/streamer/set_reducer.rb
349
329
  - lib/wukong/streamer/struct_streamer.rb
350
330
  - lib/wukong/streamer/summing_reducer.rb
351
331
  - lib/wukong/streamer/uniq_by_last_reducer.rb
352
332
  - lib/wukong/typed_struct.rb
353
- - lib/wukong/wukong_class.rb
333
+ - old/cassandra_streaming/berlitz_for_cassandra.textile
334
+ - old/cassandra_streaming/client_interface_notes.textile
335
+ - old/cassandra_streaming/client_schema.textile
336
+ - old/cassandra_streaming/tuning.textile
354
337
  - spec/data/a_atsigns_b.tsv
355
338
  - spec/data/a_follows_b.tsv
356
339
  - spec/data/tweet.tsv
@@ -365,8 +348,8 @@ homepage: http://mrflip.github.com/wukong
365
348
  licenses: []
366
349
 
367
350
  post_install_message:
368
- rdoc_options:
369
- - --charset=UTF-8
351
+ rdoc_options: []
352
+
370
353
  require_paths:
371
354
  - lib
372
355
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -374,6 +357,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
374
357
  requirements:
375
358
  - - ">="
376
359
  - !ruby/object:Gem::Version
360
+ hash: 3
377
361
  segments:
378
362
  - 0
379
363
  version: "0"
@@ -382,32 +366,27 @@ required_rubygems_version: !ruby/object:Gem::Requirement
382
366
  requirements:
383
367
  - - ">="
384
368
  - !ruby/object:Gem::Version
369
+ hash: 3
385
370
  segments:
386
371
  - 0
387
372
  version: "0"
388
373
  requirements: []
389
374
 
390
375
  rubyforge_project:
391
- rubygems_version: 1.3.7
376
+ rubygems_version: 1.4.2
392
377
  signing_key:
393
378
  specification_version: 3
394
379
  summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
395
380
  test_files:
396
- - spec/spec_helper.rb
397
- - spec/wukong/encoding_spec.rb
398
- - spec/wukong/script_spec.rb
399
- - examples/binning_percentile_estimator.rb
400
- - examples/cassandra_streaming/avromapper.rb
401
- - examples/cassandra_streaming/cassandra_random_partitioner.rb
402
- - examples/cassandra_streaming/struct_loader.rb
403
381
  - examples/contrib/jeans/normalize.rb
404
382
  - examples/contrib/jeans/sizes.rb
383
+ - examples/corpus/bucket_counter.rb
384
+ - examples/corpus/dbpedia_abstract_to_sentences.rb
385
+ - examples/corpus/sentence_coocurrence.rb
405
386
  - examples/corpus/words_to_bigrams.rb
406
- - examples/count_keys.rb
407
- - examples/count_keys_at_mapper.rb
408
387
  - examples/emr/elastic_mapreduce_example.rb
409
- - examples/keystore/cassandra_batch_test.rb
410
- - examples/keystore/conditional_outputter_example.rb
388
+ - examples/ignore_me/counting.rb
389
+ - examples/ignore_me/grouper.rb
411
390
  - examples/network_graph/adjacency_list.rb
412
391
  - examples/network_graph/breadth_first_search.rb
413
392
  - examples/network_graph/gen_2paths.rb
@@ -415,13 +394,17 @@ test_files:
415
394
  - examples/network_graph/gen_symmetric_links.rb
416
395
  - examples/pagerank/pagerank.rb
417
396
  - examples/pagerank/pagerank_initialize.rb
418
- - examples/rank_and_bin.rb
419
397
  - examples/sample_records.rb
420
398
  - examples/server_logs/apache_log_parser.rb
421
399
  - examples/server_logs/breadcrumbs.rb
400
+ - examples/server_logs/logline.rb
422
401
  - examples/server_logs/user_agent.rb
423
402
  - examples/size.rb
424
403
  - examples/stats/avg_value_frequency.rb
425
- - examples/store/chunked_store_example.rb
404
+ - examples/stats/binning_percentile_estimator.rb
405
+ - examples/stats/rank_and_bin.rb
426
406
  - examples/stupidly_simple_filter.rb
427
407
  - examples/word_count.rb
408
+ - spec/spec_helper.rb
409
+ - spec/wukong/encoding_spec.rb
410
+ - spec/wukong/script_spec.rb
@@ -1,85 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # To install avro gem
4
- # cd avro/lang/ruby ; gem package ; sudo gem install pkg/avro-1.4.0.pre1.gem
5
-
6
- require 'rubygems'
7
- require 'avro'
8
- require 'wukong'
9
- require 'wukong/periodic_monitor'
10
-
11
- Settings.define :cassandra_avro_schema, :default => ('/usr/local/share/cassandra/interface/avro/cassandra.avpr')
12
- Settings.define :cassandra_thrift_uri, :default => `hostname`.chomp.strip+':9160'
13
- Settings.define :log_interval, :default => 10_000
14
-
15
- class AvroStreamer < Wukong::Streamer::RecordStreamer
16
- def initialize *args
17
- super(*args)
18
- @writer = SmutWriter.new
19
- @log = PeriodicMonitor.new
20
- end
21
-
22
- def process word, count, *_
23
- @writer.write_directly(word, 'count', count)
24
- @log.periodically( word, count )
25
- end
26
- end
27
-
28
- class SmutWriter
29
- # Reads in the protocol schema
30
- # creates the necessary encoder and writer.
31
- def initialize
32
- schema_file = Settings.cassandra_avro_schema
33
- @proto = Avro::Protocol.parse(File.read(schema_file))
34
- @schema = @proto.types.detect{|schema| schema.name == 'StreamingMutation'}
35
- @enc = Avro::IO::BinaryEncoder.new($stdout)
36
- @writer = Avro::IO::DatumWriter.new(@schema)
37
- end
38
-
39
- # Directly write the simplified StreamingMutation schema; uses patch from @stuhood
40
- def write_directly key, col_name, value
41
- @enc.write_bytes(key)
42
- @enc.write_bytes(col_name)
43
- @enc.write_bytes(value)
44
- @enc.write_long(Time.epoch_microseconds)
45
- @enc.write_int(0)
46
- end
47
-
48
- # Write using the datumwriter
49
- def write key, col_name, value
50
- @writer.write(smutation(key, col_name, value), @enc)
51
- end
52
-
53
- # Simplified StreamingMutation schema uses patch from @stuhood
54
- def smutation key, name, value
55
- {
56
- 'key' => key,
57
- 'name' => name.to_s,
58
- 'value' => value.to_s,
59
- 'timestamp' => Time.epoch_microseconds,
60
- 'ttl' => 0
61
- }
62
- end
63
-
64
- # The StreamingMutation schema defined in trunk.
65
- # Becomes monstrously inefficient due to implementation of unions.
66
- def smutation_from_trunk key, name, value
67
- {
68
- 'key' => key,
69
- 'mutation' => { 'column_or_supercolumn' => { 'column' => {
70
- 'name' => name.to_s,
71
- 'value' => value.to_s,
72
- 'clock' => { 'timestamp' => Time.epoch_microseconds },
73
- 'ttl' => 0
74
- }}}
75
- }
76
- end
77
- end
78
-
79
- Time.class_eval do
80
- def self.epoch_microseconds
81
- (Time.now.utc.to_i * 1_000_000)
82
- end
83
- end
84
-
85
- Wukong::Script.new(AvroStreamer, nil, :map_speculative => false).run