wukong 1.5.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
4
+ hash: 15
5
+ prerelease:
5
6
  segments:
6
- - 1
7
- - 5
8
- - 4
9
- version: 1.5.4
7
+ - 2
8
+ - 0
9
+ - 0
10
+ version: 2.0.0
10
11
  platform: ruby
11
12
  authors:
12
13
  - Philip (flip) Kromer
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-11-02 00:00:00 -05:00
18
+ date: 2011-01-29 00:00:00 -06:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
@@ -25,6 +26,7 @@ dependencies:
25
26
  requirements:
26
27
  - - ">="
27
28
  - !ruby/object:Gem::Version
29
+ hash: 13
28
30
  segments:
29
31
  - 1
30
32
  - 2
@@ -40,6 +42,7 @@ dependencies:
40
42
  requirements:
41
43
  - - ">="
42
44
  - !ruby/object:Gem::Version
45
+ hash: 3
43
46
  segments:
44
47
  - 0
45
48
  version: "0"
@@ -53,6 +56,7 @@ dependencies:
53
56
  requirements:
54
57
  - - ">="
55
58
  - !ruby/object:Gem::Version
59
+ hash: 3
56
60
  segments:
57
61
  - 0
58
62
  version: "0"
@@ -66,6 +70,7 @@ dependencies:
66
70
  requirements:
67
71
  - - ">="
68
72
  - !ruby/object:Gem::Version
73
+ hash: 3
69
74
  segments:
70
75
  - 0
71
76
  version: "0"
@@ -79,6 +84,7 @@ dependencies:
79
84
  requirements:
80
85
  - - ">="
81
86
  - !ruby/object:Gem::Version
87
+ hash: 3
82
88
  segments:
83
89
  - 0
84
90
  version: "0"
@@ -92,6 +98,7 @@ dependencies:
92
98
  requirements:
93
99
  - - ">="
94
100
  - !ruby/object:Gem::Version
101
+ hash: 3
95
102
  segments:
96
103
  - 0
97
104
  version: "0"
@@ -144,7 +151,9 @@ files:
144
151
  - bin/hdp-sync
145
152
  - bin/hdp-wc
146
153
  - bin/md5sort
154
+ - bin/setcat
147
155
  - bin/tabchar
156
+ - bin/uniq-ord
148
157
  - bin/uniqc
149
158
  - bin/wu-date
150
159
  - bin/wu-datetime
@@ -216,38 +225,23 @@ files:
216
225
  - docpages/usage.textile
217
226
  - docpages/wutils.textile
218
227
  - examples/README.txt
219
- - examples/binning_percentile_estimator.rb
220
- - examples/cassandra_streaming/avromapper.rb
221
- - examples/cassandra_streaming/berlitz_for_cassandra.textile
222
- - examples/cassandra_streaming/cassandra.avpr
223
- - examples/cassandra_streaming/cassandra_random_partitioner.rb
224
- - examples/cassandra_streaming/catter.sh
225
- - examples/cassandra_streaming/client_interface_notes.textile
226
- - examples/cassandra_streaming/client_schema.avpr
227
- - examples/cassandra_streaming/client_schema.textile
228
- - examples/cassandra_streaming/foofile.avr
229
- - examples/cassandra_streaming/pymap.sh
230
- - examples/cassandra_streaming/pyreduce.sh
231
- - examples/cassandra_streaming/smutation.avpr
232
- - examples/cassandra_streaming/streamer.sh
233
- - examples/cassandra_streaming/struct_loader.rb
234
- - examples/cassandra_streaming/tuning.textile
235
228
  - examples/contrib/jeans/README.markdown
236
229
  - examples/contrib/jeans/data/normalized_sizes
237
230
  - examples/contrib/jeans/data/orders.tsv
238
231
  - examples/contrib/jeans/data/sizes
239
232
  - examples/contrib/jeans/normalize.rb
240
233
  - examples/contrib/jeans/sizes.rb
234
+ - examples/corpus/bucket_counter.rb
235
+ - examples/corpus/dbpedia_abstract_to_sentences.rb
236
+ - examples/corpus/sentence_coocurrence.rb
241
237
  - examples/corpus/words_to_bigrams.rb
242
- - examples/count_keys.rb
243
- - examples/count_keys_at_mapper.rb
244
- - examples/emr/README-elastic_map_reduce.textile
238
+ - examples/emr/README.textile
245
239
  - examples/emr/dot_wukong_dir/credentials.json
246
240
  - examples/emr/dot_wukong_dir/emr.yaml
247
241
  - examples/emr/dot_wukong_dir/emr_bootstrap.sh
248
242
  - examples/emr/elastic_mapreduce_example.rb
249
- - examples/keystore/cassandra_batch_test.rb
250
- - examples/keystore/conditional_outputter_example.rb
243
+ - examples/ignore_me/counting.rb
244
+ - examples/ignore_me/grouper.rb
251
245
  - examples/network_graph/adjacency_list.rb
252
246
  - examples/network_graph/breadth_first_search.rb
253
247
  - examples/network_graph/gen_2paths.rb
@@ -258,15 +252,16 @@ files:
258
252
  - examples/pagerank/pagerank.rb
259
253
  - examples/pagerank/pagerank_initialize.rb
260
254
  - examples/pagerank/run_pagerank.sh
261
- - examples/rank_and_bin.rb
262
255
  - examples/sample_records.rb
263
256
  - examples/server_logs/apache_log_parser.rb
264
257
  - examples/server_logs/breadcrumbs.rb
258
+ - examples/server_logs/logline.rb
265
259
  - examples/server_logs/user_agent.rb
266
260
  - examples/size.rb
267
261
  - examples/stats/avg_value_frequency.rb
262
+ - examples/stats/binning_percentile_estimator.rb
268
263
  - examples/stats/data/avg_value_frequency.tsv
269
- - examples/store/chunked_store_example.rb
264
+ - examples/stats/rank_and_bin.rb
270
265
  - examples/stupidly_simple_filter.rb
271
266
  - examples/word_count.rb
272
267
  - lib/wukong.rb
@@ -275,7 +270,6 @@ files:
275
270
  - lib/wukong/datatypes.rb
276
271
  - lib/wukong/datatypes/enum.rb
277
272
  - lib/wukong/datatypes/fake_types.rb
278
- - lib/wukong/dfs.rb
279
273
  - lib/wukong/encoding.rb
280
274
  - lib/wukong/encoding/asciize.rb
281
275
  - lib/wukong/extensions.rb
@@ -295,18 +289,8 @@ files:
295
289
  - lib/wukong/extensions/struct.rb
296
290
  - lib/wukong/extensions/symbol.rb
297
291
  - lib/wukong/filename_pattern.rb
298
- - lib/wukong/keystore/cassandra_conditional_outputter.rb
299
- - lib/wukong/keystore/redis_db.rb
300
- - lib/wukong/keystore/tyrant_db.rb
301
- - lib/wukong/keystore/tyrant_notes.textile
302
292
  - lib/wukong/logger.rb
303
- - lib/wukong/models/graph.rb
304
- - lib/wukong/monitor.rb
305
- - lib/wukong/monitor/chunked_store.rb
306
- - lib/wukong/monitor/periodic_logger.rb
307
- - lib/wukong/monitor/periodic_monitor.rb
308
293
  - lib/wukong/periodic_monitor.rb
309
- - lib/wukong/rdf.rb
310
294
  - lib/wukong/schema.rb
311
295
  - lib/wukong/script.rb
312
296
  - lib/wukong/script/avro_command.rb
@@ -334,23 +318,22 @@ files:
334
318
  - lib/wukong/streamer.rb
335
319
  - lib/wukong/streamer/accumulating_reducer.rb
336
320
  - lib/wukong/streamer/base.rb
337
- - lib/wukong/streamer/cassandra_streamer.rb
338
- - lib/wukong/streamer/count_keys.rb
339
- - lib/wukong/streamer/count_lines.rb
340
321
  - lib/wukong/streamer/counting_reducer.rb
341
- - lib/wukong/streamer/em_streamer.rb
342
322
  - lib/wukong/streamer/filter.rb
343
323
  - lib/wukong/streamer/line_streamer.rb
344
324
  - lib/wukong/streamer/list_reducer.rb
345
- - lib/wukong/streamer/preprocess_with_pipe_streamer.rb
346
325
  - lib/wukong/streamer/rank_and_bin_reducer.rb
347
326
  - lib/wukong/streamer/record_streamer.rb
327
+ - lib/wukong/streamer/reducer.rb
348
328
  - lib/wukong/streamer/set_reducer.rb
349
329
  - lib/wukong/streamer/struct_streamer.rb
350
330
  - lib/wukong/streamer/summing_reducer.rb
351
331
  - lib/wukong/streamer/uniq_by_last_reducer.rb
352
332
  - lib/wukong/typed_struct.rb
353
- - lib/wukong/wukong_class.rb
333
+ - old/cassandra_streaming/berlitz_for_cassandra.textile
334
+ - old/cassandra_streaming/client_interface_notes.textile
335
+ - old/cassandra_streaming/client_schema.textile
336
+ - old/cassandra_streaming/tuning.textile
354
337
  - spec/data/a_atsigns_b.tsv
355
338
  - spec/data/a_follows_b.tsv
356
339
  - spec/data/tweet.tsv
@@ -365,8 +348,8 @@ homepage: http://mrflip.github.com/wukong
365
348
  licenses: []
366
349
 
367
350
  post_install_message:
368
- rdoc_options:
369
- - --charset=UTF-8
351
+ rdoc_options: []
352
+
370
353
  require_paths:
371
354
  - lib
372
355
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -374,6 +357,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
374
357
  requirements:
375
358
  - - ">="
376
359
  - !ruby/object:Gem::Version
360
+ hash: 3
377
361
  segments:
378
362
  - 0
379
363
  version: "0"
@@ -382,32 +366,27 @@ required_rubygems_version: !ruby/object:Gem::Requirement
382
366
  requirements:
383
367
  - - ">="
384
368
  - !ruby/object:Gem::Version
369
+ hash: 3
385
370
  segments:
386
371
  - 0
387
372
  version: "0"
388
373
  requirements: []
389
374
 
390
375
  rubyforge_project:
391
- rubygems_version: 1.3.7
376
+ rubygems_version: 1.4.2
392
377
  signing_key:
393
378
  specification_version: 3
394
379
  summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
395
380
  test_files:
396
- - spec/spec_helper.rb
397
- - spec/wukong/encoding_spec.rb
398
- - spec/wukong/script_spec.rb
399
- - examples/binning_percentile_estimator.rb
400
- - examples/cassandra_streaming/avromapper.rb
401
- - examples/cassandra_streaming/cassandra_random_partitioner.rb
402
- - examples/cassandra_streaming/struct_loader.rb
403
381
  - examples/contrib/jeans/normalize.rb
404
382
  - examples/contrib/jeans/sizes.rb
383
+ - examples/corpus/bucket_counter.rb
384
+ - examples/corpus/dbpedia_abstract_to_sentences.rb
385
+ - examples/corpus/sentence_coocurrence.rb
405
386
  - examples/corpus/words_to_bigrams.rb
406
- - examples/count_keys.rb
407
- - examples/count_keys_at_mapper.rb
408
387
  - examples/emr/elastic_mapreduce_example.rb
409
- - examples/keystore/cassandra_batch_test.rb
410
- - examples/keystore/conditional_outputter_example.rb
388
+ - examples/ignore_me/counting.rb
389
+ - examples/ignore_me/grouper.rb
411
390
  - examples/network_graph/adjacency_list.rb
412
391
  - examples/network_graph/breadth_first_search.rb
413
392
  - examples/network_graph/gen_2paths.rb
@@ -415,13 +394,17 @@ test_files:
415
394
  - examples/network_graph/gen_symmetric_links.rb
416
395
  - examples/pagerank/pagerank.rb
417
396
  - examples/pagerank/pagerank_initialize.rb
418
- - examples/rank_and_bin.rb
419
397
  - examples/sample_records.rb
420
398
  - examples/server_logs/apache_log_parser.rb
421
399
  - examples/server_logs/breadcrumbs.rb
400
+ - examples/server_logs/logline.rb
422
401
  - examples/server_logs/user_agent.rb
423
402
  - examples/size.rb
424
403
  - examples/stats/avg_value_frequency.rb
425
- - examples/store/chunked_store_example.rb
404
+ - examples/stats/binning_percentile_estimator.rb
405
+ - examples/stats/rank_and_bin.rb
426
406
  - examples/stupidly_simple_filter.rb
427
407
  - examples/word_count.rb
408
+ - spec/spec_helper.rb
409
+ - spec/wukong/encoding_spec.rb
410
+ - spec/wukong/script_spec.rb
@@ -1,85 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # To install avro gem
4
- # cd avro/lang/ruby ; gem package ; sudo gem install pkg/avro-1.4.0.pre1.gem
5
-
6
- require 'rubygems'
7
- require 'avro'
8
- require 'wukong'
9
- require 'wukong/periodic_monitor'
10
-
11
- Settings.define :cassandra_avro_schema, :default => ('/usr/local/share/cassandra/interface/avro/cassandra.avpr')
12
- Settings.define :cassandra_thrift_uri, :default => `hostname`.chomp.strip+':9160'
13
- Settings.define :log_interval, :default => 10_000
14
-
15
- class AvroStreamer < Wukong::Streamer::RecordStreamer
16
- def initialize *args
17
- super(*args)
18
- @writer = SmutWriter.new
19
- @log = PeriodicMonitor.new
20
- end
21
-
22
- def process word, count, *_
23
- @writer.write_directly(word, 'count', count)
24
- @log.periodically( word, count )
25
- end
26
- end
27
-
28
- class SmutWriter
29
- # Reads in the protocol schema
30
- # creates the necessary encoder and writer.
31
- def initialize
32
- schema_file = Settings.cassandra_avro_schema
33
- @proto = Avro::Protocol.parse(File.read(schema_file))
34
- @schema = @proto.types.detect{|schema| schema.name == 'StreamingMutation'}
35
- @enc = Avro::IO::BinaryEncoder.new($stdout)
36
- @writer = Avro::IO::DatumWriter.new(@schema)
37
- end
38
-
39
- # Directly write the simplified StreamingMutation schema; uses patch from @stuhood
40
- def write_directly key, col_name, value
41
- @enc.write_bytes(key)
42
- @enc.write_bytes(col_name)
43
- @enc.write_bytes(value)
44
- @enc.write_long(Time.epoch_microseconds)
45
- @enc.write_int(0)
46
- end
47
-
48
- # Write using the datumwriter
49
- def write key, col_name, value
50
- @writer.write(smutation(key, col_name, value), @enc)
51
- end
52
-
53
- # Simplified StreamingMutation schema uses patch from @stuhood
54
- def smutation key, name, value
55
- {
56
- 'key' => key,
57
- 'name' => name.to_s,
58
- 'value' => value.to_s,
59
- 'timestamp' => Time.epoch_microseconds,
60
- 'ttl' => 0
61
- }
62
- end
63
-
64
- # The StreamingMutation schema defined in trunk.
65
- # Becomes monstrously inefficient due to implementation of unions.
66
- def smutation_from_trunk key, name, value
67
- {
68
- 'key' => key,
69
- 'mutation' => { 'column_or_supercolumn' => { 'column' => {
70
- 'name' => name.to_s,
71
- 'value' => value.to_s,
72
- 'clock' => { 'timestamp' => Time.epoch_microseconds },
73
- 'ttl' => 0
74
- }}}
75
- }
76
- end
77
- end
78
-
79
- Time.class_eval do
80
- def self.epoch_microseconds
81
- (Time.now.utc.to_i * 1_000_000)
82
- end
83
- end
84
-
85
- Wukong::Script.new(AvroStreamer, nil, :map_speculative => false).run