wukong 1.5.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
@@ -1,23 +0,0 @@
1
- require 'monkeyshines/monitor/periodic_monitor'
2
- module Monkeyshines
3
- module Monitor
4
- module ChunkedStore
5
- attr_accessor :file_pattern
6
- def initialize file_pattern
7
- self.file_pattern = file_pattern
8
- super file_pattern.make
9
- end
10
-
11
- def close_and_reopen
12
- close
13
- self.filename = file_pattern.make
14
- dump_file
15
- end
16
-
17
- def save *args
18
- chunk_monitor.periodically{ close_rename_and_open }
19
- super *args
20
- end
21
- end
22
- end
23
- end
@@ -1,34 +0,0 @@
1
- module Monkeyshines
2
- module Monitor
3
-
4
- #
5
- # Emits a log line but only every +iter_interval+ calls or +time_interval+
6
- # lapse.
7
- #
8
- # Since the contents of the block aren't called until the criteria are met,
9
- # you can put relatively expensive operations in the log without killing
10
- # your iteration time.
11
- #
12
- class PeriodicLogger < PeriodicMonitor
13
- #
14
- # Call with a block that returns a string or array to log.
15
- # If you return
16
- #
17
- # Ex: log if it has been at least 5 minutes since last announcement:
18
- #
19
- # periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
20
- # loop do
21
- # # ... stuff ...
22
- # periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
23
- # end
24
- #
25
- def periodically &block
26
- super do
27
- now = Time.now.utc.to_f
28
- result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
29
- Log.info result.join("\t")
30
- end
31
- end
32
- end
33
- end
34
- end
@@ -1,70 +0,0 @@
1
- module Wukong::Monitor
2
- #
3
- # Accepts a lightweight call every iteration.
4
- #
5
- # Once either a time or an iteration criterion is met, executes the block
6
- # and resets the timer until next execution.
7
- #
8
- # Note that the +time_interval+ is measured *execution to execution* and not
9
- # in multiples of iter_interval. Say I set a time_interval of 300s, and
10
- # happen to iterate at 297s and 310s after start. Then the monitor will
11
- # execute at 310s, and the next execution will happen on or after 610s.
12
- #
13
- # Also note that when *either* criterion is met, *both* criteria are
14
- # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
15
- # and that at 250s I reach iteration 10_000. Then the monitor will execute
16
- # on or after 20_000 iterations or 550s, whichever happens first.
17
- #
18
- class PeriodicMonitor
19
- attr_accessor :time_interval, :iter_interval
20
- attr_accessor :last_time, :current_iter, :iter, :started_at
21
-
22
- def initialize options={}
23
- self.started_at = Time.now.utc.to_f
24
- self.last_time = started_at
25
- self.iter = 0
26
- self.current_iter = 0
27
- self.time_interval = options[:time]
28
- self.iter_interval = options[:iters]
29
- end
30
-
31
- # True if the total iteration count is a multiple of +iter_interval+.
32
- def enough_iterations?
33
- iter % iter_interval == 0 if iter_interval
34
- end
35
-
36
- # True if more than +time_interval+ has elapsed since last execution.
37
- def enough_time? now
38
- (now - last_time) > time_interval if time_interval
39
- end
40
-
41
- # Time since monitor was created
42
- def since
43
- Time.now.utc.to_f - started_at
44
- end
45
- # Overall iterations per second
46
- def rate
47
- iter.to_f / since.to_f
48
- end
49
- # "Instantaneous" iterations per second
50
- def inst_rate now
51
- current_iter.to_f / (now-last_time).to_f
52
- end
53
-
54
- #
55
- # if the interval conditions are met, executes block; otherwise just does
56
- # bookkeeping and returns.
57
- #
58
- def periodically &block
59
- self.iter += 1
60
- self.current_iter += 1
61
- now = Time.now.utc.to_f
62
- if enough_iterations? || enough_time?(now)
63
- block.call(iter, (now-last_time))
64
- self.last_time = now
65
- self.current_iter = 0
66
- end
67
- end
68
- end
69
-
70
- end
@@ -1,7 +0,0 @@
1
- module Monkeyshines
2
- module Monitor
3
- autoload :PeriodicMonitor, 'monkeyshines/monitor/periodic_monitor'
4
- autoload :PeriodicLogger, 'monkeyshines/monitor/periodic_logger'
5
- end
6
- end
7
-
data/lib/wukong/rdf.rb DELETED
@@ -1,104 +0,0 @@
1
- module Wukong
2
- #
3
- # Dump wukong object as RDF triples:
4
- #
5
- # <key attr val module Wukong
6
- #
7
- # Dump wukong object as RDF triples:
8
- #
9
- # <key> <attr> <val> # <extra>
10
- #
11
- # Each element of the triple is XML encoded such that it contains no tab,
12
- # newline or carriage returns, and the three are tab-separated. Any extra
13
- # fields -- reification info, for instance -- are appended as a comment.
14
- #
15
- # This makes the result not only a valid RDF triple file but perfectly
16
- # palatable to Wukong for further processing.
17
- #
18
- module Rdf
19
-
20
- #
21
- # RDF-formatted date
22
- #
23
- def self.encode_datetime dt
24
- DateTime.parse_safely(dt).xmlschema
25
- end
26
-
27
- #
28
- # Emit a component (subject or object) with the right semantic encoding
29
- #
30
- # Use :boolskip if a false property should just be left out.
31
- #
32
- def rdf_component val, type
33
- case type
34
- when :tweet then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
35
- when :user then %Q{<http://twitter.com/users/show/#{val}.xml>}
36
- when :bool then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
37
- when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
38
- when :int then %Q{"#{val.to_i}"^^<xsd:integer>}
39
- when :date then %Q{"#{TwitterRdf.encode_datetime(val)}"^^<xsd:dateTime>}
40
- when :str then %Q{"#{val}"}
41
- else raise "Don't know how to encode #{type}"
42
- end
43
- end
44
-
45
- #
46
- # Express relationship (predicate) in RDF
47
- #
48
- def rdf_pred pred
49
- case pred
50
- when :created_at then %Q{<http://twitter.com/##{pred}>}
51
- else %Q{<http://twitter.com/##{pred}>}
52
- end
53
- end
54
-
55
- #
56
- # RDF Triple string for the given (subject, predicate, object)
57
- # http://www.w3.org/TR/rdf-testcases/#ntriples
58
- #
59
- def self.rdf_triple subj, pred, obj, comment=nil
60
- comment = "\t# " + comment.to_s unless comment.blank?
61
- %Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
62
- end
63
-
64
- def mutable?(attr)
65
- false
66
- end
67
-
68
- #
69
- # Extract [subject, predicate, object, (extra)] tuples.
70
- #
71
- # (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
72
- #
73
- def to_rdf3_tuples
74
- members_with_types.map do |attr, type|
75
- next if self[attr].blank?
76
- subj = rdf_resource
77
- pred = rdf_pred(attr)
78
- obj = rdf_component(self[attr], type) or next
79
- comment = scraped_at if mutable?(attr)
80
- [subj, pred, obj, comment]
81
- end.compact
82
- end
83
-
84
- #
85
- # Convert an object to an rdf triple.
86
- #
87
- # Appends scraped at to #mutable? attributes
88
- #
89
- def to_rdf3
90
- to_rdf3_tuples.map do |tuple|
91
- self.class.rdf_triple tuple
92
- end.join("\n")
93
- end
94
-
95
- end
96
- end
97
- >
98
- #
99
- #
100
- module Rdf
101
- def to_rdf
102
- end
103
- end
104
- end
@@ -1,61 +0,0 @@
1
- # Defines a base class for streaming data into a cassandra db connection.
2
- require 'cassandra' ; include Cassandra::Constants
3
- module Wukong
4
- module Streamer
5
-
6
- class CassandraStreamer < Wukong::Streamer::Base
7
- attr_accessor :batch_count, :batch_record_count, :batch_size, :column_space, :db_seeds, :cassandra_db
8
-
9
- def initialize *args
10
- super *args
11
- self.batch_count = 0
12
- self.batch_record_count = 0
13
- self.column_space ||= 'Twitter'
14
- self.batch_size ||= 100
15
- self.db_seeds ||= %w[10.244.191.178 10.243.19.223 10.243.17.219 10.245.70.85 10.244.206.241].map{ |s| s.to_s+':9160'}
16
- self.cassandra_db ||= Cassandra.new(self.column_space, self.db_seeds)
17
- end
18
-
19
- def stream
20
- while still_lines? do
21
- start_batch do
22
- while still_lines? && batch_not_full? do
23
- line = get_line
24
- record = recordize(line.chomp) or next
25
- next if record.blank?
26
- process(*record) do |output_record|
27
- emit output_record
28
- end
29
- self.batch_record_count += 1
30
- end
31
- end
32
- end
33
- end
34
-
35
- def process *args, &blk
36
- Raise "Overwrite this method to insert into cassandra db"
37
- end
38
-
39
- def start_batch &blk
40
- self.batch_record_count = 0
41
- self.batch_count += 1
42
- self.cassandra_db.batch(&blk)
43
- end
44
-
45
- def get_line
46
- $stdin.gets
47
- end
48
-
49
- def still_lines?
50
- !$stdin.eof?
51
- end
52
-
53
- def batch_not_full?
54
- self.batch_record_count < self.batch_size
55
- end
56
-
57
- end
58
- end
59
-
60
- end
61
-
@@ -1,30 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- #
4
- # Emit each unique key and the count of its occurrences
5
- #
6
- class CountKeys < Wukong::Streamer::AccumulatingReducer
7
- attr_accessor :key_count
8
-
9
- def formatted_key_count
10
- "%10d"%key_count.to_i
11
- end
12
-
13
- # reset the counter to zero
14
- def start! *args
15
- self.key_count = 0
16
- end
17
-
18
- # record one more for this key
19
- def accumulate *vals
20
- self.key_count += 1
21
- end
22
-
23
- # emit each key field and the count, tab-separated.
24
- def finalize
25
- yield [key, formatted_key_count]
26
- end
27
- end
28
-
29
- end
30
- end
@@ -1,26 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- #
4
- # For each identical line in the map phase output, emit one representative
5
- # line followed by the count of occurrences (separated by a tab).
6
- #
7
- # (This is the functional equivalent of +'uniq -c'+)
8
- #
9
- class CountLines < Wukong::Streamer::Base
10
- def formatted_count item, key_count
11
- "%s\t%10d" % [item, key_count.to_i]
12
- end
13
-
14
- #
15
- # Delegate to +uniq -c+, but put the count last for idempotence.
16
- #
17
- def stream
18
- %x{/usr/bin/uniq -c}.split("\n").each do |line|
19
- key_count, item = line.chomp.strip.split(/\s+/, 2)
20
- puts formatted_count(item, key_count)
21
- end
22
- end
23
- end
24
-
25
- end
26
- end
@@ -1,7 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- class EmStreamer
4
-
5
- end
6
- end
7
- end
@@ -1,22 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- module PreprocessWithPipeStreamer
4
- #
5
- # Runs STDIN through a shell command and then begins processing.
6
- #
7
- # If you don't need to do anything to the output of the command, just
8
- # inherit from Wukong::Script and override the #map_command.
9
- #
10
- # You must provide a @preprocess_pipe_command@ method that returns a shell
11
- # command to run the input through.
12
- #
13
- def stream
14
- #
15
- `#{preprocess_pipe_command}`.each do |line|
16
- item = itemize(line) ; next if item.blank?
17
- process(*item)
18
- end
19
- end
20
- end
21
- end
22
- end
@@ -1,21 +0,0 @@
1
- # require 'active_support/core_ext/class/inheritable_attributes.rb'
2
- require 'extlib/class'
3
-
4
- module Wukong
5
- #
6
- # Use to instrument an actual class to behave
7
- #
8
- module WukongClass
9
-
10
-
11
- def [](attr)
12
- self.send attr
13
- end
14
- def []=(attr, val)
15
- self.send("#{attr}=", val)
16
- end
17
-
18
- end
19
-
20
-
21
- end