wukong 1.5.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (87) hide show
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
@@ -1,23 +0,0 @@
1
- require 'monkeyshines/monitor/periodic_monitor'
2
- module Monkeyshines
3
- module Monitor
4
- module ChunkedStore
5
- attr_accessor :file_pattern
6
- def initialize file_pattern
7
- self.file_pattern = file_pattern
8
- super file_pattern.make
9
- end
10
-
11
- def close_and_reopen
12
- close
13
- self.filename = file_pattern.make
14
- dump_file
15
- end
16
-
17
- def save *args
18
- chunk_monitor.periodically{ close_rename_and_open }
19
- super *args
20
- end
21
- end
22
- end
23
- end
@@ -1,34 +0,0 @@
1
- module Monkeyshines
2
- module Monitor
3
-
4
- #
5
- # Emits a log line but only every +iter_interval+ calls or +time_interval+
6
- # lapse.
7
- #
8
- # Since the contents of the block aren't called until the criteria are met,
9
- # you can put relatively expensive operations in the log without killing
10
- # your iteration time.
11
- #
12
- class PeriodicLogger < PeriodicMonitor
13
- #
14
- # Call with a block that returns a string or array to log.
15
- # If you return
16
- #
17
- # Ex: log if it has been at least 5 minutes since last announcement:
18
- #
19
- # periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
20
- # loop do
21
- # # ... stuff ...
22
- # periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
23
- # end
24
- #
25
- def periodically &block
26
- super do
27
- now = Time.now.utc.to_f
28
- result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
29
- Log.info result.join("\t")
30
- end
31
- end
32
- end
33
- end
34
- end
@@ -1,70 +0,0 @@
1
- module Wukong::Monitor
2
- #
3
- # Accepts a lightweight call every iteration.
4
- #
5
- # Once either a time or an iteration criterion is met, executes the block
6
- # and resets the timer until next execution.
7
- #
8
- # Note that the +time_interval+ is measured *execution to execution* and not
9
- # in multiples of iter_interval. Say I set a time_interval of 300s, and
10
- # happen to iterate at 297s and 310s after start. Then the monitor will
11
- # execute at 310s, and the next execution will happen on or after 610s.
12
- #
13
- # Also note that when *either* criterion is met, *both* criteria are
14
- # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
15
- # and that at 250s I reach iteration 10_000. Then the monitor will execute
16
- # on or after 20_000 iteration or 550s, whichever happens first.
17
- #
18
- class PeriodicMonitor
19
- attr_accessor :time_interval, :iter_interval
20
- attr_accessor :last_time, :current_iter, :iter, :started_at
21
-
22
- def initialize options={}
23
- self.started_at = Time.now.utc.to_f
24
- self.last_time = started_at
25
- self.iter = 0
26
- self.current_iter = 0
27
- self.time_interval = options[:time]
28
- self.iter_interval = options[:iters]
29
- end
30
-
31
- # True if more than +iter_interval+ has elapsed since last execution.
32
- def enough_iterations?
33
- iter % iter_interval == 0 if iter_interval
34
- end
35
-
36
- # True if more than +time_interval+ has elapsed since last execution.
37
- def enough_time? now
38
- (now - last_time) > time_interval if time_interval
39
- end
40
-
41
- # Time since monitor was created
42
- def since
43
- Time.now.utc.to_f - started_at
44
- end
45
- # Overall iterations per second
46
- def rate
47
- iter.to_f / since.to_f
48
- end
49
- # "Instantaneous" iterations per second
50
- def inst_rate now
51
- current_iter.to_f / (now-last_time).to_f
52
- end
53
-
54
- #
55
- # if the interval conditions are met, executes block; otherwise just does
56
- # bookkeeping and returns.
57
- #
58
- def periodically &block
59
- self.iter += 1
60
- self.current_iter += 1
61
- now = Time.now.utc.to_f
62
- if enough_iterations? || enough_time?(now)
63
- block.call(iter, (now-last_time))
64
- self.last_time = now
65
- self.current_iter = 0
66
- end
67
- end
68
- end
69
-
70
- end
@@ -1,7 +0,0 @@
1
- module Monkeyshines
2
- module Monitor
3
- autoload :PeriodicMonitor, 'monkeyshines/monitor/periodic_monitor'
4
- autoload :PeriodicLogger, 'monkeyshines/monitor/periodic_logger'
5
- end
6
- end
7
-
data/lib/wukong/rdf.rb DELETED
@@ -1,104 +0,0 @@
1
- module Wukong
2
- #
3
- # Dump wukong object as RDF triples:
4
- #
5
- # <key attr val module Wukong
6
- #
7
- # Dump wukong object as RDF triples:
8
- #
9
- # <key> <attr> <val> # <extra>
10
- #
11
- # Each element of the triple is XML encoded such that it contains no tab,
12
- # newline or carriage returns, and the three are tab-separated. Any extra
13
- # fields -- reification info, for instance -- are appended as a comment.
14
- #
15
- # This makes the result not only a valid RDF triple file but perfectly
16
- # palatable to Wukong for further processing.
17
- #
18
- module Rdf
19
-
20
- #
21
- # RDF-formatted date
22
- #
23
- def self.encode_datetime dt
24
- DateTime.parse_safely(dt).xmlschema
25
- end
26
-
27
- #
28
- # Emit a component (subject or object) with the right semantic encoding
29
- #
30
- # Use :boolskip if a false property should just be left out.
31
- #
32
- def rdf_component val, type
33
- case type
34
- when :tweet then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
35
- when :user then %Q{<http://twitter.com/users/show/#{val}.xml>}
36
- when :bool then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
37
- when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
38
- when :int then %Q{"#{val.to_i}"^^<xsd:integer>}
39
- when :date then %Q{"#{TwitterRdf.encode_datetime(val)}"^^<xsd:dateTime>}
40
- when :str then %Q{"#{val}"}
41
- else raise "Don't know how to encode #{type}"
42
- end
43
- end
44
-
45
- #
46
- # Express relationship (predicate) in RDF
47
- #
48
- def rdf_pred pred
49
- case pred
50
- when :created_at then %Q{<http://twitter.com/##{pred}>}
51
- else %Q{<http://twitter.com/##{pred}>}
52
- end
53
- end
54
-
55
- #
56
- # RDF Triple string for the given (subject, predicate, object)
57
- # http://www.w3.org/TR/rdf-testcases/#ntriples
58
- #
59
- def self.rdf_triple subj, pred, obj, comment=nil
60
- comment = "\t# " + comment.to_s unless comment.blank?
61
- %Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
62
- end
63
-
64
- def mutable?(attr)
65
- false
66
- end
67
-
68
- #
69
- # Extract [subject, predicate, object, (extra)] tuples.
70
- #
71
- # (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
72
- #
73
- def to_rdf3_tuples
74
- members_with_types.map do |attr, type|
75
- next if self[attr].blank?
76
- subj = rdf_resource
77
- pred = rdf_pred(attr)
78
- obj = rdf_component(self[attr], type) or next
79
- comment = scraped_at if mutable?(attr)
80
- [subj, pred, obj, comment]
81
- end.compact
82
- end
83
-
84
- #
85
- # Convert an object to an rdf triple.
86
- #
87
- # Appends scraped at to #mutable? attributes
88
- #
89
- def to_rdf3
90
- to_rdf3_tuples.map do |tuple|
91
- self.class.rdf_triple tuple
92
- end.join("\n")
93
- end
94
-
95
- end
96
- end
97
- >
98
- #
99
- #
100
- module Rdf
101
- def to_rdf
102
- end
103
- end
104
- end
@@ -1,61 +0,0 @@
1
- # Defines a base class for streaming data into a cassandra db connection.
2
- require 'cassandra' ; include Cassandra::Constants
3
- module Wukong
4
- module Streamer
5
-
6
- class CassandraStreamer < Wukong::Streamer::Base
7
- attr_accessor :batch_count, :batch_record_count, :batch_size, :column_space, :db_seeds, :cassandra_db
8
-
9
- def initialize *args
10
- super *args
11
- self.batch_count = 0
12
- self.batch_record_count = 0
13
- self.column_space ||= 'Twitter'
14
- self.batch_size ||= 100
15
- self.db_seeds ||= %w[10.244.191.178 10.243.19.223 10.243.17.219 10.245.70.85 10.244.206.241].map{ |s| s.to_s+':9160'}
16
- self.cassandra_db ||= Cassandra.new(self.column_space, self.db_seeds)
17
- end
18
-
19
- def stream
20
- while still_lines? do
21
- start_batch do
22
- while still_lines? && batch_not_full? do
23
- line = get_line
24
- record = recordize(line.chomp) or next
25
- next if record.blank?
26
- process(*record) do |output_record|
27
- emit output_record
28
- end
29
- self.batch_record_count += 1
30
- end
31
- end
32
- end
33
- end
34
-
35
- def process *args, &blk
36
- Raise "Overwrite this method to insert into cassandra db"
37
- end
38
-
39
- def start_batch &blk
40
- self.batch_record_count = 0
41
- self.batch_count += 1
42
- self.cassandra_db.batch(&blk)
43
- end
44
-
45
- def get_line
46
- $stdin.gets
47
- end
48
-
49
- def still_lines?
50
- !$stdin.eof?
51
- end
52
-
53
- def batch_not_full?
54
- self.batch_record_count < self.batch_size
55
- end
56
-
57
- end
58
- end
59
-
60
- end
61
-
@@ -1,30 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- #
4
- # Emit each unique key and the count of its occurrences
5
- #
6
- class CountKeys < Wukong::Streamer::AccumulatingReducer
7
- attr_accessor :key_count
8
-
9
- def formatted_key_count
10
- "%10d"%key_count.to_i
11
- end
12
-
13
- # reset the counter to zero
14
- def start! *args
15
- self.key_count = 0
16
- end
17
-
18
- # record one more for this key
19
- def accumulate *vals
20
- self.key_count += 1
21
- end
22
-
23
- # emit each key field and the count, tab-separated.
24
- def finalize
25
- yield [key, formatted_key_count]
26
- end
27
- end
28
-
29
- end
30
- end
@@ -1,26 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- #
4
- # For each identical line in the map phase output, emit one representative
5
- # line followed by the count of occurrences (separated by a tab).
6
- #
7
- # (This is the functional equivalent of +'uniq -c'+)
8
- #
9
- class CountLines < Wukong::Streamer::Base
10
- def formatted_count item, key_count
11
- "%s\t%10d" % [item, key_count.to_i]
12
- end
13
-
14
- #
15
- # Delegate to +uniq -c+, but put the count last for idempotence.
16
- #
17
- def stream
18
- %x{/usr/bin/uniq -c}.split("\n").each do |line|
19
- key_count, item = line.chomp.strip.split(/\s+/, 2)
20
- puts formatted_count(item, key_count)
21
- end
22
- end
23
- end
24
-
25
- end
26
- end
@@ -1,7 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- class EmStreamer
4
-
5
- end
6
- end
7
- end
@@ -1,22 +0,0 @@
1
- module Wukong
2
- module Streamer
3
- module PreprocessWithPipeStreamer
4
- #
5
- # Runs STDIN through a shell command and then begins processing.
6
- #
7
- # If you don't need to do anything to the output of the command, just
8
- # inherit from Wukong::Script and override the #map_command.
9
- #
10
- # You must provide a @preprocess_pipe_command@ method that returns a shell
11
- # command to run the input through.
12
- #
13
- def stream
14
- #
15
- `#{preprocess_pipe_command}`.each do |line|
16
- item = itemize(line) ; next if item.blank?
17
- process(*item)
18
- end
19
- end
20
- end
21
- end
22
- end
@@ -1,21 +0,0 @@
1
- # require 'active_support/core_ext/class/inheritable_attributes.rb'
2
- require 'extlib/class'
3
-
4
- module Wukong
5
- #
6
- # Use to instrument an actual class to behave
7
- #
8
- module WukongClass
9
-
10
-
11
- def [](attr)
12
- self.send attr
13
- end
14
- def []=(attr, val)
15
- self.send("#{attr}=", val)
16
- end
17
-
18
- end
19
-
20
-
21
- end