wukong 1.5.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'monkeyshines/monitor/periodic_monitor'
|
2
|
-
module Monkeyshines
|
3
|
-
module Monitor
|
4
|
-
module ChunkedStore
|
5
|
-
attr_accessor :file_pattern
|
6
|
-
def initialize file_pattern
|
7
|
-
self.file_pattern = file_pattern
|
8
|
-
super file_pattern.make
|
9
|
-
end
|
10
|
-
|
11
|
-
def close_and_reopen
|
12
|
-
close
|
13
|
-
self.filename = file_pattern.make
|
14
|
-
dump_file
|
15
|
-
end
|
16
|
-
|
17
|
-
def save *args
|
18
|
-
chunk_monitor.periodically{ close_rename_and_open }
|
19
|
-
super *args
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
module Monkeyshines
|
2
|
-
module Monitor
|
3
|
-
|
4
|
-
#
|
5
|
-
# Emits a log line but only every +iter_interval+ calls or +time_interval+
|
6
|
-
# lapse.
|
7
|
-
#
|
8
|
-
# Since the contents of the block aren't called until the criteria are met,
|
9
|
-
# you can put relatively expensive operations in the log without killing
|
10
|
-
# your iteration time.
|
11
|
-
#
|
12
|
-
class PeriodicLogger < PeriodicMonitor
|
13
|
-
#
|
14
|
-
# Call with a block that returns a string or array to log.
|
15
|
-
# If you return
|
16
|
-
#
|
17
|
-
# Ex: log if it has been at least 5 minutes since last announcement:
|
18
|
-
#
|
19
|
-
# periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
|
20
|
-
# loop do
|
21
|
-
# # ... stuff ...
|
22
|
-
# periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
|
23
|
-
# end
|
24
|
-
#
|
25
|
-
def periodically &block
|
26
|
-
super do
|
27
|
-
now = Time.now.utc.to_f
|
28
|
-
result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
|
29
|
-
Log.info result.join("\t")
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
@@ -1,70 +0,0 @@
|
|
1
|
-
module Wukong::Monitor
|
2
|
-
#
|
3
|
-
# Accepts a lightweight call every iteration.
|
4
|
-
#
|
5
|
-
# Once either a time or an iteration criterion is met, executes the block
|
6
|
-
# and resets the timer until next execution.
|
7
|
-
#
|
8
|
-
# Note that the +time_interval+ is measured *execution to execution* and not
|
9
|
-
# in multiples of iter_interval. Say I set a time_interval of 300s, and
|
10
|
-
# happen to iterate at 297s and 310s after start. Then the monitor will
|
11
|
-
# execute at 310s, and the next execution will happen on or after 610s.
|
12
|
-
#
|
13
|
-
# Also note that when *either* criterion is met, *both* criteria are
|
14
|
-
# reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
|
15
|
-
# and that at 250s I reach iteration 10_000. Then the monitor will execute
|
16
|
-
# on or after 20_000 iteration or 550s, whichever happens first.
|
17
|
-
#
|
18
|
-
class PeriodicMonitor
|
19
|
-
attr_accessor :time_interval, :iter_interval
|
20
|
-
attr_accessor :last_time, :current_iter, :iter, :started_at
|
21
|
-
|
22
|
-
def initialize options={}
|
23
|
-
self.started_at = Time.now.utc.to_f
|
24
|
-
self.last_time = started_at
|
25
|
-
self.iter = 0
|
26
|
-
self.current_iter = 0
|
27
|
-
self.time_interval = options[:time]
|
28
|
-
self.iter_interval = options[:iters]
|
29
|
-
end
|
30
|
-
|
31
|
-
# True if the total iteration count is an exact multiple of +iter_interval+.
|
32
|
-
def enough_iterations?
|
33
|
-
iter % iter_interval == 0 if iter_interval
|
34
|
-
end
|
35
|
-
|
36
|
-
# True if more than +time_interval+ has elapsed since last execution.
|
37
|
-
def enough_time? now
|
38
|
-
(now - last_time) > time_interval if time_interval
|
39
|
-
end
|
40
|
-
|
41
|
-
# Time since monitor was created
|
42
|
-
def since
|
43
|
-
Time.now.utc.to_f - started_at
|
44
|
-
end
|
45
|
-
# Overall iterations per second
|
46
|
-
def rate
|
47
|
-
iter.to_f / since.to_f
|
48
|
-
end
|
49
|
-
# "Instantaneous" iterations per second
|
50
|
-
def inst_rate now
|
51
|
-
current_iter.to_f / (now-last_time).to_f
|
52
|
-
end
|
53
|
-
|
54
|
-
#
|
55
|
-
# if the interval conditions are met, executes block; otherwise just does
|
56
|
-
# bookkeeping and returns.
|
57
|
-
#
|
58
|
-
def periodically &block
|
59
|
-
self.iter += 1
|
60
|
-
self.current_iter += 1
|
61
|
-
now = Time.now.utc.to_f
|
62
|
-
if enough_iterations? || enough_time?(now)
|
63
|
-
block.call(iter, (now-last_time))
|
64
|
-
self.last_time = now
|
65
|
-
self.current_iter = 0
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
end
|
data/lib/wukong/monitor.rb
DELETED
data/lib/wukong/rdf.rb
DELETED
@@ -1,104 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
#
|
3
|
-
# Dump wukong object as RDF triples:
|
4
|
-
#
|
5
|
-
# <key attr val module Wukong
|
6
|
-
#
|
7
|
-
# Dump wukong object as RDF triples:
|
8
|
-
#
|
9
|
-
# <key> <attr> <val> # <extra>
|
10
|
-
#
|
11
|
-
# Each element of the triple is XML encoded such that it contains no tab,
|
12
|
-
# newline or carriage returns, and the three are tab-separated. Any extra
|
13
|
-
# fields -- reification info, for instance -- are appended as a comment.
|
14
|
-
#
|
15
|
-
# This makes the result not only a valid RDF triple file but perfectly
|
16
|
-
# palatable to Wukong for further processing.
|
17
|
-
#
|
18
|
-
module Rdf
|
19
|
-
|
20
|
-
#
|
21
|
-
# RDF-formatted date
|
22
|
-
#
|
23
|
-
def self.encode_datetime dt
|
24
|
-
DateTime.parse_safely(dt).xmlschema
|
25
|
-
end
|
26
|
-
|
27
|
-
#
|
28
|
-
# Emit a component (subject or object) with the right semantic encoding
|
29
|
-
#
|
30
|
-
# Use :boolskip if a false property should just be left out.
|
31
|
-
#
|
32
|
-
def rdf_component val, type
|
33
|
-
case type
|
34
|
-
when :tweet then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
|
35
|
-
when :user then %Q{<http://twitter.com/users/show/#{val}.xml>}
|
36
|
-
when :bool then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
|
37
|
-
when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
|
38
|
-
when :int then %Q{"#{val.to_i}"^^<xsd:integer>}
|
39
|
-
when :date then %Q{"#{TwitterRdf.encode_datetime(val)}"^^<xsd:dateTime>}
|
40
|
-
when :str then %Q{"#{val}"}
|
41
|
-
else raise "Don't know how to encode #{type}"
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
#
|
46
|
-
# Express relationship (predicate) in RDF
|
47
|
-
#
|
48
|
-
def rdf_pred pred
|
49
|
-
case pred
|
50
|
-
when :created_at then %Q{<http://twitter.com/##{pred}>}
|
51
|
-
else %Q{<http://twitter.com/##{pred}>}
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
#
|
56
|
-
# RDF Triple string for the given (subject, predicate, object)
|
57
|
-
# http://www.w3.org/TR/rdf-testcases/#ntriples
|
58
|
-
#
|
59
|
-
def self.rdf_triple subj, pred, obj, comment=nil
|
60
|
-
comment = "\t# " + comment.to_s unless comment.blank?
|
61
|
-
%Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
|
62
|
-
end
|
63
|
-
|
64
|
-
def mutable?(attr)
|
65
|
-
false
|
66
|
-
end
|
67
|
-
|
68
|
-
#
|
69
|
-
# Extract [subject, predicate, object, (extra)] tuples.
|
70
|
-
#
|
71
|
-
# (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
|
72
|
-
#
|
73
|
-
def to_rdf3_tuples
|
74
|
-
members_with_types.map do |attr, type|
|
75
|
-
next if self[attr].blank?
|
76
|
-
subj = rdf_resource
|
77
|
-
pred = rdf_pred(attr)
|
78
|
-
obj = rdf_component(self[attr], type) or next
|
79
|
-
comment = scraped_at if mutable?(attr)
|
80
|
-
[subj, pred, obj, comment]
|
81
|
-
end.compact
|
82
|
-
end
|
83
|
-
|
84
|
-
#
|
85
|
-
# Convert an object to an rdf triple.
|
86
|
-
#
|
87
|
-
# Appends scraped at to #mutable? attributes
|
88
|
-
#
|
89
|
-
def to_rdf3
|
90
|
-
to_rdf3_tuples.map do |tuple|
|
91
|
-
self.class.rdf_triple tuple
|
92
|
-
end.join("\n")
|
93
|
-
end
|
94
|
-
|
95
|
-
end
|
96
|
-
end
|
97
|
-
>
|
98
|
-
#
|
99
|
-
#
|
100
|
-
module Rdf
|
101
|
-
def to_rdf
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
@@ -1,61 +0,0 @@
|
|
1
|
-
# Defines a base class for streaming data into a cassandra db connection.
|
2
|
-
require 'cassandra' ; include Cassandra::Constants
|
3
|
-
module Wukong
|
4
|
-
module Streamer
|
5
|
-
|
6
|
-
class CassandraStreamer < Wukong::Streamer::Base
|
7
|
-
attr_accessor :batch_count, :batch_record_count, :batch_size, :column_space, :db_seeds, :cassandra_db
|
8
|
-
|
9
|
-
def initialize *args
|
10
|
-
super *args
|
11
|
-
self.batch_count = 0
|
12
|
-
self.batch_record_count = 0
|
13
|
-
self.column_space ||= 'Twitter'
|
14
|
-
self.batch_size ||= 100
|
15
|
-
self.db_seeds ||= %w[10.244.191.178 10.243.19.223 10.243.17.219 10.245.70.85 10.244.206.241].map{ |s| s.to_s+':9160'}
|
16
|
-
self.cassandra_db ||= Cassandra.new(self.column_space, self.db_seeds)
|
17
|
-
end
|
18
|
-
|
19
|
-
def stream
|
20
|
-
while still_lines? do
|
21
|
-
start_batch do
|
22
|
-
while still_lines? && batch_not_full? do
|
23
|
-
line = get_line
|
24
|
-
record = recordize(line.chomp) or next
|
25
|
-
next if record.blank?
|
26
|
-
process(*record) do |output_record|
|
27
|
-
emit output_record
|
28
|
-
end
|
29
|
-
self.batch_record_count += 1
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def process *args, &blk
|
36
|
-
Raise "Overwrite this method to insert into cassandra db"
|
37
|
-
end
|
38
|
-
|
39
|
-
def start_batch &blk
|
40
|
-
self.batch_record_count = 0
|
41
|
-
self.batch_count += 1
|
42
|
-
self.cassandra_db.batch(&blk)
|
43
|
-
end
|
44
|
-
|
45
|
-
def get_line
|
46
|
-
$stdin.gets
|
47
|
-
end
|
48
|
-
|
49
|
-
def still_lines?
|
50
|
-
!$stdin.eof?
|
51
|
-
end
|
52
|
-
|
53
|
-
def batch_not_full?
|
54
|
-
self.batch_record_count < self.batch_size
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
end
|
61
|
-
|
@@ -1,30 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Streamer
|
3
|
-
#
|
4
|
-
# Emit each unique key and the count of its occurrences
|
5
|
-
#
|
6
|
-
class CountKeys < Wukong::Streamer::AccumulatingReducer
|
7
|
-
attr_accessor :key_count
|
8
|
-
|
9
|
-
def formatted_key_count
|
10
|
-
"%10d"%key_count.to_i
|
11
|
-
end
|
12
|
-
|
13
|
-
# reset the counter to zero
|
14
|
-
def start! *args
|
15
|
-
self.key_count = 0
|
16
|
-
end
|
17
|
-
|
18
|
-
# record one more for this key
|
19
|
-
def accumulate *vals
|
20
|
-
self.key_count += 1
|
21
|
-
end
|
22
|
-
|
23
|
-
# emit each key field and the count, tab-separated.
|
24
|
-
def finalize
|
25
|
-
yield [key, formatted_key_count]
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Streamer
|
3
|
-
#
|
4
|
-
# For each identical line in the map phase output, emit one representative
|
5
|
-
# line followed by the count of occurrences (separated by a tab).
|
6
|
-
#
|
7
|
-
# (This is the functional equivalent of +'uniq -c'+)
|
8
|
-
#
|
9
|
-
class CountLines < Wukong::Streamer::Base
|
10
|
-
def formatted_count item, key_count
|
11
|
-
"%s\t%10d" % [item, key_count.to_i]
|
12
|
-
end
|
13
|
-
|
14
|
-
#
|
15
|
-
# Delegate to +uniq -c+, but put the count last for idempotence.
|
16
|
-
#
|
17
|
-
def stream
|
18
|
-
%x{/usr/bin/uniq -c}.split("\n").each do |line|
|
19
|
-
key_count, item = line.chomp.strip.split(/\s+/, 2)
|
20
|
-
puts formatted_count(item, key_count)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
end
|
26
|
-
end
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Streamer
|
3
|
-
module PreprocessWithPipeStreamer
|
4
|
-
#
|
5
|
-
# Runs STDIN through a shell command and then begins processing.
|
6
|
-
#
|
7
|
-
# If you don't need to do anything to the output of the command, just
|
8
|
-
# inherit from Wukong::Script and override the #map_command.
|
9
|
-
#
|
10
|
-
# You must provide a @preprocess_pipe_command@ method that returns a shell
|
11
|
-
# command to run the input through.
|
12
|
-
#
|
13
|
-
def stream
|
14
|
-
#
|
15
|
-
`#{preprocess_pipe_command}`.each do |line|
|
16
|
-
item = itemize(line) ; next if item.blank?
|
17
|
-
process(*item)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
data/lib/wukong/wukong_class.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
# require 'active_support/core_ext/class/inheritable_attributes.rb'
|
2
|
-
require 'extlib/class'
|
3
|
-
|
4
|
-
module Wukong
|
5
|
-
#
|
6
|
-
# Use to instrument an actual class to behave
|
7
|
-
#
|
8
|
-
module WukongClass
|
9
|
-
|
10
|
-
|
11
|
-
def [](attr)
|
12
|
-
self.send attr
|
13
|
-
end
|
14
|
-
def []=(attr, val)
|
15
|
-
self.send("#{attr}=", val)
|
16
|
-
end
|
17
|
-
|
18
|
-
end
|
19
|
-
|
20
|
-
|
21
|
-
end
|