wukong 1.5.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'monkeyshines/monitor/periodic_monitor'
|
2
|
-
module Monkeyshines
|
3
|
-
module Monitor
|
4
|
-
module ChunkedStore
|
5
|
-
attr_accessor :file_pattern
|
6
|
-
def initialize file_pattern
|
7
|
-
self.file_pattern = file_pattern
|
8
|
-
super file_pattern.make
|
9
|
-
end
|
10
|
-
|
11
|
-
def close_and_reopen
|
12
|
-
close
|
13
|
-
self.filename = file_pattern.make
|
14
|
-
dump_file
|
15
|
-
end
|
16
|
-
|
17
|
-
def save *args
|
18
|
-
chunk_monitor.periodically{ close_rename_and_open }
|
19
|
-
super *args
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
module Monkeyshines
|
2
|
-
module Monitor
|
3
|
-
|
4
|
-
#
|
5
|
-
# Emits a log line but only every +iter_interval+ calls or +time_interval+
|
6
|
-
# lapse.
|
7
|
-
#
|
8
|
-
# Since the contents of the block aren't called until the criteria are met,
|
9
|
-
# you can put relatively expensive operations in the log without killing
|
10
|
-
# your iteration time.
|
11
|
-
#
|
12
|
-
class PeriodicLogger < PeriodicMonitor
|
13
|
-
#
|
14
|
-
# Call with a block that returns a string or array to log.
|
15
|
-
# If you return
|
16
|
-
#
|
17
|
-
# Ex: log if it has been at least 5 minutes since last announcement:
|
18
|
-
#
|
19
|
-
# periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
|
20
|
-
# loop do
|
21
|
-
# # ... stuff ...
|
22
|
-
# periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
|
23
|
-
# end
|
24
|
-
#
|
25
|
-
def periodically &block
|
26
|
-
super do
|
27
|
-
now = Time.now.utc.to_f
|
28
|
-
result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
|
29
|
-
Log.info result.join("\t")
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
@@ -1,70 +0,0 @@
|
|
1
|
-
module Wukong::Monitor
|
2
|
-
#
|
3
|
-
# Accepts a lightweight call every iteration.
|
4
|
-
#
|
5
|
-
# Once either a time or an iteration criterion is met, executes the block
|
6
|
-
# and resets the timer until next execution.
|
7
|
-
#
|
8
|
-
# Note that the +time_interval+ is measured *excution to execution* and not
|
9
|
-
# in multiples of iter_interval. Say I set a time_interval of 300s, and
|
10
|
-
# happen to iterate at 297s and 310s after start. Then the monitor will
|
11
|
-
# execute at 310s, and the next execution will happen on or after 610s.
|
12
|
-
#
|
13
|
-
# Also note that when *either* criterion is met, *both* criteria are
|
14
|
-
# reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
|
15
|
-
# and that at 250s I reach iteration 10_000. Then the monitor will execute
|
16
|
-
# on or after 20_000 iteration or 550s, whichever happens first.
|
17
|
-
#
|
18
|
-
class PeriodicMonitor
|
19
|
-
attr_accessor :time_interval, :iter_interval
|
20
|
-
attr_accessor :last_time, :current_iter, :iter, :started_at
|
21
|
-
|
22
|
-
def initialize options={}
|
23
|
-
self.started_at = Time.now.utc.to_f
|
24
|
-
self.last_time = started_at
|
25
|
-
self.iter = 0
|
26
|
-
self.current_iter = 0
|
27
|
-
self.time_interval = options[:time]
|
28
|
-
self.iter_interval = options[:iters]
|
29
|
-
end
|
30
|
-
|
31
|
-
# True if more than +iter_interval+ has elapsed since last execution.
|
32
|
-
def enough_iterations?
|
33
|
-
iter % iter_interval == 0 if iter_interval
|
34
|
-
end
|
35
|
-
|
36
|
-
# True if more than +time_interval+ has elapsed since last execution.
|
37
|
-
def enough_time? now
|
38
|
-
(now - last_time) > time_interval if time_interval
|
39
|
-
end
|
40
|
-
|
41
|
-
# Time since monitor was created
|
42
|
-
def since
|
43
|
-
Time.now.utc.to_f - started_at
|
44
|
-
end
|
45
|
-
# Overall iterations per second
|
46
|
-
def rate
|
47
|
-
iter.to_f / since.to_f
|
48
|
-
end
|
49
|
-
# "Instantaneous" iterations per second
|
50
|
-
def inst_rate now
|
51
|
-
current_iter.to_f / (now-last_time).to_f
|
52
|
-
end
|
53
|
-
|
54
|
-
#
|
55
|
-
# if the interval conditions are met, executes block; otherwise just does
|
56
|
-
# bookkeeping and returns.
|
57
|
-
#
|
58
|
-
def periodically &block
|
59
|
-
self.iter += 1
|
60
|
-
self.current_iter += 1
|
61
|
-
now = Time.now.utc.to_f
|
62
|
-
if enough_iterations? || enough_time?(now)
|
63
|
-
block.call(iter, (now-last_time))
|
64
|
-
self.last_time = now
|
65
|
-
self.current_iter = 0
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
end
|
data/lib/wukong/monitor.rb
DELETED
data/lib/wukong/rdf.rb
DELETED
@@ -1,104 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
#
|
3
|
-
# Dump wukong object as RDF triples:
|
4
|
-
#
|
5
|
-
# <key attr val module Wukong
|
6
|
-
#
|
7
|
-
# Dump wukong object as RDF triples:
|
8
|
-
#
|
9
|
-
# <key> <attr> <val> # <extra>
|
10
|
-
#
|
11
|
-
# Each element of the triple is XML encoded such that it contains no tab,
|
12
|
-
# newline or carriage returns, and the three are tab-separated. Any extra
|
13
|
-
# fields -- reification info, for instance -- are appended as a comment.
|
14
|
-
#
|
15
|
-
# This makes the result not only a valid RDF triple file but perfectly
|
16
|
-
# palatable to Wukong for further processing.
|
17
|
-
#
|
18
|
-
module Rdf
|
19
|
-
|
20
|
-
#
|
21
|
-
# RDF-formatted date
|
22
|
-
#
|
23
|
-
def self.encode_datetime dt
|
24
|
-
DateTime.parse_safely(dt).xmlschema
|
25
|
-
end
|
26
|
-
|
27
|
-
#
|
28
|
-
# Emit a component (subject or object) with the right semantic encoding
|
29
|
-
#
|
30
|
-
# Use :boolskip if a false property should just be left out.
|
31
|
-
#
|
32
|
-
def rdf_component val, type
|
33
|
-
case type
|
34
|
-
when :tweet then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
|
35
|
-
when :user then %Q{<http://twitter.com/users/show/#{val}.xml>}
|
36
|
-
when :bool then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
|
37
|
-
when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
|
38
|
-
when :int then %Q{"#{val.to_i}"^^<xsd:integer>}
|
39
|
-
when :date then %Q{"#{TwitterRdf.encode_datetime(val)}"^^<xsd:dateTime>}
|
40
|
-
when :str then %Q{"#{val}"}
|
41
|
-
else raise "Don't know how to encode #{type}"
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
#
|
46
|
-
# Express relationship (predicate) in RDF
|
47
|
-
#
|
48
|
-
def rdf_pred pred
|
49
|
-
case pred
|
50
|
-
when :created_at then %Q{<http://twitter.com/##{pred}>}
|
51
|
-
else %Q{<http://twitter.com/##{pred}>}
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
#
|
56
|
-
# RDF Triple string for the given (subject, object, predicate)
|
57
|
-
# http://www.w3.org/TR/rdf-testcases/#ntriples
|
58
|
-
#
|
59
|
-
def self.rdf_triple subj, pred, obj, comment=nil
|
60
|
-
comment = "\t# " + comment.to_s unless comment.blank?
|
61
|
-
%Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
|
62
|
-
end
|
63
|
-
|
64
|
-
def mutable?(attr)
|
65
|
-
false
|
66
|
-
end
|
67
|
-
|
68
|
-
#
|
69
|
-
# Extract [subject, predicate, object, (extra)] tuples.
|
70
|
-
#
|
71
|
-
# (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
|
72
|
-
#
|
73
|
-
def to_rdf3_tuples
|
74
|
-
members_with_types.map do |attr, type|
|
75
|
-
next if self[attr].blank?
|
76
|
-
subj = rdf_resource
|
77
|
-
pred = rdf_pred(attr)
|
78
|
-
obj = rdf_component(self[attr], type) or next
|
79
|
-
comment = scraped_at if mutable?(attr)
|
80
|
-
[subj, pred, obj, comment]
|
81
|
-
end.compact
|
82
|
-
end
|
83
|
-
|
84
|
-
#
|
85
|
-
# Convert an object to an rdf triple.
|
86
|
-
#
|
87
|
-
# Appends scraped at to #mutable? attributes
|
88
|
-
#
|
89
|
-
def to_rdf3
|
90
|
-
to_rdf3_tuples.map do |tuple|
|
91
|
-
self.class.rdf_triple tuple
|
92
|
-
end.join("\n")
|
93
|
-
end
|
94
|
-
|
95
|
-
end
|
96
|
-
end
|
97
|
-
>
|
98
|
-
#
|
99
|
-
#
|
100
|
-
module Rdf
|
101
|
-
def to_rdf
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
@@ -1,61 +0,0 @@
|
|
1
|
-
# Defines a base class for streaming data into a cassandra db connection.
|
2
|
-
require 'cassandra' ; include Cassandra::Constants
|
3
|
-
module Wukong
|
4
|
-
module Streamer
|
5
|
-
|
6
|
-
class CassandraStreamer < Wukong::Streamer::Base
|
7
|
-
attr_accessor :batch_count, :batch_record_count, :batch_size, :column_space, :db_seeds, :cassandra_db
|
8
|
-
|
9
|
-
def initialize *args
|
10
|
-
super *args
|
11
|
-
self.batch_count = 0
|
12
|
-
self.batch_record_count = 0
|
13
|
-
self.column_space ||= 'Twitter'
|
14
|
-
self.batch_size ||= 100
|
15
|
-
self.db_seeds ||= %w[10.244.191.178 10.243.19.223 10.243.17.219 10.245.70.85 10.244.206.241].map{ |s| s.to_s+':9160'}
|
16
|
-
self.cassandra_db ||= Cassandra.new(self.column_space, self.db_seeds)
|
17
|
-
end
|
18
|
-
|
19
|
-
def stream
|
20
|
-
while still_lines? do
|
21
|
-
start_batch do
|
22
|
-
while still_lines? && batch_not_full? do
|
23
|
-
line = get_line
|
24
|
-
record = recordize(line.chomp) or next
|
25
|
-
next if record.blank?
|
26
|
-
process(*record) do |output_record|
|
27
|
-
emit output_record
|
28
|
-
end
|
29
|
-
self.batch_record_count += 1
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def process *args, &blk
|
36
|
-
Raise "Overwrite this method to insert into cassandra db"
|
37
|
-
end
|
38
|
-
|
39
|
-
def start_batch &blk
|
40
|
-
self.batch_record_count = 0
|
41
|
-
self.batch_count += 1
|
42
|
-
self.cassandra_db.batch(&blk)
|
43
|
-
end
|
44
|
-
|
45
|
-
def get_line
|
46
|
-
$stdin.gets
|
47
|
-
end
|
48
|
-
|
49
|
-
def still_lines?
|
50
|
-
!$stdin.eof?
|
51
|
-
end
|
52
|
-
|
53
|
-
def batch_not_full?
|
54
|
-
self.batch_record_count < self.batch_size
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
end
|
61
|
-
|
@@ -1,30 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Streamer
|
3
|
-
#
|
4
|
-
# Emit each unique key and the count of its occurrences
|
5
|
-
#
|
6
|
-
class CountKeys < Wukong::Streamer::AccumulatingReducer
|
7
|
-
attr_accessor :key_count
|
8
|
-
|
9
|
-
def formatted_key_count
|
10
|
-
"%10d"%key_count.to_i
|
11
|
-
end
|
12
|
-
|
13
|
-
# reset the counter to zero
|
14
|
-
def start! *args
|
15
|
-
self.key_count = 0
|
16
|
-
end
|
17
|
-
|
18
|
-
# record one more for this key
|
19
|
-
def accumulate *vals
|
20
|
-
self.key_count += 1
|
21
|
-
end
|
22
|
-
|
23
|
-
# emit each key field and the count, tab-separated.
|
24
|
-
def finalize
|
25
|
-
yield [key, formatted_key_count]
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Streamer
|
3
|
-
#
|
4
|
-
# For each identical line in the map phase output, emit one representative
|
5
|
-
# line followed by the count of occrrences (separated by a tab).
|
6
|
-
#
|
7
|
-
# (This is the functional equivalent of +'uniq -c'+)
|
8
|
-
#
|
9
|
-
class CountLines < Wukong::Streamer::Base
|
10
|
-
def formatted_count item, key_count
|
11
|
-
"%s\t%10d" % [item, key_count.to_i]
|
12
|
-
end
|
13
|
-
|
14
|
-
#
|
15
|
-
# Delegate to +uniq -c+, but put the count last for idempotence.
|
16
|
-
#
|
17
|
-
def stream
|
18
|
-
%x{/usr/bin/uniq -c}.split("\n").each do |line|
|
19
|
-
key_count, item = line.chomp.strip.split(/\s+/, 2)
|
20
|
-
puts formatted_count(item, key_count)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
end
|
26
|
-
end
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module Streamer
|
3
|
-
module PreprocessWithPipeStreamer
|
4
|
-
#
|
5
|
-
# Runs STDIN through a shell command and then begins processing.
|
6
|
-
#
|
7
|
-
# If you don't need to do anything to the output of the command, just
|
8
|
-
# inherit from Wukong::Script and override the #map_command.
|
9
|
-
#
|
10
|
-
# You must provide a @preprocess_pipe_command@ method that returns a shell
|
11
|
-
# command to run the input through.
|
12
|
-
#
|
13
|
-
def stream
|
14
|
-
#
|
15
|
-
`#{preprocess_pipe_command}`.each do |line|
|
16
|
-
item = itemize(line) ; next if item.blank?
|
17
|
-
process(*item)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
data/lib/wukong/wukong_class.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
# require 'active_support/core_ext/class/inheritable_attributes.rb'
|
2
|
-
require 'extlib/class'
|
3
|
-
|
4
|
-
module Wukong
|
5
|
-
#
|
6
|
-
# Use to instrument an actual class to behave
|
7
|
-
#
|
8
|
-
module WukongClass
|
9
|
-
|
10
|
-
|
11
|
-
def [](attr)
|
12
|
-
self.send attr
|
13
|
-
end
|
14
|
-
def []=(attr, val)
|
15
|
-
self.send("#{attr}=", val)
|
16
|
-
end
|
17
|
-
|
18
|
-
end
|
19
|
-
|
20
|
-
|
21
|
-
end
|