wukong 1.4.10 → 1.4.11
- data/CHANGELOG.textile +11 -0
- data/bin/hdp-mkdirp +12 -0
- data/bin/hdp-rm +26 -5
- data/bin/hdp-sort +1 -1
- data/bin/hdp-stream +1 -1
- data/bin/hdp-stream-flat +1 -1
- data/examples/store/chunked_store_example.rb +18 -0
- data/lib/wukong/and_pig.rb +23 -0
- data/lib/wukong/monitor.rb +7 -0
- data/lib/wukong/monitor/chunked_store.rb +23 -0
- data/lib/wukong/monitor/periodic_logger.rb +34 -0
- data/lib/wukong/monitor/periodic_monitor.rb +72 -0
- data/lib/wukong/script/hadoop_command.rb +4 -0
- data/lib/wukong/store.rb +14 -0
- data/lib/wukong/store/base.rb +29 -0
- data/lib/wukong/store/chunked_flat_file_store.rb +37 -0
- data/lib/wukong/store/conditional_store.rb +57 -0
- data/lib/wukong/store/factory.rb +8 -0
- data/lib/wukong/store/flat_file_store.rb +90 -0
- data/lib/wukong/store/key_store.rb +51 -0
- data/lib/wukong/store/null_store.rb +15 -0
- data/lib/wukong/store/read_thru_store.rb +22 -0
- data/lib/wukong/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/wukong/store/tyrant_rdb_key_store.rb +57 -0
- data/lib/wukong/store/tyrant_tdb_key_store.rb +20 -0
- data/wukong.gemspec +37 -17
- metadata +39 -19
data/CHANGELOG.textile CHANGED

@@ -1,3 +1,14 @@
+h2. Wukong v1.4.11 2010-07-30
+
+* added the @max_(maps|reduces)_per_(node|cluster)@ jobconfs.
+* added jobconfs for io_job_mb and friends.
+* added a loadable module to convert output data to pig bags and tuples
+* pulled in several methods from active_support, incl. Enumerable#sum
+* Scripts to find percentile rank of elements in a dataset
+* We are starting to move wukong to a model where streaming is from a generic
+  source into a generic sink. Several stores have been landed in the code, but
+  many are in a half- or un-baked state. Please ignore this for the moment.
+
 h2. Wukong v1.4.8 2010-06-05
 
 * made scripts inject a helpful job name using mapred.job.name
data/bin/hdp-mkdirp ADDED

@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+#
+# Despite arguments and deliberation, this IS a necessary script. Azkaban, should you choose to
+# use it, will fail if (it seems) ANY of its spawned subprocesses fails
+#
+
+hadoop fs -test -e "$@"
+if [ "$?" != "0" ] ; then
+  # echo "File does not exist, making..."
+  exec hadoop fs -mkdir "$@"
+fi
data/bin/hdp-rm CHANGED

@@ -1,11 +1,32 @@
 #!/usr/bin/env bash
 
+#
+# Documentation for hadoop fs -rmr says "acts just like Unix rm -rf command". If this
+# is true then we need to ignore directories that don't exist and still return 0.
+#
+
+#
+# All the dirty conditional logic here does is test whether a directory exists. If so, remove it
+#
 if [ "$1" == "-r" ] ; then
+  shift
+  if [ "$1" == "-skipTrash" ] ; then
   shift
-  action=rmr
+    hadoop fs -test -e "$@"
+    if [ "$?" == "0" ] ; then
+      # echo "File exists, skipping trash, removing it..."
+      echo hadoop dfs -rmr "$@"
+      exec hadoop dfs -rmr "$@"
+    fi
+  else
+    hadoop fs -test -e "$@"
+    if [ "$?" == "0" ] ; then
+      # echo "File exists, removing it..."
+      echo hadoop dfs -rmr "$@"
+      exec hadoop dfs -rmr "$@"
+    fi
+  fi
 else
-  action=rm
+  echo hadoop dfs -rm "$@"
+  exec hadoop dfs -rm "$@"
 fi
-echo hadoop dfs -$action "$@"
-# read -p "Hit ctrl-C to abort or enter to do this...."
-exec hadoop dfs -$action "$@"
data/bin/hdp-sort CHANGED

@@ -13,7 +13,7 @@ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/
 HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
 cmd="${HADOOP_HOME}/bin/hadoop \
-  jar ${HADOOP_HOME}/contrib/streaming/hadoop
+  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
   -jobconf num.key.fields.for.partition=\"$partfields\"
   -jobconf stream.num.map.output.key.fields=\"$sortfields\"
data/bin/hdp-stream CHANGED

@@ -13,7 +13,7 @@ if [ "$output_file" == "" ] ; then echo "$0 input_file output_file [mapper=/bin/
 HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 
 cmd="${HADOOP_HOME}/bin/hadoop \
-  jar ${HADOOP_HOME}/contrib/streaming/hadoop
+  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar
   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
   -jobconf num.key.fields.for.partition=\"$partfields\"
   -jobconf stream.num.map.output.key.fields=\"$sortfields\"
data/bin/hdp-stream-flat CHANGED

@@ -14,7 +14,7 @@ HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
 # -jobconf mapred.reduce.tasks=3 \
 
 exec ${HADOOP_HOME}/bin/hadoop \
-  jar ${HADOOP_HOME}/contrib/streaming/hadoop
+  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
   "$@" \
   -jobconf "mapred.job.name=`basename $0`-$map_script-$input_file-$output_file" \
   -mapper "$map_script" \
data/examples/store/chunked_store_example.rb ADDED

@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'wukong'
+# require 'wukong/store'
+
+require 'configliere'
+Configliere.use :commandline, :define, :config_file
+Settings.read('foo.yaml')
+
+# store = ChunkedFlatFileStore.new(Settings)
+
+100.times do |iter|
+  # store.save [iter, Time.now.to_flat].join("\t")
+  $stdout.puts [iter, Time.now.to_flat].join("\t")
+  sleep 2
+end
+
+
data/lib/wukong/and_pig.rb ADDED

@@ -0,0 +1,23 @@
+module Enumerable
+  #
+  # Convert an array of values to a string representing it as a pig tuple
+  #
+  def to_pig_tuple
+    map{|*vals| '(' + vals.join(',') + ')' }
+  end
+
+  #
+  # Convert an array of values to a string pig format
+  # Delegates to to_pig_tuple -- see also to_pig_bag
+  #
+  def to_pig *args
+    to_pig_tuple *args
+  end
+
+  #
+  # Convert an array of values to a string representing it as a pig bag
+  #
+  def to_pig_bag
+    '{' + self.join(',') + '}'
+  end
+end
data/lib/wukong/monitor/chunked_store.rb ADDED

@@ -0,0 +1,23 @@
+require 'monkeyshines/monitor/periodic_monitor'
+module Monkeyshines
+  module Monitor
+    module ChunkedStore
+      attr_accessor :file_pattern
+      def initialize file_pattern
+        self.file_pattern = file_pattern
+        super file_pattern.make
+      end
+
+      def close_and_reopen
+        close
+        self.filename = file_pattern.make
+        dump_file
+      end
+
+      def save *args
+        chunk_monitor.periodically{ close_rename_and_open }
+        super *args
+      end
+    end
+  end
+end
data/lib/wukong/monitor/periodic_logger.rb ADDED

@@ -0,0 +1,34 @@
+module Monkeyshines
+  module Monitor
+
+    #
+    # Emits a log line but only every +iter_interval+ calls or +time_interval+
+    # lapse.
+    #
+    # Since the contents of the block aren't called until the criteria are met,
+    # you can put relatively expensive operations in the log without killing
+    # your iteration time.
+    #
+    class PeriodicLogger < PeriodicMonitor
+      #
+      # Call with a block that returns a string or array to log.
+      # If you return
+      #
+      # Ex: log if it has been at least 5 minutes since last announcement:
+      #
+      #   periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
+      #   loop do
+      #     # ... stuff ...
+      #     periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
+      #   end
+      #
+      def periodically &block
+        super do
+          now = Time.now.utc.to_f
+          result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
+          Log.info result.join("\t")
+        end
+      end
+    end
+  end
+end
data/lib/wukong/monitor/periodic_monitor.rb ADDED

@@ -0,0 +1,72 @@
+module Monkeyshines
+  module Monitor
+    #
+    # Accepts a lightweight call every iteration.
+    #
+    # Once either a time or an iteration criterion is met, executes the block
+    # and resets the timer until next execution.
+    #
+    # Note that the +time_interval+ is measured *execution to execution* and not
+    # in multiples of iter_interval. Say I set a time_interval of 300s, and
+    # happen to iterate at 297s and 310s after start. Then the monitor will
+    # execute at 310s, and the next execution will happen on or after 610s.
+    #
+    # Also note that when *either* criterion is met, *both* criteria are
+    # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
+    # and that at 250s I reach iteration 10_000. Then the monitor will execute
+    # on or after 20_000 iteration or 550s, whichever happens first.
+    #
+    class PeriodicMonitor
+      attr_accessor :time_interval, :iter_interval
+      attr_accessor :last_time, :current_iter, :iter, :started_at
+
+      def initialize options={}
+        self.started_at = Time.now.utc.to_f
+        self.last_time = started_at
+        self.iter = 0
+        self.current_iter = 0
+        self.time_interval = options[:time]
+        self.iter_interval = options[:iters]
+      end
+
+      # True if more than +iter_interval+ has elapsed since last execution.
+      def enough_iterations?
+        iter % iter_interval == 0 if iter_interval
+      end
+
+      # True if more than +time_interval+ has elapsed since last execution.
+      def enough_time? now
+        (now - last_time) > time_interval if time_interval
+      end
+
+      # Time since monitor was created
+      def since
+        Time.now.utc.to_f - started_at
+      end
+      # Overall iterations per second
+      def rate
+        iter.to_f / since.to_f
+      end
+      # "Instantaneous" iterations per second
+      def inst_rate now
+        current_iter.to_f / (now-last_time).to_f
+      end
+
+      #
+      # if the interval conditions are met, executes block; otherwise just does
+      # bookkeeping and returns.
+      #
+      def periodically &block
+        self.iter += 1
+        self.current_iter += 1
+        now = Time.now.utc.to_f
+        if enough_iterations? || enough_time?(now)
+          block.call(iter, (now-last_time))
+          self.last_time = now
+          self.current_iter = 0
+        end
+      end
+    end
+
+  end
+end
data/lib/wukong/script/hadoop_command.rb CHANGED

@@ -32,6 +32,10 @@ module Wukong
     Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
     Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
     # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
+    Settings.define :max_reduces_per_node,    :jobconf => true, :description => 'mapred.max.reduces.per.node', :wukong => true
+    Settings.define :max_reduces_per_cluster, :jobconf => true, :description => 'mapred.max.reduces.per.cluster', :wukong => true
+    Settings.define :max_maps_per_node,       :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
+    Settings.define :max_maps_per_cluster,    :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
 
     # emit a -jobconf hadoop option if the simplified command line arg is present
     # if not, the resulting nil will be elided later
data/lib/wukong/store.rb ADDED

@@ -0,0 +1,14 @@
+module Monkeyshines
+  module Store
+    extend FactoryModule
+    autoload :Base,                 'monkeyshines/store/base'
+    autoload :FlatFileStore,        'monkeyshines/store/flat_file_store'
+    autoload :ConditionalStore,     'monkeyshines/store/conditional_store'
+    autoload :ChunkedFlatFileStore, 'monkeyshines/store/chunked_flat_file_store'
+    autoload :KeyStore,             'monkeyshines/store/key_store'
+    autoload :TokyoTdbKeyStore,     'monkeyshines/store/tokyo_tdb_key_store'
+    autoload :TyrantTdbKeyStore,    'monkeyshines/store/tyrant_tdb_key_store'
+    autoload :TyrantRdbKeyStore,    'monkeyshines/store/tyrant_rdb_key_store'
+    autoload :ReadThruStore,        'monkeyshines/store/read_thru_store'
+  end
+end
data/lib/wukong/store/base.rb ADDED

@@ -0,0 +1,29 @@
+module Monkeyshines
+  module Store
+    class Base
+      attr_accessor :options
+      def initialize _options={}
+        self.options = _options
+        Log.info "Creating #{self.class}"
+      end
+
+      #
+      def each_as klass, &block
+        self.each do |*args|
+          begin
+            item = klass.new *args[1..-1]
+          rescue Exception => e
+            Log.info [args, e.to_s, self].join("\t")
+            raise e
+          end
+          yield item
+        end
+      end
+
+      def log_line
+        nil
+      end
+
+    end
+  end
+end
data/lib/wukong/store/chunked_flat_file_store.rb ADDED

@@ -0,0 +1,37 @@
+module Monkeyshines
+  module Store
+    class ChunkedFlatFileStore < Monkeyshines::Store::FlatFileStore
+      attr_accessor :filename_pattern, :chunk_monitor, :handle
+
+      DEFAULT_OPTIONS = {
+        :chunktime => 4*60*60, # default 4 hours
+        :pattern   => ":rootdir/:date/:handle+:timestamp-:pid.tsv",
+        :rootdir   => nil,
+        :filemode  => 'w',
+      }
+
+      def initialize _options
+        self.options = DEFAULT_OPTIONS.deep_merge(_options)
+        raise "You don't really want a chunk time this small: #{options[:chunktime]}" unless options[:chunktime] > 600
+        self.chunk_monitor = Monkeyshines::Monitor::PeriodicMonitor.new( :time => options[:chunktime] )
+        self.handle = options[:handle] || Monkeyshines::CONFIG[:handle]
+        self.filename_pattern = Monkeyshines::Utils::FilenamePattern.new(options[:pattern], :handle => handle, :rootdir => options[:rootdir])
+        super options.merge(:filename => filename_pattern.make())
+        self.mkdir!
+      end
+
+      def save *args
+        result = super *args
+        chunk_monitor.periodically do
+          new_filename = filename_pattern.make()
+          Log.info "Rotating chunked file #{filename} into #{new_filename}"
+          self.close
+          @filename = new_filename
+          self.mkdir!
+        end
+        result
+      end
+
+    end
+  end
+end
data/lib/wukong/store/conditional_store.rb ADDED

@@ -0,0 +1,57 @@
+module Monkeyshines
+  module Store
+    class ConditionalStore < Monkeyshines::Store::Base
+      attr_accessor :options, :cache, :store, :misses
+
+      DEFAULT_OPTIONS = {
+        :cache => { :type => :tyrant_rdb_key_store },
+        :store => { :type => :chunked_flat_file_store },
+      }
+
+      #
+      #
+      # +cache+ must behave like a hash (Hash and
+      # Monkeyshines::Store::TyrantRdbKeyStore are both cromulent
+      # choices).
+      #
+      #
+      #
+      def initialize _options
+        self.options = DEFAULT_OPTIONS.deep_merge(_options)
+        self.cache   = Monkeyshines::Store.create(options[:cache])
+        self.store   = Monkeyshines::Store.create(options[:store])
+        self.misses  = 0
+      end
+
+      #
+      # If key is absent, save the result of calling the block.
+      # If key is present, block is never called.
+      #
+      # Ex:
+      #   rt_store.set(url) do
+      #     fetcher.get url  # will only be called if url isn't in rt_store
+      #   end
+      #
+      def set key, force=nil, &block
+        return if (!force) && cache.include?(key)
+        cache_val, store_val = block.call()
+        return unless cache_val
+        cache.set_nr key, cache_val   # update cache
+        store << store_val            # save value
+        self.misses += 1              # track the cache miss
+        store_val
+      end
+
+      def size() cache.size end
+
+      def log_line
+        [size, "%8d misses"%misses]
+      end
+
+      def close()
+        cache.close
+        store.close
+      end
+    end
+  end
+end
data/lib/wukong/store/flat_file_store.rb ADDED

@@ -0,0 +1,90 @@
+require 'fileutils'; include FileUtils
+
+module Monkeyshines
+  module Store
+    #
+    class FlatFileStore < Store::Base
+      attr_accessor :filename, :filemode
+
+      #
+      # +filename_root+ : first part of name for files
+      #
+      def initialize options={}
+        Log.debug "New #{self.class} as #{options.inspect}"
+        self.filename = options[:filename] or raise "Missing filename in #{self.class}"
+        self.filemode = options[:filemode] || 'r'
+        skip!(options[:skip]) if options[:skip]
+      end
+
+      #
+      #
+      #
+      def each &block
+        file.each do |line|
+          next if line[0..0] == '#'
+          attrs = line.chomp.split("\t")
+          next if attrs.blank?
+          yield *attrs
+        end
+      end
+
+      #
+      # Read ahead n_lines lines in the file
+      #
+      def skip! n_lines
+        Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
+        n_lines.times do
+          file.readline
+        end
+      end
+
+      #
+      # Open the timestamped file,
+      # ensuring its directory exists
+      #
+      def file
+        return @file if @file
+        Log.info "Opening file #{filename} with mode #{filemode}"
+        @file = File.open(filename, filemode)
+      end
+
+      # Close the dump file
+      def close
+        @file.close if @file
+        @file = nil
+      end
+
+      # Ensure the file's directory exists
+      def mkdir!
+        dir = File.dirname(filename)
+        return if File.directory?(dir)
+        Log.info "Making directory #{dir}"
+        FileUtils.mkdir_p dir
+      end
+
+      # write to the file
+      def save obj
+        file << obj.to_flat.join("\t")+"\n"
+        obj
+      end
+
+      # returns the size of the current file
+      def size
+        return 0 if !@file
+        File.size(filename)
+      end
+
+      def set key, *args, &block
+        tok, obj = block.call
+        save obj
+      end
+
+      # delegates to +#save+ -- writes the object to the file
+      def <<(obj)
+        save obj
+      end
+
+    end
+  end
+end
+
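A usage sketch for FlatFileStore. The require paths follow the gemspec listing, and it is assumed that require 'wukong' supplies the top-level Log the store calls into; anything whose #to_flat returns an array of fields can be saved. Per the changelog, these stores are still half-baked, so treat this as illustrative only.

  require 'rubygems'
  require 'wukong'                        # assumed to provide the top-level Log
  require 'wukong/store/base'
  require 'wukong/store/flat_file_store'

  # anything that responds to #to_flat (returning an array of fields) can be saved
  Tweet = Struct.new(:id, :screen_name, :text)
  class Tweet
    def to_flat() to_a end
  end

  store = Monkeyshines::Store::FlatFileStore.new(:filename => '/tmp/tweets.tsv', :filemode => 'w')
  store << Tweet.new(1, 'mrflip', 'hello hadoop')
  store.save Tweet.new(2, 'infochimps', 'goodbye sql')
  store.close
  # re-opening with the default 'r' filemode, store.each yields each line's tab-separated fields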
data/lib/wukong/store/key_store.rb ADDED

@@ -0,0 +1,51 @@
+module Monkeyshines
+  module Store
+    class KeyStore < Monkeyshines::Store::Base
+      # The actual backing store; should respond to #set and #get methods
+      attr_accessor :db
+
+      #
+      # Executes block once for each element in the whole DB, in whatever order
+      # the DB thinks you should see it.
+      #
+      # Your block will see |key, val|
+      #
+      #   key_store.each do |key, val|
+      #     # ... stuff ...
+      #   end
+      #
+      def each &block
+        db.iterinit
+        loop do
+          key = db.iternext or break
+          val = db[key]
+          yield key, val
+        end
+      end
+
+
+      # Save the value into the database
+      def set(key, val)
+        return unless val
+        db[key] = val
+      end
+
+      alias_method :save, :set
+      def get(key)   db[key]  end
+      def [](key)    db[key]  end
+      def close()    db.close end
+      def size()     db.size  end
+
+      #
+      # Load from standard command-line options
+      #
+      # obvs only works when there's just one store
+      #
+      def self.new_from_command_line cmdline_opts, default_opts={}
+        options = default_opts.merge(cmdline_opts)
+        store = self.new(options[:store_db])
+        store
+      end
+    end
+  end
+end
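KeyStore never creates db itself, so a concrete subclass only has to put something hash-like behind that accessor. A toy in-memory subclass to show the contract (HashKeyStore is invented for this sketch, and require 'wukong' is assumed to provide the Log that Base#initialize uses):

  require 'rubygems'
  require 'wukong'                 # assumed to provide the top-level Log
  require 'wukong/store/base'
  require 'wukong/store/key_store'

  # Toy backing store: a plain Hash gives us #[], #[]= and #size.
  # (#each is skipped here -- it expects TokyoCabinet's iterinit/iternext cursor.)
  class HashKeyStore < Monkeyshines::Store::KeyStore
    def initialize options={}
      self.db = {}
      super options
    end
    def close() end                # a Hash has nothing to close
  end

  store = HashKeyStore.new
  store.set 'http://example.com', 'fetched 20100730'
  puts store['http://example.com']   # => fetched 20100730
  puts store.size                    # => 1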
data/lib/wukong/store/read_thru_store.rb ADDED

@@ -0,0 +1,22 @@
+module Monkeyshines
+  module Store
+    class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore
+
+      #
+      # If key is absent, save the result of calling the block.
+      # If key is present, block is never called.
+      #
+      # Ex:
+      #   rt_store.set(url) do
+      #     fetcher.get url  # will only be called if url isn't in rt_store
+      #   end
+      #
+      def set key, force=nil, &block
+        return if !force && db.has_key?(key)
+        result = block.call() or return
+        super(key, result)
+      end
+
+    end
+  end
+end
data/lib/wukong/store/tokyo_tdb_key_store.rb ADDED

@@ -0,0 +1,33 @@
+require 'tokyocabinet'
+module Monkeyshines
+  module Store
+    #
+    # Implementation of KeyStore with a Local TokyoCabinet table database (TDB)
+    #
+    class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore
+
+      # pass in the filename or URI of a tokyo cabinet table-style DB
+      # set create_db = true if you want to create a missing DB file
+      def initialize db_uri, *args
+        self.db = TokyoCabinet::TDB.new
+        db.open(db_uri, TokyoCabinet::TDB::OWRITER) or raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}"
+        super *args
+      end
+
+
+      def each_as klass, &block
+        self.each do |key, hsh|
+          yield klass.from_hash hsh
+        end
+      end
+      # Delegate to store
+      def set(key, val)
+        return unless val
+        db.put key, val.to_hash.compact
+      end
+
+      def size() db.rnum end
+
+    end #class
+  end
+end
data/lib/wukong/store/tyrant_rdb_key_store.rb ADDED

@@ -0,0 +1,57 @@
+require 'tokyotyrant'
+module Monkeyshines
+  module Store
+
+    #
+    # Implementation of KeyStore with a Local TokyoCabinet hash database (RDB)
+    #
+    class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
+      attr_accessor :db_host, :db_port
+
+      # pass in the host:port uri of the key store.
+      def initialize options
+        raise "URI for #{self.class} is required" if options[:uri].blank?
+        self.db_host, self.db_port = options[:uri].to_s.split(':')
+        self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
+        super options
+      end
+
+      def db
+        return @db if @db
+        @db ||= TokyoTyrant::RDB.new
+        @db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
+        @db
+      end
+
+      def close
+        @db.close if @db
+        @db = nil
+      end
+
+      # Save the value into the database without waiting for a response.
+      def set_nr(key, val)
+        db.putnr key, val if val
+      end
+
+      def size() db.rnum end
+      def include? *args
+        db.has_key? *args
+      end
+
+      # require 'memcache'
+      # def initialize db_uri=nil, *args
+      #   # db_uri ||= ':1978'
+      #   # self.db_host, self.db_port = db_uri.split(':')
+      #   self.db = MemCache.new(db_uri, :no_reply => true)
+      #   if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
+      #   super *args
+      # end
+      #
+      # def size
+      #   db.stats
+      # end
+
+    end #class
+  end
+end
+
data/lib/wukong/store/tyrant_tdb_key_store.rb ADDED

@@ -0,0 +1,20 @@
+require 'tokyotyrant'
+require 'tyrant_rdb_key_store'
+module Monkeyshines
+  module Store
+    #
+    # Implementation of KeyStore with a Local TokyoCabinet Table database (RDBTBL)
+    #
+    class TyrantTdbKeyStore < TyrantRdbKeyStore
+
+      def db
+        return @db if @db
+        @db ||= TokyoTyrant::RDBTBL.new
+        @db.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
+        @db
+      end
+
+    end #class
+  end
+end
+
data/wukong.gemspec CHANGED

@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{wukong}
-  s.version = "1.4.10"
+  s.version = "1.4.11"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Philip (flip) Kromer"]
-  s.date = %q{2010-07-
+  s.date = %q{2010-07-30}
   s.description = %q{ Treat your dataset like a:
 
     * stream of lines when it's efficient to process by lines
@@ -42,6 +42,7 @@ Gem::Specification.new do |s|
     "bin/hdp-kill-task",
     "bin/hdp-ls",
     "bin/hdp-mkdir",
+    "bin/hdp-mkdirp",
     "bin/hdp-mv",
     "bin/hdp-parts_to_keys.rb",
     "bin/hdp-ps",
@@ -151,9 +152,11 @@ Gem::Specification.new do |s|
     "examples/size.rb",
     "examples/stats/avg_value_frequency.rb",
     "examples/stats/data/avg_value_frequency.tsv",
+    "examples/store/chunked_store_example.rb",
     "examples/stupidly_simple_filter.rb",
     "examples/word_count.rb",
     "lib/wukong.rb",
+    "lib/wukong/and_pig.rb",
     "lib/wukong/bad_record.rb",
     "lib/wukong/datatypes.rb",
     "lib/wukong/datatypes/enum.rb",
@@ -182,12 +185,28 @@ Gem::Specification.new do |s|
     "lib/wukong/keystore/tyrant_notes.textile",
     "lib/wukong/logger.rb",
     "lib/wukong/models/graph.rb",
+    "lib/wukong/monitor.rb",
+    "lib/wukong/monitor/chunked_store.rb",
+    "lib/wukong/monitor/periodic_logger.rb",
+    "lib/wukong/monitor/periodic_monitor.rb",
     "lib/wukong/periodic_monitor.rb",
     "lib/wukong/rdf.rb",
     "lib/wukong/schema.rb",
     "lib/wukong/script.rb",
     "lib/wukong/script/hadoop_command.rb",
     "lib/wukong/script/local_command.rb",
+    "lib/wukong/store.rb",
+    "lib/wukong/store/base.rb",
+    "lib/wukong/store/chunked_flat_file_store.rb",
+    "lib/wukong/store/conditional_store.rb",
+    "lib/wukong/store/factory.rb",
+    "lib/wukong/store/flat_file_store.rb",
+    "lib/wukong/store/key_store.rb",
+    "lib/wukong/store/null_store.rb",
+    "lib/wukong/store/read_thru_store.rb",
+    "lib/wukong/store/tokyo_tdb_key_store.rb",
+    "lib/wukong/store/tyrant_rdb_key_store.rb",
+    "lib/wukong/store/tyrant_tdb_key_store.rb",
     "lib/wukong/streamer.rb",
     "lib/wukong/streamer/accumulating_reducer.rb",
     "lib/wukong/streamer/base.rb",
@@ -226,30 +245,31 @@ Gem::Specification.new do |s|
     "spec/spec_helper.rb",
     "spec/wukong/encoding_spec.rb",
     "spec/wukong/script_spec.rb",
+    "examples/binning_percentile_estimator.rb",
+    "examples/contrib/jeans/normalize.rb",
+    "examples/contrib/jeans/sizes.rb",
+    "examples/corpus/words_to_bigrams.rb",
+    "examples/count_keys.rb",
+    "examples/count_keys_at_mapper.rb",
+    "examples/keystore/cassandra_batch_test.rb",
+    "examples/keystore/conditional_outputter_example.rb",
+    "examples/network_graph/adjacency_list.rb",
+    "examples/network_graph/breadth_first_search.rb",
+    "examples/network_graph/gen_2paths.rb",
+    "examples/network_graph/gen_multi_edge.rb",
+    "examples/network_graph/gen_symmetric_links.rb",
     "examples/pagerank/pagerank.rb",
     "examples/pagerank/pagerank_initialize.rb",
+    "examples/rank_and_bin.rb",
     "examples/sample_records.rb",
     "examples/server_logs/apache_log_parser.rb",
     "examples/server_logs/breadcrumbs.rb",
     "examples/server_logs/user_agent.rb",
-    "examples/corpus/words_to_bigrams.rb",
-    "examples/count_keys.rb",
-    "examples/rank_and_bin.rb",
-    "examples/binning_percentile_estimator.rb",
     "examples/size.rb",
-    "examples/network_graph/breadth_first_search.rb",
-    "examples/network_graph/gen_symmetric_links.rb",
-    "examples/network_graph/gen_multi_edge.rb",
-    "examples/network_graph/adjacency_list.rb",
-    "examples/network_graph/gen_2paths.rb",
-    "examples/keystore/cassandra_batch_test.rb",
-    "examples/keystore/conditional_outputter_example.rb",
     "examples/stats/avg_value_frequency.rb",
-    "examples/
-    "examples/contrib/jeans/normalize.rb",
-    "examples/word_count.rb",
+    "examples/store/chunked_store_example.rb",
     "examples/stupidly_simple_filter.rb",
-    "examples/
+    "examples/word_count.rb"
   ]
 
 if s.respond_to? :specification_version then
metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: wukong
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 17
   prerelease: false
   segments:
   - 1
   - 4
-  - 10
-  version: 1.4.10
+  - 11
+  version: 1.4.11
 platform: ruby
 authors:
 - Philip (flip) Kromer
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-07-
+date: 2010-07-30 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -136,6 +136,7 @@ files:
 - bin/hdp-kill-task
 - bin/hdp-ls
 - bin/hdp-mkdir
+- bin/hdp-mkdirp
 - bin/hdp-mv
 - bin/hdp-parts_to_keys.rb
 - bin/hdp-ps
@@ -245,9 +246,11 @@ files:
 - examples/size.rb
 - examples/stats/avg_value_frequency.rb
 - examples/stats/data/avg_value_frequency.tsv
+- examples/store/chunked_store_example.rb
 - examples/stupidly_simple_filter.rb
 - examples/word_count.rb
 - lib/wukong.rb
+- lib/wukong/and_pig.rb
 - lib/wukong/bad_record.rb
 - lib/wukong/datatypes.rb
 - lib/wukong/datatypes/enum.rb
@@ -276,12 +279,28 @@ files:
 - lib/wukong/keystore/tyrant_notes.textile
 - lib/wukong/logger.rb
 - lib/wukong/models/graph.rb
+- lib/wukong/monitor.rb
+- lib/wukong/monitor/chunked_store.rb
+- lib/wukong/monitor/periodic_logger.rb
+- lib/wukong/monitor/periodic_monitor.rb
 - lib/wukong/periodic_monitor.rb
 - lib/wukong/rdf.rb
 - lib/wukong/schema.rb
 - lib/wukong/script.rb
 - lib/wukong/script/hadoop_command.rb
 - lib/wukong/script/local_command.rb
+- lib/wukong/store.rb
+- lib/wukong/store/base.rb
+- lib/wukong/store/chunked_flat_file_store.rb
+- lib/wukong/store/conditional_store.rb
+- lib/wukong/store/factory.rb
+- lib/wukong/store/flat_file_store.rb
+- lib/wukong/store/key_store.rb
+- lib/wukong/store/null_store.rb
+- lib/wukong/store/read_thru_store.rb
+- lib/wukong/store/tokyo_tdb_key_store.rb
+- lib/wukong/store/tyrant_rdb_key_store.rb
+- lib/wukong/store/tyrant_tdb_key_store.rb
 - lib/wukong/streamer.rb
 - lib/wukong/streamer/accumulating_reducer.rb
 - lib/wukong/streamer/base.rb
@@ -348,27 +367,28 @@ test_files:
 - spec/spec_helper.rb
 - spec/wukong/encoding_spec.rb
 - spec/wukong/script_spec.rb
+- examples/binning_percentile_estimator.rb
+- examples/contrib/jeans/normalize.rb
+- examples/contrib/jeans/sizes.rb
+- examples/corpus/words_to_bigrams.rb
+- examples/count_keys.rb
+- examples/count_keys_at_mapper.rb
+- examples/keystore/cassandra_batch_test.rb
+- examples/keystore/conditional_outputter_example.rb
+- examples/network_graph/adjacency_list.rb
+- examples/network_graph/breadth_first_search.rb
+- examples/network_graph/gen_2paths.rb
+- examples/network_graph/gen_multi_edge.rb
+- examples/network_graph/gen_symmetric_links.rb
 - examples/pagerank/pagerank.rb
 - examples/pagerank/pagerank_initialize.rb
+- examples/rank_and_bin.rb
 - examples/sample_records.rb
 - examples/server_logs/apache_log_parser.rb
 - examples/server_logs/breadcrumbs.rb
 - examples/server_logs/user_agent.rb
-- examples/corpus/words_to_bigrams.rb
-- examples/count_keys.rb
-- examples/rank_and_bin.rb
-- examples/binning_percentile_estimator.rb
 - examples/size.rb
-- examples/network_graph/breadth_first_search.rb
-- examples/network_graph/gen_symmetric_links.rb
-- examples/network_graph/gen_multi_edge.rb
-- examples/network_graph/adjacency_list.rb
-- examples/network_graph/gen_2paths.rb
-- examples/keystore/cassandra_batch_test.rb
-- examples/keystore/conditional_outputter_example.rb
 - examples/stats/avg_value_frequency.rb
-- examples/
-- examples/contrib/jeans/normalize.rb
-- examples/word_count.rb
+- examples/store/chunked_store_example.rb
 - examples/stupidly_simple_filter.rb
-- examples/
+- examples/word_count.rb