wukong 1.4.9 → 1.4.10
Sign up to get free protection for your applications and to get access to all the features.
- data/TODO.textile +13 -0
- data/bin/hdp-bzip +3 -3
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-mkdir +0 -1
- data/bin/hdp-put +1 -1
- data/bin/hdp-sort +5 -17
- data/bin/hdp-stream +5 -17
- data/bin/hdp-stream-flat +5 -5
- data/bin/wu-sum +1 -0
- data/docpages/README-performance.textile +90 -0
- data/examples/binning_percentile_estimator.rb +142 -0
- data/examples/corpus/words_to_bigrams.rb +52 -0
- data/examples/keystore/cassandra_batch_test.rb +41 -0
- data/examples/network_graph/gen_multi_edge.rb +3 -2
- data/examples/sample_records.rb +1 -0
- data/lib/wukong/extensions/date_time.rb +4 -3
- data/lib/wukong/extensions/enumerable.rb +79 -0
- data/lib/wukong/extensions.rb +1 -0
- data/lib/wukong/keystore/redis_db.rb +24 -0
- data/lib/wukong/keystore/tyrant_db.rb +124 -0
- data/lib/wukong/keystore/tyrant_notes.textile +145 -0
- data/lib/wukong/periodic_monitor.rb +57 -0
- data/lib/wukong/script/hadoop_command.rb +3 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +1 -0
- data/lib/wukong/streamer/cassandra_streamer.rb +61 -0
- data/lib/wukong/streamer.rb +12 -10
- data/wukong.gemspec +34 -16
- metadata +60 -16
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems' ;
|
3
|
+
require 'redis' ;
|
4
|
+
|
5
|
+
#
# Bulk-loads tab-separated user records from stdin into a Redis server on
# localhost:6379, writing three keys per record:
#
#   sn:<lowercased screen name>  => id
#   sid:<sid>                    => id
#   uid:<id>                     => "sn,sid,crat,scat"
#
# A progress line (screen name, fo, records seen, elapsed seconds, records/sec)
# is printed to stdout every 10,000 records; loading stops after 20,000,000.
#
RDB = Redis.new(:host => 'localhost', :port => 6379)

max_records  = 20_000_000
report_every = 10_000
start_time   = Time.now.utc.to_f
iter         = 0

$stdin.each do |line|
  # A limit of -1 keeps trailing empty fields. Without it split() drops them,
  # so a record whose last columns are blank would yield nil (not "") for sid
  # and crash on sid.empty?. Fields are still nil when the line is short, hence
  # the explicit nil checks below.
  _r, id, scat, sn, _pr, fo, _fr, _st, _fv, crat, sid, _full =
    line.chomp.split("\t", -1)
  iter += 1
  break if iter > max_records

  if (iter % report_every).zero?
    elapsed = Time.now.utc.to_f - start_time
    # fo.to_i so a missing or non-numeric column cannot abort the load from
    # inside a purely informational progress line.
    puts "%-20s\t%7d\t%7d\t%7.2f\t%7.2f" % [sn, fo.to_i, iter, elapsed, iter.to_f / elapsed]
  end

  RDB['sn:' + sn.downcase] = id if sn && !sn.empty?
  RDB['sid:' + sid] = id if sid && !sid.empty?
  RDB['uid:' + id] = [sn, sid, crat, scat].join(',') if id && !id.empty?
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'tokyo_tyrant'
|
2
|
+
require 'tokyo_tyrant/balancer'
|
3
|
+
|
4
|
+
# -- Installing
|
5
|
+
# make sure tokyocabinet and tokyotyrant are installed (chef recipe)
|
6
|
+
# make sure ruby-tokyotyrant is installed
|
7
|
+
# ldconfig
|
8
|
+
# mkdir -p /data/db/ttyrant /var/run/tyrant /var/log/tyrant
|
9
|
+
#
|
10
|
+
# -- Starting
|
11
|
+
# ttserver -port 12001 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/user_ids.tch '/data/db/ttyrant/user_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
12
|
+
# ttserver -port 12002 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/screen_names.tch '/data/db/ttyrant/screen_names.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
13
|
+
# ttserver -port 12003 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/search_ids.tch '/data/db/ttyrant/search_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
14
|
+
# ttserver -port 12004 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/tweets_parsed.tch '/data/db/ttyrant/tweets_parsed.tch#bnum=800000000#opts=l#rcnum=50000#xmsiz=268435456'
|
15
|
+
# ttserver -port 12005 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/users_parsed.tch '/data/db/ttyrant/users_parsed.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
16
|
+
#
|
17
|
+
# -- Monitoring
|
18
|
+
# tcrmgr inform -port $port -st $hostname
|
19
|
+
# active conns:
|
20
|
+
# lsof -i | grep ttserver | wc -l
|
21
|
+
# netstat -a -W | grep ':120' | ruby -ne 'puts $_.split(/ +/)[3 .. 4].join("\t")' | sort | cut -d: -f1-2 | uniq -c | sort -n
|
22
|
+
# use db.rnum for most lightweight ping method
|
23
|
+
#
|
24
|
+
# -- Tuning
|
25
|
+
# http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parameters.html
|
26
|
+
# http://capttofu.livejournal.com/23381.html
|
27
|
+
# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
|
28
|
+
# opts "l" of large option (the size of the database can be larger than 2GB by using 64-bit bucket array.), "d" of Deflate option (each record is compressed with Deflate encoding), "b" of BZIP2 option, "t" of TCBS option
|
29
|
+
# bnum number of elements of the bucket array. If it is not more than 0, the default value is specified. The default value is 131071 (128K). Suggested size of the bucket array is about from 0.5 to 4 times of the number of all records to be stored.
|
30
|
+
# rcnum maximum number of records to be cached. If it is not more than 0, the record cache is disabled. It is disabled by default.
|
31
|
+
# xmsiz size of the extra mapped memory. If it is not more than 0, the extra mapped memory is disabled. The default size is 67108864 (64MB).
|
32
|
+
# apow size of record alignment by power of 2. If it is negative, the default value is specified. The default value is 4 standing for 2^4=16.
|
33
|
+
# fpow maximum number of elements of the free block pool by power of 2. If it is negative, the default value is specified. The default value is 10 standing for 2^10=1024.
|
34
|
+
# dfunit unit step number of auto defragmentation. If it is not more than 0, the auto defragmentation is disabled. It is disabled by default.
|
35
|
+
# mode "w" of writer, "r" of reader,"c" of creating,"t" of truncating ,"e" of no locking,"f" of non-blocking lock
|
36
|
+
#
|
37
|
+
# -- Links
|
38
|
+
# http://1978th.net/tokyocabinet/spex-en.html
|
39
|
+
# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
|
40
|
+
|
41
|
+
|
42
|
+
#
# Reopens the tokyo_tyrant gem's balancer base class to accept a connection
# timeout and retry flag, and to add a #close that shuts down every pooled
# server connection.
#
class TokyoTyrant::Balancer::Base
  # hostnames    - Array of "host:port" strings, one per server.
  # timeout      - passed through to each server connection's constructor.
  # should_retry - passed through to each server connection's constructor.
  #
  # `klass` and `servers` are not defined here; they are presumably supplied
  # by the gem's own Balancer classes -- confirm against the tokyo_tyrant gem.
  def initialize(hostnames = [], timeout = 20.0, should_retry = true)
    @servers = hostnames.map do |hostname|
      host, port = hostname.split(':')
      klass.new(host, port.to_i, timeout, should_retry)
    end
    # yes, for some reason it's spelled 'Constistent' here
    # DO NOT fix it because it goes deep...
    @ring = TokyoTyrant::ConstistentHash.new(servers)
  end

  # Closes every server connection, swallowing per-server errors.
  #
  # Returns true only if every close returned a truthy value.
  #
  # The closes are collected with map *before* calling all?, because
  # all? with a block stops at the first falsy result and would leave the
  # remaining connections open.
  def close
    outcomes = @servers.map do |server|
      begin
        server.close
      rescue StandardError
        nil
      end
    end
    outcomes.all?
  end

end
|
58
|
+
|
59
|
+
module TokyoDbConnection
  #
  # Lazily-connecting wrapper around a balanced set of Tokyo Tyrant servers
  # for one named dataset.
  #
  # Each dataset in DB_PORTS maps to a port; the connection is built from
  # every host in DB_SERVERS paired with that port. It is opened on first use
  # and thrown away (to be re-opened on the next call) whenever an insert or
  # fetch raises.
  #
  class TyrantDb
    attr_reader :dataset
    DB_SERVERS = [
      '10.194.101.156',
      '10.196.73.156',
      '10.196.75.47',
      '10.242.217.140',
    ].freeze unless defined?(TokyoDbConnection::TyrantDb::DB_SERVERS)

    DB_PORTS = {
      :user_ids      => 12001,
      :screen_names  => 12002,
      :search_ids    => 12003,
      :tweets_parsed => 12004,
      :users_parsed  => 12005,
    } unless defined?(TokyoDbConnection::TyrantDb::DB_PORTS)

    # dataset - a key of DB_PORTS (e.g. :user_ids) naming the store to use.
    def initialize dataset
      @dataset = dataset
    end

    # Returns the (memoized) balanced connection for this dataset.
    #
    # Raises RuntimeError if the dataset has no entry in DB_PORTS.
    def db
      return @db if @db
      port = DB_PORTS[dataset] || raise("Don't know how to reach dataset #{dataset}")
      @db = TokyoTyrant::Balancer::DB.new(DB_SERVERS.map{|s| s+':'+port.to_s})
      # @db = TokyoTyrant::DB.new(DB_SERVERS.first, port.to_i)
      @db
    end

    # Straight pass-throughs to the underlying connection (no error handling).
    def [](*args)        ; db[*args]        ; end
    def size(*args)      ; db.size(*args)   ; end
    def vanish!(*args)   ; db.vanish(*args) ; end

    #
    # Insert into the tyrant database with default settings.
    #
    # Returns whatever putnr returns, or nil if the insert failed (the failure
    # is logged and the connection is reset).
    #
    def insert key, value
      begin
        # putnr: presumably Tokyo Tyrant's "put, no response" call --
        # confirm against the tokyo_tyrant gem documentation.
        db.putnr(key, value)
      rescue StandardError => e ; handle_error("Insert #{[key, value].inspect}", e); end
    end

    # Stores an array as a single comma-joined string value.
    def insert_array key, value
      insert(key, value.join(','))
    end

    # Fetches from the underlying connection.
    #
    # Returns the fetched value, or nil if the fetch failed (the failure is
    # logged and the connection is reset).
    def get *args
      begin
        db.get(*args)
      rescue StandardError => e ; handle_error("Fetch #{args.inspect}", e); end
    end

    # Logs the failure, drops the connection, and returns nil so that callers
    # of #get / #insert see nil on failure rather than the return value of
    # the sleep inside #invalidate!.
    def handle_error action, e
      warn "#{action} failed: #{e} #{(e.backtrace || []).join("\t")}" ;
      invalidate!
      nil
    end

    # Closes and forgets the current connection, then pauses two seconds
    # before the caller can reconnect.
    def invalidate!
      # Only complain when there was a connection and closing it failed;
      # there is nothing to close if the connection was never established.
      warn "Couldn't close #{@db.inspect}" if @db && !@db.close
      @db = nil
      sleep 2
    end
  end
end
|
124
|
+
|
@@ -0,0 +1,145 @@
|
|
1
|
+
|
2
|
+
# -- Installing
|
3
|
+
# make sure tokyocabinet and tokyotyrant are installed (chef recipe)
|
4
|
+
# make sure ruby-tokyotyrant is installed
|
5
|
+
# ldconfig
|
6
|
+
# mkdir -p /data/db/ttyrant /var/run/tyrant /var/log/tyrant
|
7
|
+
#
|
8
|
+
# -- Starting
|
9
|
+
# ttserver -port 12001 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/user_ids.tch '/data/db/ttyrant/user_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
10
|
+
# ttserver -port 12002 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/screen_names.tch '/data/db/ttyrant/screen_names.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
11
|
+
# ttserver -port 12003 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/search_ids.tch '/data/db/ttyrant/search_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
12
|
+
# ttserver -port 12004 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/tweets_parsed.tch '/data/db/ttyrant/tweets_parsed.tch#bnum=800000000#opts=l#rcnum=50000#xmsiz=268435456'
|
13
|
+
# ttserver -port 12005 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/users_parsed.tch '/data/db/ttyrant/users_parsed.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
|
14
|
+
#
|
15
|
+
# -- Monitoring
|
16
|
+
# tcrmgr inform -port $port -st $hostname
|
17
|
+
# active conns:
|
18
|
+
# lsof -i | grep ttserver | wc -l
|
19
|
+
# netstat -a -W | grep ':120' | ruby -ne 'puts $_.split(/ +/)[3 .. 4].join("\t")' | sort | cut -d: -f1-2 | uniq -c | sort -n
|
20
|
+
# use db.rnum for most lightweight ping method
|
21
|
+
#
|
22
|
+
# -- Tuning
|
23
|
+
# http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parameters.html
|
24
|
+
# http://capttofu.livejournal.com/23381.html
|
25
|
+
# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
|
26
|
+
# opts "l" of large option (the size of the database can be larger than 2GB by using 64-bit bucket array.), "d" of Deflate option (each record is compressed with Deflate encoding), "b" of BZIP2 option, "t" of TCBS option
|
27
|
+
# bnum number of elements of the bucket array. If it is not more than 0, the default value is specified. The default value is 131071 (128K). Suggested size of the bucket array is about from 0.5 to 4 times of the number of all records to be stored.
|
28
|
+
# rcnum maximum number of records to be cached. If it is not more than 0, the record cache is disabled. It is disabled by default.
|
29
|
+
# xmsiz size of the extra mapped memory. If it is not more than 0, the extra mapped memory is disabled. The default size is 67108864 (64MB).
|
30
|
+
# apow size of record alignment by power of 2. If it is negative, the default value is specified. The default value is 4 standing for 2^4=16.
|
31
|
+
# fpow maximum number of elements of the free block pool by power of 2. If it is negative, the default value is specified. The default value is 10 standing for 2^10=1024.
|
32
|
+
# dfunit unit step number of auto defragmentation. If it is not more than 0, the auto defragmentation is disabled. It is disabled by default.
|
33
|
+
# mode "w" of writer, "r" of reader,"c" of creating,"t" of truncating ,"e" of no locking,"f" of non-blocking lock
|
34
|
+
#
|
35
|
+
# -- Links
|
36
|
+
# http://1978th.net/tokyocabinet/spex-en.html
|
37
|
+
# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
|
38
|
+
# Performance limits: http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
|
39
|
+
|
40
|
+
|
41
|
+
h2. Tyrant: ttserver
|
42
|
+
|
43
|
+
ttdb="test"
|
44
|
+
ttserver -port 12009 -thnum 96 \
|
45
|
+
-dmn -pid /var/run/tyrant-${ttdb}.pid \
|
46
|
+
-ulog /mnt/tmp/ttyrant/tyrant-${ttdb}.ulog -ulim 268435456 -uas \
|
47
|
+
-log /var/log/ttyrant/tyrant-${ttdb}.log \
|
48
|
+
"/data/db/ttyrant/${ttdb}.tch#bnum=200000000#opts=l#rcnum=100000#xmsiz=536870912"
|
49
|
+
|
50
|
+
can also add host, and umask out to be read-only
|
51
|
+
|
52
|
+
* -host name : specify the host name or the address of the server. By default, every network address is bound.
|
53
|
+
* -port num : specify the port number. By default, it is 1978.
|
54
|
+
* -thnum num : specify the number of worker threads. By default, it is 8.
|
55
|
+
* -tout num : specify the timeout of each session in seconds. By default, no timeout is specified.
|
56
|
+
* -dmn : work as a daemon process.
|
57
|
+
* -pid path : output the process ID into the file.
|
58
|
+
* -kl : kill the existing process if the process ID file is detected.
|
59
|
+
* -log path : output log messages into the file.
|
60
|
+
* -ld : log debug messages also.
|
61
|
+
* -le : log error messages only.
|
62
|
+
* -ulog path : specify the update log directory.
|
63
|
+
* -ulim num : specify the limit size of each update log file.
|
64
|
+
* -uas : use asynchronous I/O for the update log.
|
65
|
+
* -sid num : specify the server ID.
|
66
|
+
* -mhost name : specify the host name of the replication master server.
|
67
|
+
* -mport num : specify the port number of the replication master server.
|
68
|
+
* -rts path : specify the replication time stamp file.
|
69
|
+
* -rcc : check consistency of replication.
|
70
|
+
* -skel name : specify the name of the skeleton database library.
|
71
|
+
* -mul num : specify the division number of the multiple database mechanism.
|
72
|
+
* -ext path : specify the script language extension file.
|
73
|
+
* -extpc name period : specify the function name and the calling period of a periodic command.
|
74
|
+
* -mask expr : specify the names of forbidden commands.
|
75
|
+
* -unmask expr : specify the names of allowed commands.
|
76
|
+
|
77
|
+
|
78
|
+
h2. From "Wolfgang Gassler":http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
|
79
|
+
|
80
|
+
On Sat, Dec 05, 2009 at 09:32:20PM +0100, Wolfgang Gassler wrote:
|
81
|
+
> Hi,
|
82
|
+
|
83
|
+
> did anybody look up some of the following parameters in the code or can
|
84
|
+
> explain them in detail? I just have a guess what they really mean and
|
85
|
+
> the short description at the docu homepage
|
86
|
+
> http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parame...
|
87
|
+
> explain them very roughly. Also the already posted blog post
|
88
|
+
> http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parame...
|
89
|
+
> couldn't help.
|
90
|
+
|
91
|
+
this is what I gleaned from reading the source code for the hash database
|
92
|
+
format ( tchdb.c and tchdb.h ).
|
93
|
+
|
94
|
+
> xmsiz
|
95
|
+
|
96
|
+
On a TC Hash database, from the beginning of the file, to the end of the bucket
|
97
|
+
section, all of that space is mmap'd. Setting 'xmsiz' sets the minimum amount
|
98
|
+
of space that is mmap'd. Since 67108864 is the default, this means, that at a
|
99
|
+
minimum, the first 64MiB of the file will be mmap'd.
|
100
|
+
|
101
|
+
If the header size, plus the bucket region is greater than 'xmsiz', then xmsiz
|
102
|
+
appears to have no effect.
|
103
|
+
|
104
|
+
> apow
|
105
|
+
|
106
|
+
On a TC Hash database, 'apow' determines on what byte alignment each record will
|
107
|
+
sit. 'apow' is a power of 2. This means that when apow is 4 ( the default for
|
108
|
+
hash databases) all records in the database are aligned on a 16 byte boundary,
|
109
|
+
in the database file.
|
110
|
+
|
111
|
+
This means that every record will take up at a minimum 16 bytes of space, and
|
112
|
+
all records are padded to a length that is a multiple of 16.
|
113
|
+
|
114
|
+
> fpow
|
115
|
+
|
116
|
+
On a TC Hash database, 'fpow' determines the maximum number of free blocks that
|
117
|
+
can exist in the free block pool. This is also a power-of-2 parameter so with
|
118
|
+
the default in a Hash database of 10, this means that there can be a maximum
|
119
|
+
of 2^10, or 1024 free blocks in the database.
|
120
|
+
|
121
|
+
Free blocks come into existence when records are deleted from the database
|
122
|
+
and their space in the db file is up for reuse. If you never delete an
|
123
|
+
item from the database, you will never have any free blocks.
|
124
|
+
|
125
|
+
> dfunit
|
126
|
+
|
127
|
+
On a TC Hash database, 'dfunit' describes how defragmentation takes place.
|
128
|
+
Every time a free block is created a 'dfcnt' is incremented. When 'dfcnt'
|
129
|
+
is greater than 'dfunit' and 'dfunit' is greater than 0, defragmentation
|
130
|
+
takes place.
|
131
|
+
|
132
|
+
I don't know precisely what defragmentation does in TC. A cursory look
|
133
|
+
at 'tchdbdefragimpl', the function implementing defragmentation for hash
|
134
|
+
databases, it looks like it moves records around filling up free blocks
|
135
|
+
in the hash db with real records from the end of the file and then making the
|
136
|
+
file smaller if possible.
|
137
|
+
|
138
|
+
Basically it moves records around minimizing dead space in the file.
|
139
|
+
|
140
|
+
Again, defragmentation will only take place if 'dfunit' has a positive
|
141
|
+
value and you remove records from the db creating free blocks.
|
142
|
+
|
143
|
+
enjoy,
|
144
|
+
|
145
|
+
-jeremy
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# Registers the :log_interval setting (an Integer, default 1000) that
# PeriodicMonitor falls back to when no :log_interval option is passed in.
# NOTE(review): Settings is presumably the Configliere settings object -- confirm.
Settings.define :log_interval, :default => 1000, :type => Integer, :description => 'How many iterations between log statements'
|
2
|
+
|
3
|
+
#
|
4
|
+
# Periodic monitor
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# This is very much a work in progress
|
8
|
+
#
|
9
|
+
#
# Counts iterations and, once every `interval` iterations, either yields to a
# caller-supplied block or writes a tab-separated progress line to stderr.
#
#   monitor = PeriodicMonitor.new(:log_interval => 5000)
#   records.each { |rec| monitor.periodically(rec.id) ; ... }
#
class PeriodicMonitor
  # Used whenever the requested interval is missing or less than 1.
  DEFAULT_INTERVAL = 1000

  attr_reader :iter, :start_time, :options, :interval

  # extra_options - Hash merged into #options; :log_interval (if present)
  #                 overrides Settings[:log_interval].
  def initialize extra_options={}
    @options     = {}
    @options.deep_merge!( extra_options || {} )
    @iter        = 0
    @start_time  = now
    self.interval = options[:log_interval] || Settings[:log_interval]
  end

  # Sets how many iterations pass between reports.
  #
  # The value is coerced with to_i; anything below 1 falls back to
  # DEFAULT_INTERVAL, the same rule the constructor applies. Without this a
  # zero interval would make #ready? raise ZeroDivisionError.
  def interval=(value)
    value = value.to_i
    @interval = (value >= 1) ? value : DEFAULT_INTERVAL
  end

  # Advances the counter by one; on every `interval`-th call either invokes
  # the block with (iter, *args) or prints progress(*args) to stderr.
  #
  # Returns the block's value when the block ran, otherwise nil.
  def periodically *args, &block
    incr!
    return unless ready?
    return block.call(iter, *args) if block
    $stderr.puts progress(*args)
  end

  # Bumps the iteration counter; returns the new count.
  def incr!
    @iter += 1
  end

  # True when the current iteration count is a multiple of the interval.
  def ready?
    iter % @interval == 0
  end

  # Builds the tab-separated progress line: iteration count, elapsed seconds,
  # iterations per second, the current time via Time#to_flat (not defined in
  # this file), then any extra values passed in, flattened.
  def progress *stuff
    [
      "%15d" % iter,
      "%7.1f"% elapsed_time, "sec",
      "%7.1f"%(iter.to_f / elapsed_time), "/sec",
      now.to_flat,
      *stuff
    ].flatten.join("\t")
  end

  # Seconds (Float) since this monitor was created.
  def elapsed_time
    now - start_time
  end

  # Current time in UTC.
  def now
    Time.now.utc
  end
end
|
@@ -27,6 +27,8 @@ module Wukong
|
|
27
27
|
Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
|
28
28
|
Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
|
29
29
|
Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
|
30
|
+
Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
|
31
|
+
Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
|
30
32
|
Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
|
31
33
|
Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
|
32
34
|
# mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
@@ -98,7 +100,7 @@ module Wukong
|
|
98
100
|
# root of your config install.
|
99
101
|
[
|
100
102
|
hadoop_runner,
|
101
|
-
"jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop
|
103
|
+
"jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
|
102
104
|
hadoop_partition_args,
|
103
105
|
hadoop_sort_args,
|
104
106
|
hadoop_num_tasks_args,
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Defines a base class for streaming data into a cassandra db connection.
|
2
|
+
require 'cassandra' ; include Cassandra::Constants
|
3
|
+
module Wukong
  module Streamer

    #
    # Base class for streamers that read lines from stdin and hand each
    # record to #process inside a Cassandra batch of up to batch_size records.
    #
    # Subclasses must override #process to do the actual inserts.
    #
    class CassandraStreamer < Wukong::Streamer::Base
      attr_accessor :batch_count, :batch_record_count, :batch_size, :column_space, :db_seeds, :cassandra_db

      # Sets up counters and, unless already assigned, the defaults for the
      # keyspace name, batch size, seed list and the Cassandra connection.
      def initialize *args
        super(*args)
        self.batch_count = 0
        self.batch_record_count = 0
        self.column_space ||= 'Twitter'
        self.batch_size ||= 100
        self.db_seeds ||= %w[10.244.191.178 10.243.19.223 10.243.17.219 10.245.70.85 10.244.206.241].map{ |s| s.to_s+':9160'}
        self.cassandra_db ||= Cassandra.new(self.column_space, self.db_seeds)
      end

      # Reads stdin to exhaustion. Each pass of the outer loop opens one
      # batch; the inner loop fills it until batch_size records have been
      # processed or input runs out. Lines that recordize to nil or to a
      # blank record are skipped and do not count toward the batch.
      def stream
        while still_lines? do
          start_batch do
            while still_lines? && batch_not_full? do
              line = get_line
              record = recordize(line.chomp) or next
              next if record.blank?
              process(*record) do |output_record|
                emit output_record
              end
              self.batch_record_count += 1
            end
          end
        end
      end

      # Hook for subclasses: insert the record into Cassandra.
      #
      # Raises RuntimeError unless overridden.
      def process *args, &blk
        raise "Overwrite this method to insert into cassandra db"
      end

      # Resets the per-batch record counter, bumps the batch counter, and
      # runs the block inside cassandra_db.batch.
      def start_batch &blk
        self.batch_record_count = 0
        self.batch_count += 1
        self.cassandra_db.batch(&blk)
      end

      # Next line from stdin (nil at end of input).
      def get_line
        $stdin.gets
      end

      # True while stdin has not reached end-of-file.
      def still_lines?
        !$stdin.eof?
      end

      # True while the current batch holds fewer than batch_size records.
      def batch_not_full?
        self.batch_record_count < self.batch_size
      end

    end
  end

end
|
61
|
+
|
data/lib/wukong/streamer.rb
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
module Wukong
|
2
2
|
module Streamer
|
3
|
-
autoload :Base,
|
4
|
-
autoload :LineStreamer,
|
5
|
-
autoload :RecordStreamer,
|
6
|
-
autoload :StructStreamer,
|
7
|
-
autoload :StructRecordizer,
|
3
|
+
autoload :Base, 'wukong/streamer/base'
|
4
|
+
autoload :LineStreamer, 'wukong/streamer/line_streamer'
|
5
|
+
autoload :RecordStreamer, 'wukong/streamer/record_streamer'
|
6
|
+
autoload :StructStreamer, 'wukong/streamer/struct_streamer'
|
7
|
+
autoload :StructRecordizer, 'wukong/streamer/struct_streamer'
|
8
|
+
# cassandra goodies
|
9
|
+
autoload :CassandraStreamer, 'wukong/streamer/cassandra_streamer'
|
8
10
|
#
|
9
|
-
autoload :Filter,
|
11
|
+
autoload :Filter, 'wukong/streamer/filter'
|
10
12
|
#
|
11
|
-
autoload :AccumulatingReducer,
|
12
|
-
autoload :ListReducer,
|
13
|
-
autoload :UniqByLastReducer,
|
14
|
-
autoload :CountingReducer,
|
13
|
+
autoload :AccumulatingReducer, 'wukong/streamer/accumulating_reducer'
|
14
|
+
autoload :ListReducer, 'wukong/streamer/list_reducer'
|
15
|
+
autoload :UniqByLastReducer, 'wukong/streamer/uniq_by_last_reducer'
|
16
|
+
autoload :CountingReducer, 'wukong/streamer/counting_reducer'
|
15
17
|
end
|
16
18
|
end
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "1.4.
|
8
|
+
s.version = "1.4.10"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-19}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it's efficient to process by lines
|
@@ -29,6 +29,7 @@ Gem::Specification.new do |s|
|
|
29
29
|
"INSTALL.textile",
|
30
30
|
"LICENSE.textile",
|
31
31
|
"README.textile",
|
32
|
+
"TODO.textile",
|
32
33
|
"bin/cutc",
|
33
34
|
"bin/cuttab",
|
34
35
|
"bin/greptrue",
|
@@ -38,6 +39,7 @@ Gem::Specification.new do |s|
|
|
38
39
|
"bin/hdp-du",
|
39
40
|
"bin/hdp-get",
|
40
41
|
"bin/hdp-kill",
|
42
|
+
"bin/hdp-kill-task",
|
41
43
|
"bin/hdp-ls",
|
42
44
|
"bin/hdp-mkdir",
|
43
45
|
"bin/hdp-mv",
|
@@ -62,6 +64,7 @@ Gem::Specification.new do |s|
|
|
62
64
|
"bin/wu-sum",
|
63
65
|
"docpages/INSTALL.textile",
|
64
66
|
"docpages/LICENSE.textile",
|
67
|
+
"docpages/README-performance.textile",
|
65
68
|
"docpages/README-wulign.textile",
|
66
69
|
"docpages/UsingWukong-part1-get_ready.textile",
|
67
70
|
"docpages/UsingWukong-part2-ThinkingBigData.textile",
|
@@ -118,14 +121,17 @@ Gem::Specification.new do |s|
|
|
118
121
|
"docpages/usage.textile",
|
119
122
|
"docpages/wutils.textile",
|
120
123
|
"examples/README.txt",
|
124
|
+
"examples/binning_percentile_estimator.rb",
|
121
125
|
"examples/contrib/jeans/README.markdown",
|
122
126
|
"examples/contrib/jeans/data/normalized_sizes",
|
123
127
|
"examples/contrib/jeans/data/orders.tsv",
|
124
128
|
"examples/contrib/jeans/data/sizes",
|
125
129
|
"examples/contrib/jeans/normalize.rb",
|
126
130
|
"examples/contrib/jeans/sizes.rb",
|
131
|
+
"examples/corpus/words_to_bigrams.rb",
|
127
132
|
"examples/count_keys.rb",
|
128
133
|
"examples/count_keys_at_mapper.rb",
|
134
|
+
"examples/keystore/cassandra_batch_test.rb",
|
129
135
|
"examples/keystore/conditional_outputter_example.rb",
|
130
136
|
"examples/network_graph/adjacency_list.rb",
|
131
137
|
"examples/network_graph/breadth_first_search.rb",
|
@@ -160,6 +166,7 @@ Gem::Specification.new do |s|
|
|
160
166
|
"lib/wukong/extensions/class.rb",
|
161
167
|
"lib/wukong/extensions/date_time.rb",
|
162
168
|
"lib/wukong/extensions/emittable.rb",
|
169
|
+
"lib/wukong/extensions/enumerable.rb",
|
163
170
|
"lib/wukong/extensions/hash.rb",
|
164
171
|
"lib/wukong/extensions/hash_keys.rb",
|
165
172
|
"lib/wukong/extensions/hash_like.rb",
|
@@ -170,8 +177,12 @@ Gem::Specification.new do |s|
|
|
170
177
|
"lib/wukong/extensions/struct.rb",
|
171
178
|
"lib/wukong/extensions/symbol.rb",
|
172
179
|
"lib/wukong/keystore/cassandra_conditional_outputter.rb",
|
180
|
+
"lib/wukong/keystore/redis_db.rb",
|
181
|
+
"lib/wukong/keystore/tyrant_db.rb",
|
182
|
+
"lib/wukong/keystore/tyrant_notes.textile",
|
173
183
|
"lib/wukong/logger.rb",
|
174
184
|
"lib/wukong/models/graph.rb",
|
185
|
+
"lib/wukong/periodic_monitor.rb",
|
175
186
|
"lib/wukong/rdf.rb",
|
176
187
|
"lib/wukong/schema.rb",
|
177
188
|
"lib/wukong/script.rb",
|
@@ -180,6 +191,7 @@ Gem::Specification.new do |s|
|
|
180
191
|
"lib/wukong/streamer.rb",
|
181
192
|
"lib/wukong/streamer/accumulating_reducer.rb",
|
182
193
|
"lib/wukong/streamer/base.rb",
|
194
|
+
"lib/wukong/streamer/cassandra_streamer.rb",
|
183
195
|
"lib/wukong/streamer/count_keys.rb",
|
184
196
|
"lib/wukong/streamer/count_lines.rb",
|
185
197
|
"lib/wukong/streamer/counting_reducer.rb",
|
@@ -208,51 +220,56 @@ Gem::Specification.new do |s|
|
|
208
220
|
s.homepage = %q{http://mrflip.github.com/wukong}
|
209
221
|
s.rdoc_options = ["--charset=UTF-8"]
|
210
222
|
s.require_paths = ["lib"]
|
211
|
-
s.rubygems_version = %q{1.3.
|
223
|
+
s.rubygems_version = %q{1.3.7}
|
212
224
|
s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
|
213
225
|
s.test_files = [
|
214
226
|
"spec/spec_helper.rb",
|
215
227
|
"spec/wukong/encoding_spec.rb",
|
216
228
|
"spec/wukong/script_spec.rb",
|
217
|
-
"examples/contrib/jeans/normalize.rb",
|
218
|
-
"examples/contrib/jeans/sizes.rb",
|
219
|
-
"examples/count_keys.rb",
|
220
|
-
"examples/count_keys_at_mapper.rb",
|
221
|
-
"examples/keystore/conditional_outputter_example.rb",
|
222
|
-
"examples/network_graph/adjacency_list.rb",
|
223
|
-
"examples/network_graph/breadth_first_search.rb",
|
224
|
-
"examples/network_graph/gen_2paths.rb",
|
225
|
-
"examples/network_graph/gen_multi_edge.rb",
|
226
|
-
"examples/network_graph/gen_symmetric_links.rb",
|
227
229
|
"examples/pagerank/pagerank.rb",
|
228
230
|
"examples/pagerank/pagerank_initialize.rb",
|
229
|
-
"examples/rank_and_bin.rb",
|
230
231
|
"examples/sample_records.rb",
|
231
232
|
"examples/server_logs/apache_log_parser.rb",
|
232
233
|
"examples/server_logs/breadcrumbs.rb",
|
233
234
|
"examples/server_logs/user_agent.rb",
|
235
|
+
"examples/corpus/words_to_bigrams.rb",
|
236
|
+
"examples/count_keys.rb",
|
237
|
+
"examples/rank_and_bin.rb",
|
238
|
+
"examples/binning_percentile_estimator.rb",
|
234
239
|
"examples/size.rb",
|
240
|
+
"examples/network_graph/breadth_first_search.rb",
|
241
|
+
"examples/network_graph/gen_symmetric_links.rb",
|
242
|
+
"examples/network_graph/gen_multi_edge.rb",
|
243
|
+
"examples/network_graph/adjacency_list.rb",
|
244
|
+
"examples/network_graph/gen_2paths.rb",
|
245
|
+
"examples/keystore/cassandra_batch_test.rb",
|
246
|
+
"examples/keystore/conditional_outputter_example.rb",
|
235
247
|
"examples/stats/avg_value_frequency.rb",
|
248
|
+
"examples/contrib/jeans/sizes.rb",
|
249
|
+
"examples/contrib/jeans/normalize.rb",
|
250
|
+
"examples/word_count.rb",
|
236
251
|
"examples/stupidly_simple_filter.rb",
|
237
|
-
"examples/
|
252
|
+
"examples/count_keys_at_mapper.rb"
|
238
253
|
]
|
239
254
|
|
240
255
|
if s.respond_to? :specification_version then
|
241
256
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
242
257
|
s.specification_version = 3
|
243
258
|
|
244
|
-
if Gem::Version.new(Gem::
|
259
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
245
260
|
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
246
261
|
s.add_development_dependency(%q<yard>, [">= 0"])
|
247
262
|
s.add_runtime_dependency(%q<addressable>, [">= 0"])
|
248
263
|
s.add_runtime_dependency(%q<extlib>, [">= 0"])
|
249
264
|
s.add_runtime_dependency(%q<htmlentities>, [">= 0"])
|
265
|
+
s.add_runtime_dependency(%q<configliere>, [">= 0"])
|
250
266
|
else
|
251
267
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
252
268
|
s.add_dependency(%q<yard>, [">= 0"])
|
253
269
|
s.add_dependency(%q<addressable>, [">= 0"])
|
254
270
|
s.add_dependency(%q<extlib>, [">= 0"])
|
255
271
|
s.add_dependency(%q<htmlentities>, [">= 0"])
|
272
|
+
s.add_dependency(%q<configliere>, [">= 0"])
|
256
273
|
end
|
257
274
|
else
|
258
275
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
@@ -260,6 +277,7 @@ Gem::Specification.new do |s|
|
|
260
277
|
s.add_dependency(%q<addressable>, [">= 0"])
|
261
278
|
s.add_dependency(%q<extlib>, [">= 0"])
|
262
279
|
s.add_dependency(%q<htmlentities>, [">= 0"])
|
280
|
+
s.add_dependency(%q<configliere>, [">= 0"])
|
263
281
|
end
|
264
282
|
end
|
265
283
|
|