wukong 1.5.4 → 2.0.0
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
data/examples/keystore/cassandra_batch_test.rb
DELETED
@@ -1,41 +0,0 @@
-#!/usr/bin/env ruby
-require 'rubygems'
-require 'wukong'
-
-# An example (and test) of streaming batches of data into distributed cassandra db
-# Stream in whatever you like that has a key and value. Notice that you must
-# have already defined a column space called 'Cruft' in storage-conf.xml as well
-# as a column family called 'OhBaby'
-
-class Mapper < Wukong::Streamer::CassandraStreamer
-
-  # you must redefine the column space, batch size, and db-seeds or they will
-  # be defaults. For testing on local machine simply seed db with 127.0.0.1:9160
-
-  def initialize *args
-    self.column_space = 'Cruft'
-    self.batch_size   = 100
-    self.db_seeds     = "127.0.0.1:9160"
-    super(*args)
-    @iter = 0
-  end
-
-  def process key, value, *_, &blk
-    insert_into_db(key, value)
-    yield [key, value] if (@iter % 10 == 0)
-  end
-
-  # you must specify the column family, key, and value here
-  def insert_into_db key, value
-    @iter += 1
-    cassandra_db.insert(:OhBaby, key, {"value" => value}, :consistency => Cassandra::Consistency::ANY) unless key.blank?
-  end
-end
-
-#
-# Executes the script
-#
-Wukong::Script.new(
-  Mapper,
-  nil
-  ).run
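A quick way to smoke-test a map-only script like this is to pipe tab-separated key/value lines into it with the --map flag, as the conditional_outputter example further down does in its own usage comment. A minimal driver sketch, with made-up keys and values (and assuming a Cassandra node really is listening on the 127.0.0.1:9160 seed configured above):

# gen_pairs.rb (hypothetical): emit tab-separated key/value pairs, e.g.
#   ruby gen_pairs.rb | ./examples/keystore/cassandra_batch_test.rb --map
10.times do |i|
  puts ["key_#{i}", "value_#{i}"].join("\t")
end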
data/examples/keystore/conditional_outputter_example.rb
DELETED
@@ -1,70 +0,0 @@
-#!/usr/bin/env ruby
-require 'rubygems'
-require 'cassandra'
-require 'wukong'
-require 'wukong/encoding'
-require 'wukong/keystore/cassandra_conditional_outputter'
-
-#
-# Usage:
-# echo -e "bob has boobs ha ha ha" | ./examples/keystore/conditional_outputter_example.rb --map
-#
-
-CASSANDRA_KEYSPACE = 'CorpusAnalysis'
-
-#
-# This demonstrates the CassandraConditionalOutputter module.
-#
-# CassandraConditionalOutputter uses and a cassandra key-value store to
-# track unique IDs and prevent output of any record already present in the
-# database.
-#
-# For this example, it takes an input stream, generates all letter pairs for
-# each line, and emits
-#
-#
-class LetterPairMapper < Wukong::Streamer::LineStreamer
-  include CassandraConditionalOutputter
-
-  #
-  # A unique key for the given record. If an object with
-  # that key has been seen, it won't be re-emitted.
-  #
-  # In this example, we'll just encode the letter pair
-  #
-  def conditional_output_key record
-    record.to_s.wukong_encode(:url)
-  end
-
-  #
-  # Emit each letter pair in the line.
-  # the CassandraConditionalOutputter will swallow all duplicate lines.
-  #
-  def process line, &block
-    letter_pairs(line).each do |pair|
-      yield(pair)
-    end
-  end
-
-  # turn a string into the pairs of adjacent letters
-  #
-  # @example
-  #   letter_pairs('abracadabra')
-  #   # => ['ab', 'br',
-  def letter_pairs str, &block
-    chars = str.chars.to_a
-    chars[0..-2].zip(chars[1..-1]).map(&:join)
-  end
-
-  # Clear the entire cached keys column at the end of the run.
-  #
-  # You almost certainly don't want to do this in a real script.
-  #
-  def after_stream
-    $stderr.puts 'Clearing conditional_output_key cache...'
-    @key_cache.clear_column_family!(conditional_output_key_column)
-  end
-end
-
-# Execute the script
-Wukong::Script.new( LetterPairMapper, nil ).run
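For reference, the letter_pairs helper above is plain Ruby and easy to verify by hand; the expected value below is computed from the method as written, not quoted from the gem:

chars = 'abracadabra'.chars.to_a
chars[0..-2].zip(chars[1..-1]).map(&:join)
# => ["ab", "br", "ra", "ac", "ca", "ad", "da", "ab", "br", "ra"]
# the repeated "ab", "br", "ra" pairs are exactly what the conditional outputter would swallow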
data/examples/store/chunked_store_example.rb
DELETED
@@ -1,18 +0,0 @@
-#!/usr/bin/env ruby
-require 'rubygems'
-require 'wukong'
-# require 'wukong/store'
-
-require 'configliere'
-Configliere.use :commandline, :define, :config_file
-Settings.read('foo.yaml')
-
-# store = ChunkedFlatFileStore.new(Settings)
-
-100.times do |iter|
-  # store.save [iter, Time.now.to_flat].join("\t")
-  $stdout.puts [iter, Time.now.to_flat].join("\t")
-  sleep 2
-end
-
-
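For context, the Configliere calls above follow that gem's Settings interface. A rough sketch of how the hard-coded knobs might be declared as proper settings -- the parameter names and the resolve! call are assumptions about configliere's API, not taken from this file, and strftime stands in for wukong's Time#to_flat extension:

require 'configliere'
Configliere.use :commandline, :define, :config_file

# hypothetical parameters; only 'foo.yaml' appears in the example above
Settings.define :n_recs, :default => 100, :description => 'number of records to emit'
Settings.define :pause,  :default => 2,   :description => 'seconds to sleep between records'

Settings.read('foo.yaml')  # overlay values from the config file, if present
Settings.resolve!          # then fold in any command-line overrides

Settings[:n_recs].times do |iter|
  $stdout.puts [iter, Time.now.utc.strftime('%Y%m%d%H%M%S')].join("\t")
  sleep Settings[:pause]
end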
data/lib/wukong/dfs.rb
DELETED
@@ -1,81 +0,0 @@
-require 'time' # ain't it always that way
-module Wukong
-  module Dfs
-    def self.list_files dfs_path
-      Log.info{ "DFS: listing #{dfs_path}" }
-      listing = `hadoop dfs -ls #{dfs_path}`.split("\n").reject{|ls_line| ls_line =~ /Found \d+ items/i}
-      listing.map{|ls_line| HFile.new_from_ls(ls_line)}
-    end
-
-    #
-    # FIXME -- this will fail if multiple files in a listing have the
-    # same basename. Sorry.
-    #
-    def self.compare_listings src_files, dest_files, &block
-      src_files.sort.each do |src_file|
-        dest_file = dest_files.find{|df| File.basename(src_file) == df.basename }
-        case
-        when (! dest_file)                       then yield :missing, src_file, nil
-        when (! dest_file.kinda_equal(src_file)) then yield :differ,  src_file, dest_file
-        else                                          yield :same,    src_file, dest_file
-        end
-      end
-    end
-
-    HFile = TypedStruct.new(
-      [:mode_str, String],
-      [:i_count,  String],
-      [:owner,    String],
-      [:group,    String],
-      [:size,     Integer],
-      [:date,     Bignum],
-      [:path,     String]
-      )
-    HFile.class_eval do
-      def self.new_from_ls ls_line
-        mode, ic, o, g, sz, dt, tm, path = ls_line.chomp.split(/\s+/)
-        date = Time.parse("#{dt} #{tm}").utc.to_flat
-        new mode, ic.to_i, o, g, sz.to_i, date, path
-      end
-      def dirname
-        @dirname ||= File.dirname(path)
-      end
-      def basename
-        @basename ||= File.basename(path)
-      end
-      #
-      # Two files are kinda_equal if they match in size and if
-      # the hdfs version is later than the filesystem version.
-      #
-      def kinda_equal file
-        (self.size == File.size(file)) # && (self.date >= File.mtime(file).utc.to_flat)
-      end
-      def to_s
-        to_a.join("\t")
-      end
-
-      #
-      # These will be very slow.
-      # If some kind soul will integrate JRuby callouts the bards shall
-      # celebrate your name evermore.
-      #
-
-      # rename the file on the HDFS
-      def mv new_filename
-        self.class.run_dfs_command :mv, path, new_filename
-      end
-
-      def self.mkdir dirname
-        run_dfs_command :mkdir, dirname
-      end
-      def self.mkdir_p(*args) self.mkdir *args ; end # HDFS is always -p
-
-      def self.run_dfs_command *args
-        cmd = 'hadoop dfs -'+ args.flatten.compact.join(" ")
-        Log.debug{ "DFS: Running #{cmd}" }
-        Log.info{ `#{cmd} 2>&1`.gsub(/[\r\n\t]+/, " ") }
-      end
-
-    end
-  end
-end
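Putting the two module methods above together, a caller would list an HDFS directory and diff it against local files; this sketch uses only the methods shown above, and the paths are illustrative:

# local source files vs. their counterparts already on the HDFS
local_files = Dir['/data/ripd/server_logs/*.log']
hdfs_files  = Wukong::Dfs.list_files('/data/rawd/server_logs')

# compare_listings matches on basename and yields :missing, :differ or :same
Wukong::Dfs.compare_listings(local_files, hdfs_files) do |status, src_file, hfile|
  puts [status, src_file, (hfile && hfile.path)].join("\t")
end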
data/lib/wukong/keystore/cassandra_conditional_outputter.rb
DELETED
@@ -1,122 +0,0 @@
-
-#
-# For a stream process that sees a significant number of duplicated heavyweight
-# objects, it may be better to deduplicate them midflight (rather than, say,
-# using a reducer to effectively `cat | sort | uniq` the data).
-#
-# This uses a cassandra key-value store to track unique IDs and prevent output
-# of any record already present in the database. (Why cassandra? Because we use
-# it in production. Might be nice to rewrite this example against redis or
-# TokyoTyrant or something less demanding.)
-#
-# Things you have to do:
-#
-# * Override the conditional_output_key method to distinguish identical records
-# * Define a constant CASSANDRA_KEYSPACE giving the Cassandra keyspace you're working in
-# * (Optionally) override conditional_output_key_column
-#
-# * In your cassandra storage-conf.xml, add a column family to your keyspace:
-#
-#     <Keyspace Name="CorpusAnalysis">
-#       <KeysCachedFraction>0.01</KeysCachedFraction>
-#
-#       <!-- Added for CassandraConditionalOutputter -->
-#       <ColumnFamily CompareWith="UTF8Type" Name="LetterPairMapperKeys" />
-#
-#       <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
-#       <ReplicationFactor>1</ReplicationFactor>
-#       <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
-#     </Keyspace>
-#
-# In this example, the CASSANDRA_KEYSPACE is 'CorpusAnalysis' and the
-# conditional_output_key_column is 'LetterPairMapperKeys'
-#
-# @example
-#   Given
-#     tweet  123456789  20100102030405  @frank: I'm having a bacon sandwich
-#     tweet  24601      20100104136526  @jerry, I'm having your baby
-#     tweet  8675309    20100102030405  I find pastrami to be the most sensual of the salted, cured meats.
-#     tweet  24601      20100104136526  @jerry, I'm having your baby
-#     tweet  1137       20100119234532  These pretzels are making me thirsty
-#     ....
-#   will emit:
-#     tweet  123456789  20100102030405  @frank: I'm having a bacon sandwich
-#     tweet  24601      20100104136526  @jerry, I'm having your baby
-#     tweet  8675309    20100102030405  I find pastrami to be the most sensual of the salted, cured meats.
-#     tweet  24601      20100104136526  @jerry, I'm having your baby
-#     tweet  1137       20100119234532  These pretzels are making me thirsty
-#     ....
-#
-module CassandraConditionalOutputter
-
-  #
-  # A unique key for the given record. If an object with
-  # that key has been seen, it won't be re-emitted.
-  #
-  # You will almost certainly want to override this method in your subclass. Be
-  # sure that the key is a string, and is encoded properly (Cassandra likes to
-  # strip whitespace from keys, for instance).
-  #
-  def conditional_output_key record
-    record.to_s
-  end
-
-  #
-  # Checks each record against the key cache
-  # Swallows records already there,
-  #
-  #
-  def emit record, &block
-    key = conditional_output_key(record)
-    if should_emit?(record)
-      set_key(key, {'t' => record.timestamp})
-      super record
-    end
-  end
-
-  # Default. Emit record if its key is not already contained
-  # in the key-value store. Overwrite this as necessary
-  def should_emit? record
-    key = conditional_output_key(record)
-    !has_key?(key)
-  end
-
-  # Check for presence of key in the cache
-  def has_key? key
-    not key_cache.get(conditional_output_key_column, key).blank?
-  end
-
-  # register key in the key_cache
-  def set_key key, data={'t' => '0'}
-    key_cache.insert(conditional_output_key_column, key, data)
-  end
-
-  # nuke key from the key_cache
-  def remove_key key
-    key_cache.remove(conditional_output_key_column, key)
-  end
-
-  #
-  # Key cache implementation in Cassandra
-  #
-
-  # The cache
-  def key_cache
-    @key_cache ||= Cassandra.new(CASSANDRA_KEYSPACE)
-  end
-
-  # The column to use for the key cache. By default, the class name plus 'Keys',
-  # but feel free to override.
-  #
-  # @example
-  #
-  #   class FooMapper < Wukong::Streamer::RecordStreamer
-  #     include ConditionalOutputter
-  #   end
-  #   FooMapper.new.conditional_output_key_column
-  #   # => 'FooMapperKeys'
-  #
-  def conditional_output_key_column
-    self.class.to_s+'Keys'
-  end
-end
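The header comment says it "might be nice to rewrite this example against redis"; here is a minimal sketch of that idea, keeping the same emit/should_emit? contract but backed by redis-rb's get/set. The RedisConditionalOutputter name, key scheme, and localhost connection are assumptions, not part of the gem:

require 'redis'

module RedisConditionalOutputter
  # override in your streamer, exactly as with the cassandra version
  def conditional_output_key record
    record.to_s
  end

  def emit record, &block
    if should_emit?(record)
      key_cache.set(key_prefix + conditional_output_key(record), '1')
      super record
    end
  end

  # emit only if the key has never been registered
  def should_emit? record
    key_cache.get(key_prefix + conditional_output_key(record)).nil?
  end

  # one redis connection per streamer process
  def key_cache
    @key_cache ||= Redis.new(:host => '127.0.0.1', :port => 6379)
  end

  # namespace keys per class, mirroring conditional_output_key_column above
  def key_prefix
    self.class.to_s + 'Keys:'
  end
end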
data/lib/wukong/keystore/redis_db.rb
DELETED
@@ -1,24 +0,0 @@
-#!/usr/bin/env ruby
-require 'rubygems' ;
-require 'redis' ;
-
-RDB = Redis.new(:host => 'localhost', :port => 6379)
-
-start_time = Time.now.utc.to_f ;
-iter=0;
-
-
-$stdin.each do |line|
-  _r, id, scat, sn, pr, fo, fr, st, fv, crat, sid, full = line.chomp.split("\t");
-  iter+=1 ;
-  break if iter > 20_000_000
-
-  if (iter % 10_000 == 0)
-    elapsed = (Time.now.utc.to_f - start_time)
-    puts "%-20s\t%7d\t%7d\t%7.2f\t%7.2f" % [sn, fo, iter, elapsed, iter.to_f/elapsed]
-  end
-
-  RDB['sn:'+sn.downcase]  = id unless sn.empty?
-  RDB['sid:'+sid]         = id unless sid.empty?
-  RDB['uid:'+id]          = [sn,sid,crat,scat].join(',') unless id.empty?
-end
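Reading the loaded keys back uses the same key scheme; redis-rb's [] reader mirrors the []= writer used above (the screen name below is just an example value):

require 'redis'
rdb = Redis.new(:host => 'localhost', :port => 6379)

user_id = rdb['sn:some_screen_name']        # id registered for that screen name, or nil
summary = rdb['uid:' + user_id] if user_id  # the "sn,sid,crat,scat" string joined above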
data/lib/wukong/keystore/tyrant_db.rb
DELETED
@@ -1,137 +0,0 @@
-require 'tokyo_tyrant'
-require 'tokyo_tyrant/balancer'
-
-# -- Installing
-# make sure tokyocabinet and tokyotyrant are installed (cehf recipe)
-# make sure ruby-tokyotyrant is installed
-# ldconfig
-# mkdir -p /data/db/ttyrant /var/run/tyrant /var/log/tyrant
-#
-# -- Starting
-# ttserver -port 12001 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/user_ids.tch '/data/db/ttyrant/user_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12002 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/screen_names.tch '/data/db/ttyrant/screen_names.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12003 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/search_ids.tch '/data/db/ttyrant/search_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12004 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/tweets_parsed.tch '/data/db/ttyrant/tweets_parsed.tch#bnum=800000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12005 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/users_parsed.tch '/data/db/ttyrant/users_parsed.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-#
-# -- Monitoring
-# tcrmgr inform -port $port -st $hostname
-# active conns:
-# lsof -i | grep ttserver | wc -l
-# netstat -a -W | grep ':120' | ruby -ne 'puts $_.split(/ +/)[3 .. 4].join("\t")' | sort | cut -d: -f1-2 | uniq -c | sort -n
-# use db.rnum for most lightweight ping method
-#
-# -- Tuning
-# http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parameters.html
-# http://capttofu.livejournal.com/23381.html
-# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
-# opts "l" of large option (the size of the database can be larger than 2GB by using 64-bit bucket array.), "d" of Deflate option (each record is compressed with Deflate encoding), "b" of BZIP2 option, "t" of TCBS option
-# bnum number of elements of the bucket array. If it is not more than 0, the default value is specified. The default value is 131071 (128K). Suggested size of the bucket array is about from 0.5 to 4 times of the number of all records to be stored.
-# rcnum maximum number of records to be cached. If it is not more than 0, the record cache is disabled. It is disabled by default.
-# xmsiz size of the extra mapped memory. If it is not more than 0, the extra mapped memory is disabled. The default size is 67108864 (64MB).
-# apow size of record alignment by power of 2. If it is negative, the default value is specified. The default value is 4 standing for 2^4=16.
-# fpow maximum number of elements of the free block pool by power of 2. If it is negative, the default value is specified. The default value is 10 standing for 2^10=1024.
-# dfunit unit step number of auto defragmentation. If it is not more than 0, the auto defragmentation is disabled. It is disabled by default.
-# mode "w" of writer, "r" of reader,"c" of creating,"t" of truncating ,"e" of no locking,"f" of non-blocking lock
-#
-# -- Links
-# http://1978th.net/tokyocabinet/spex-en.html
-# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
-
-
-class TokyoTyrant::Balancer::Base
-  def initialize(hostnames = [], timeout = 20.0, should_retry = true)
-    @servers = hostnames.map do |hostname|
-      host, port = hostname.split(':')
-      klass.new(host, port.to_i, timeout, should_retry)
-    end
-    # yes, for some reason it's spelled 'Constistent' here
-    # DO NOT fix it because it goes deep...
-    @ring = TokyoTyrant::ConstistentHash.new(servers)
-  end
-
-  def close
-    @servers.all?{ |server| server.close rescue nil}
-  end
-end
-
-module TokyoDbConnection
-  class TyrantDb
-    attr_reader :dataset
-    DB_SERVERS = [
-      '10.194.101.156',
-      '10.196.73.156',
-      '10.196.75.47',
-      '10.242.217.140',
-      ].freeze unless defined?(TokyoDbConnection::TyrantDb::DB_SERVERS)
-
-    DB_PORTS = {
-      :tw_screen_names  => 12002,
-      :tw_search_ids    => 12003,
-      #
-      :tw_user_info     => 14000,
-      :tw_wordbag       => 14101,
-      :tw_influence     => 14102,
-      :tw_trstrank      => 14103,
-      :tw_conversation  => 14104,
-      #
-      :tw_screen_names2 => 12004,
-      :tw_search_ids2   => 12005,
-      #
-      :tw_user_info2    => 14200,
-      :tw_wordbag2      => 14201,
-      :tw_influence2    => 14202,
-      :tw_trstrank2     => 14203,
-      :tw_conversation2 => 14204,
-      :tw_strong_links2 => 14205,
-      :tw_word_stats2   => 14210,
-      #
-      :ip_geo_census    => 14400,
-    } unless defined?(TokyoDbConnection::TyrantDb::DB_PORTS)
-
-    def initialize dataset
-      @dataset = dataset
-    end
-
-    def db
-      return @db if @db
-      port = DB_PORTS[dataset] or raise "Don't know how to reach dataset #{dataset}"
-      @db = TokyoTyrant::Balancer::DB.new(DB_SERVERS.map{|s| s+':'+port.to_s})
-    end
-
-    def [](*args)      ; db[*args]        ; end
-    def size(*args)    ; db.size(*args)   ; end
-    def vanish!(*args) ; db.vanish(*args) ; end
-
-    #
-    # Insert into the cassandra database with default settings
-    #
-    def insert key, value
-      begin
-        db.putnr(key, value)
-      rescue StandardError => e ; handle_error("Insert #{[key, value].inspect}", e); end
-    end
-
-    def insert_array key, value
-      insert(key, value.join(','))
-    end
-
-    def get *args
-      begin
-        db.get(*args)
-      rescue StandardError => e ; handle_error("Fetch #{args.inspect}", e); end
-    end
-
-    def handle_error action, e
-      Log.warn "#{action} failed: #{e} #{e.backtrace.join("\t")}" ;
-      invalidate!
-    end
-
-    def invalidate!
-      (@db && @db.close) or warn "Couldn't close #{@db.inspect}"
-      @db = nil
-      sleep 2
-    end
-  end
-end
-
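Usage of the wrapper above is keyed by dataset symbol, which selects the port from DB_PORTS; this sketch uses only the methods shown above, and the key and value are illustrative:

# lazily opens a balanced connection across DB_SERVERS on port 12002
screen_names = TokyoDbConnection::TyrantDb.new(:tw_screen_names)

screen_names.insert('sn:example_user', '12345')  # fire-and-forget putnr; failures are logged and the connection recycled
user_id = screen_names['sn:example_user']        # reads go through db[] / db.get
screen_names.size                                # record count for the dataset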
data/lib/wukong/keystore/tyrant_notes.textile
DELETED
@@ -1,145 +0,0 @@
-
-# -- Installing
-# make sure tokyocabinet and tokyotyrant are installed (cehf recipe)
-# make sure ruby-tokyotyrant is installed
-# ldconfig
-# mkdir -p /data/db/ttyrant /var/run/tyrant /var/log/tyrant
-#
-# -- Starting
-# ttserver -port 12001 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/user_ids.tch '/data/db/ttyrant/user_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12002 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/screen_names.tch '/data/db/ttyrant/screen_names.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12003 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/search_ids.tch '/data/db/ttyrant/search_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12004 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/tweets_parsed.tch '/data/db/ttyrant/tweets_parsed.tch#bnum=800000000#opts=l#rcnum=50000#xmsiz=268435456'
-# ttserver -port 12005 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/users_parsed.tch '/data/db/ttyrant/users_parsed.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
-#
-# -- Monitoring
-# tcrmgr inform -port $port -st $hostname
-# active conns:
-# lsof -i | grep ttserver | wc -l
-# netstat -a -W | grep ':120' | ruby -ne 'puts $_.split(/ +/)[3 .. 4].join("\t")' | sort | cut -d: -f1-2 | uniq -c | sort -n
-# use db.rnum for most lightweight ping method
-#
-# -- Tuning
-# http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parameters.html
-# http://capttofu.livejournal.com/23381.html
-# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
-# opts "l" of large option (the size of the database can be larger than 2GB by using 64-bit bucket array.), "d" of Deflate option (each record is compressed with Deflate encoding), "b" of BZIP2 option, "t" of TCBS option
-# bnum number of elements of the bucket array. If it is not more than 0, the default value is specified. The default value is 131071 (128K). Suggested size of the bucket array is about from 0.5 to 4 times of the number of all records to be stored.
-# rcnum maximum number of records to be cached. If it is not more than 0, the record cache is disabled. It is disabled by default.
-# xmsiz size of the extra mapped memory. If it is not more than 0, the extra mapped memory is disabled. The default size is 67108864 (64MB).
-# apow size of record alignment by power of 2. If it is negative, the default value is specified. The default value is 4 standing for 2^4=16.
-# fpow maximum number of elements of the free block pool by power of 2. If it is negative, the default value is specified. The default value is 10 standing for 2^10=1024.
-# dfunit unit step number of auto defragmentation. If it is not more than 0, the auto defragmentation is disabled. It is disabled by default.
-# mode "w" of writer, "r" of reader,"c" of creating,"t" of truncating ,"e" of no locking,"f" of non-blocking lock
-#
-# -- Links
-# http://1978th.net/tokyocabinet/spex-en.html
-# http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
-# Performance limits: http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
-
-
-h2. Tyrant: ttserver
-
-ttdb="test"
-ttserver -port 12009 -thnum 96 \
-  -dmn -pid /var/run/tyrant-${ttdb}.pid
-  -ulog /mnt/tmp/ttyrant/tyrant-$[ttdb}.ulog -ulim 268435456 -uas \
-  -log /var/log/ttyrant/tyrant-${ttdb}.log \
-  "/data/db/ttyrant/${ttdb}.tch#bnum=200000000#opts=l#rcnum=100000#xmsiz=536870912"
-
-can also add host, and umask out to be read-only
-
-* -host name : specify the host name or the address of the server. By default, every network address is bound.
-* -port num : specify the port number. By default, it is 1978.
-* -thnum num : specify the number of worker threads. By default, it is 8.
-* -tout num : specify the timeout of each session in seconds. By default, no timeout is specified.
-* -dmn : work as a daemon process.
-* -pid path : output the process ID into the file.
-* -kl : kill the existing process if the process ID file is detected.
-* -log path : output log messages into the file.
-* -ld : log debug messages also.
-* -le : log error messages only.
-* -ulog path : specify the update log directory.
-* -ulim num : specify the limit size of each update log file.
-* -uas : use asynchronous I/O for the update log.
-* -sid num : specify the server ID.
-* -mhost name : specify the host name of the replication master server.
-* -mport num : specify the port number of the replication master server.
-* -rts path : specify the replication time stamp file.
-* -rcc : check consistency of replication.
-* -skel name : specify the name of the skeleton database library.
-* -mul num : specify the division number of the multiple database mechanism.
-* -ext path : specify the script language extension file.
-* -extpc name period : specify the function name and the calling period of a periodic command.
-* -mask expr : specify the names of forbidden commands.
-* -unmask expr : specify the names of allowed commands.
-
-
-h2. From "Wolfgang Gassler":http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
-
-On Sat, Dec 05, 2009 at 09:32:20PM +0100, Wolfgang Gassler wrote:
-> Hi,
-
-> did anybody look up some of the folowing parameters in the code or can
-> explain them in detail? I just have a guess what they really mean and
-> the short description at the docu homepage
-> http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parame...
-> explain them very roughly. Also the already posted blog post
-> http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parame...
-> couldn't help.
-
-this is what I gleaned from reading the source code for the hash database
-format ( tchdb.c and tchdb.h ).
-
-> xmsiz
-
-On a TC Hash database, from the beginning of the file, to the end of the bucket
-section, all of that space is mmap'd. Setting 'xmsiz' sets the minimum amount
-of space that is mmap'd. Since 67108864 is the default, this means, that an a
-minimum, the first 64MiB of the file will be mmap'd.
-
-If the header size, plus the bucket region is greater than 'xmsize', then xmsiz
-appers to have no affect.
-
-> apow
-
-On a TC Hash database, 'apow' determines on what byte alignment each record will
-sit. 'apow' is a power of 2. This means that when apow is 4 ( the default for
-hash databases) all records in the database are aligned on a 16 byte boundary,
-in the database file.
-
-This means that every record will take up at a minumum 16 bytes of space, and
-all records are padded to a length that is a multiple of 16.
-
-> fpow
-
-On a TC Hash database, 'fpow' determines the maximum number of free blocks that
-can exist in the free block pool. This is also a power-of-2 parameter so with
-the default in a Hash database of 10, this means that there can be a maximum
-of 2^10, or 1024 free blocks in the database.
-
-Free blocks come into existence when records are deleted from the database
-and their space in the db file is up for reuse. If you never delete an
-item from the database, you will never have any free blocks.
-
-> dfunit
-
-On a TC Hash database, 'dfunit' describes how defragmentation takes place.
-Every time a free block is created a 'dfcnt' is incremented. When 'dfcnt'
-is greater than 'dfunit' and 'dfunit' is greater than 0, defragmentation
-takes place.
-
-I don't know precisely what defragmentation does in TC. A cursory look
-at 'tchdbdefragimpl', the function implementing defagmentation for hash
-databases, it looks like it moves records around filling up free blocks
-in the hash db with real records from the end of the file and then making the
-file smaller if possible.
-
-Basically it moves records around minimizing dead space in the file.
-
-Again, defragmentation will only take place if 'dfunit' has a positive
-value and you remove records from the db creating free blocks.
-
-enjoy,
-
--jeremy
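To make the apow note above concrete, here is the padding arithmetic it describes, worked in plain Ruby (my own illustration, not taken from the notes):

apow      = 4
alignment = 2 ** apow   # => 16-byte record alignment, the hash-database default

raw_size    = 23        # bytes actually needed by some record
padded_size = ((raw_size + alignment - 1) / alignment) * alignment
# => 32 -- every record occupies a multiple of 16 bytes, and at least 16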
data/lib/wukong/models/graph.rb
DELETED
@@ -1,25 +0,0 @@
-
-module Wukong
-  module Models
-    Edge = TypedStruct.new(
-      [:src,  Integer],
-      [:dest, Integer]
-      )
-
-    MultiEdge = TypedStruct.new(
-      [:src,            Integer],
-      [:dest,           Integer],
-      [:a_follows_b,    Integer],
-      [:b_follows_a,    Integer],
-      [:a_replies_b,    Integer],
-      [:b_replies_a,    Integer],
-      [:a_atsigns_b,    Integer],
-      [:b_atsigns_a,    Integer],
-      [:a_retweets_b,   Integer],
-      [:b_retweets_a,   Integer],
-      [:a_favorites_b,  Integer],
-      [:b_favorites_a,  Integer]
-      )
-
-  end
-end
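TypedStruct members are positional, as in HFile.new_from_ls in dfs.rb above, so building one of these models looks like the sketch below (the ids are made up):

edge = Wukong::Models::Edge.new(12, 875)
edge.src     # => 12
edge.dest    # => 875
edge.to_a    # => [12, 875]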