wukong 1.4.9 → 1.4.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
#!/usr/bin/env ruby
#
# Reads tab-separated records from stdin and stores three lookup keys per
# record in the Redis server at localhost:6379:
#
#   'sn:'  + sn.downcase  => id
#   'sid:' + sid          => id
#   'uid:' + id           => "sn,sid,crat,scat"
#
# Every 10,000 input lines a throughput line (sn, fo, line count, elapsed
# seconds, lines/sec) is printed to stdout. Reading stops after 20,000,000
# lines.
#
require 'rubygems'
require 'redis'

RDB = Redis.new(:host => 'localhost', :port => 6379)

start_time   = Time.now.utc.to_f
iter         = 0
max_lines    = 20_000_000
report_every = 10_000

$stdin.each do |line|
  # Only id, scat, sn, fo, crat and sid are used; the other columns are ignored.
  _r, id, scat, sn, _pr, fo, _fr, _st, _fv, crat, sid, _full = line.chomp.split("\t")
  iter += 1
  break if iter > max_lines

  if (iter % report_every).zero?
    elapsed = Time.now.utc.to_f - start_time
    # fo.to_i keeps the report from raising when the column is missing
    puts "%-20s\t%7d\t%7d\t%7.2f\t%7.2f" % [sn, fo.to_i, iter, elapsed, iter.to_f / elapsed]
  end

  # A short or malformed line leaves trailing fields nil; treat nil the same
  # as an empty field instead of crashing the whole load on nil.empty?
  RDB['sn:'  + sn.downcase] = id unless sn.to_s.empty?
  RDB['sid:' + sid]         = id unless sid.to_s.empty?
  RDB['uid:' + id]          = [sn, sid, crat, scat].join(',') unless id.to_s.empty?
end
@@ -0,0 +1,124 @@
1
+ require 'tokyo_tyrant'
2
+ require 'tokyo_tyrant/balancer'
3
+
4
+ # -- Installing
5
+ # make sure tokyocabinet and tokyotyrant are installed (chef recipe)
6
+ # make sure ruby-tokyotyrant is installed
7
+ # ldconfig
8
+ # mkdir -p /data/db/ttyrant /var/run/tyrant /var/log/tyrant
9
+ #
10
+ # -- Starting
11
+ # ttserver -port 12001 -thnum 96 -tout 3 -pid /var/run/tyrant/user_ids.pid -kl -log /var/log/tyrant/user_ids.tch '/data/db/ttyrant/user_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
12
+ # ttserver -port 12002 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/screen_names.tch '/data/db/ttyrant/screen_names.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
13
+ # ttserver -port 12003 -thnum 96 -tout 3 -pid /var/run/tyrant/search_ids.pid -kl -log /var/log/tyrant/search_ids.tch '/data/db/ttyrant/search_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
14
+ # ttserver -port 12004 -thnum 96 -tout 3 -pid /var/run/tyrant/tweets_parsed.pid -kl -log /var/log/tyrant/tweets_parsed.tch '/data/db/ttyrant/tweets_parsed.tch#bnum=800000000#opts=l#rcnum=50000#xmsiz=268435456'
15
+ # ttserver -port 12005 -thnum 96 -tout 3 -pid /var/run/tyrant/users_parsed.pid -kl -log /var/log/tyrant/users_parsed.tch '/data/db/ttyrant/users_parsed.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
16
+ #
17
+ # -- Monitoring
18
+ # tcrmgr inform -port $port -st $hostname
19
+ # active conns:
20
+ # lsof -i | grep ttserver | wc -l
21
+ # netstat -a -W | grep ':120' | ruby -ne 'puts $_.split(/ +/)[3 .. 4].join("\t")' | sort | cut -d: -f1-2 | uniq -c | sort -n
22
+ # use db.rnum for most lightweight ping method
23
+ #
24
+ # -- Tuning
25
+ # http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parameters.html
26
+ # http://capttofu.livejournal.com/23381.html
27
+ # http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
28
+ # opts "l" of large option (the size of the database can be larger than 2GB by using 64-bit bucket array.), "d" of Deflate option (each record is compressed with Deflate encoding), "b" of BZIP2 option, "t" of TCBS option
29
+ # bnum number of elements of the bucket array. If it is not more than 0, the default value is specified. The default value is 131071 (128K). Suggested size of the bucket array is about from 0.5 to 4 times of the number of all records to be stored.
30
+ # rcnum maximum number of records to be cached. If it is not more than 0, the record cache is disabled. It is disabled by default.
31
+ # xmsiz size of the extra mapped memory. If it is not more than 0, the extra mapped memory is disabled. The default size is 67108864 (64MB).
32
+ # apow size of record alignment by power of 2. If it is negative, the default value is specified. The default value is 4 standing for 2^4=16.
33
+ # fpow maximum number of elements of the free block pool by power of 2. If it is negative, the default value is specified. The default value is 10 standing for 2^10=1024.
34
+ # dfunit unit step number of auto defragmentation. If it is not more than 0, the auto defragmentation is disabled. It is disabled by default.
35
+ # mode "w" of writer, "r" of reader,"c" of creating,"t" of truncating ,"e" of no locking,"f" of non-blocking lock
36
+ #
37
+ # -- Links
38
+ # http://1978th.net/tokyocabinet/spex-en.html
39
+ # http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
40
+
41
+
42
#
# Reopens the balancer base class to pass a timeout and retry flag through to
# each per-server connection, and to add a #close that shuts every one down.
#
class TokyoTyrant::Balancer::Base
  #
  # hostnames    -- array of "host:port" strings, one per server
  # timeout      -- handed to each connection's constructor
  # should_retry -- handed to each connection's constructor
  #
  # +klass+ and +servers+ are not defined here; they are expected to come
  # from the rest of the balancer class.
  #
  def initialize(hostnames = [], timeout = 20.0, should_retry = true)
    @servers = hostnames.map do |hostname|
      host, port = hostname.split(':')
      klass.new(host, port.to_i, timeout, should_retry)
    end
    # yes, for some reason it's spelled 'Constistent' here
    # DO NOT fix it because it goes deep...
    @ring = TokyoTyrant::ConstistentHash.new(servers)
  end

  #
  # Closes every server connection, best-effort: an error from one close is
  # swallowed and counted as a failure.
  #
  # Returns true only if every connection reported a truthy close.
  #
  def close
    # Collect all the results first: calling all? with the closing block
    # would stop at the first failure and leave the remaining servers open.
    outcomes = @servers.map do |server|
      begin
        server.close
      rescue StandardError
        nil
      end
    end
    outcomes.all?
  end

end
58
+
59
module TokyoDbConnection
  #
  # Lazily-connected handle on one named Tokyo Tyrant dataset, balanced
  # across DB_SERVERS. The dataset name picks the port from DB_PORTS.
  #
  # Errors raised while reading or writing are logged with warn, the
  # connection is dropped (and re-created on the next call), and the failed
  # call returns nil.
  #
  class TyrantDb
    attr_reader :dataset

    DB_SERVERS = [
      '10.194.101.156',
      '10.196.73.156',
      '10.196.75.47',
      '10.242.217.140',
    ].freeze unless defined?(TokyoDbConnection::TyrantDb::DB_SERVERS)

    DB_PORTS = {
      :user_ids      => 12001,
      :screen_names  => 12002,
      :search_ids    => 12003,
      :tweets_parsed => 12004,
      :users_parsed  => 12005,
    } unless defined?(TokyoDbConnection::TyrantDb::DB_PORTS)

    # dataset -- a key of DB_PORTS, e.g. :user_ids
    def initialize dataset
      @dataset = dataset
    end

    #
    # The balanced connection for this dataset, created on first use.
    #
    # Raises RuntimeError if the dataset has no entry in DB_PORTS.
    #
    def db
      return @db if @db
      port = DB_PORTS[dataset]
      raise "Don't know how to reach dataset #{dataset}" unless port
      @db = TokyoTyrant::Balancer::DB.new(DB_SERVERS.map{|s| s+':'+port.to_s})
      # @db = TokyoTyrant::DB.new(DB_SERVERS.first, port.to_i)
      @db
    end

    # Straight pass-throughs to the underlying connection (no error handling).
    def [](*args)      ; db[*args]        ; end
    def size(*args)    ; db.size(*args)   ; end
    def vanish!(*args) ; db.vanish(*args) ; end

    #
    # Store value under key in the tyrant database using putnr.
    #
    # Returns putnr's result, or nil if the write raised.
    #
    def insert key, value
      db.putnr(key, value)
    rescue StandardError => e
      handle_error("Insert #{[key, value].inspect}", e)
    end

    # Store an array as a single comma-joined string value.
    def insert_array key, value
      insert(key, value.join(','))
    end

    #
    # Fetch from the tyrant database.
    #
    # Returns db.get's result, or nil if the read raised.
    #
    def get *args
      db.get(*args)
    rescue StandardError => e
      handle_error("Fetch #{args.inspect}", e)
    end

    #
    # Log a failed action and drop the connection.
    #
    # Always returns nil, so a failed insert/get yields nil to its caller
    # rather than leaking the return value of the sleep in invalidate!.
    #
    def handle_error action, e
      warn "#{action} failed: #{e} #{Array(e.backtrace).join("\t")}"
      invalidate!
      nil
    end

    #
    # Close and forget the current connection, then pause two seconds before
    # returning so the next call does not reconnect immediately.
    #
    def invalidate!
      (@db && @db.close) or warn "Couldn't close #{@db.inspect}"
      @db = nil
      sleep 2
    end
  end
end
124
+
@@ -0,0 +1,145 @@
1
+
2
+ # -- Installing
3
+ # make sure tokyocabinet and tokyotyrant are installed (chef recipe)
4
+ # make sure ruby-tokyotyrant is installed
5
+ # ldconfig
6
+ # mkdir -p /data/db/ttyrant /var/run/tyrant /var/log/tyrant
7
+ #
8
+ # -- Starting
9
+ # ttserver -port 12001 -thnum 96 -tout 3 -pid /var/run/tyrant/user_ids.pid -kl -log /var/log/tyrant/user_ids.tch '/data/db/ttyrant/user_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
10
+ # ttserver -port 12002 -thnum 96 -tout 3 -pid /var/run/tyrant/screen_names.pid -kl -log /var/log/tyrant/screen_names.tch '/data/db/ttyrant/screen_names.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
11
+ # ttserver -port 12003 -thnum 96 -tout 3 -pid /var/run/tyrant/search_ids.pid -kl -log /var/log/tyrant/search_ids.tch '/data/db/ttyrant/search_ids.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
12
+ # ttserver -port 12004 -thnum 96 -tout 3 -pid /var/run/tyrant/tweets_parsed.pid -kl -log /var/log/tyrant/tweets_parsed.tch '/data/db/ttyrant/tweets_parsed.tch#bnum=800000000#opts=l#rcnum=50000#xmsiz=268435456'
13
+ # ttserver -port 12005 -thnum 96 -tout 3 -pid /var/run/tyrant/users_parsed.pid -kl -log /var/log/tyrant/users_parsed.tch '/data/db/ttyrant/users_parsed.tch#bnum=100000000#opts=l#rcnum=50000#xmsiz=268435456'
14
+ #
15
+ # -- Monitoring
16
+ # tcrmgr inform -port $port -st $hostname
17
+ # active conns:
18
+ # lsof -i | grep ttserver | wc -l
19
+ # netstat -a -W | grep ':120' | ruby -ne 'puts $_.split(/ +/)[3 .. 4].join("\t")' | sort | cut -d: -f1-2 | uniq -c | sort -n
20
+ # use db.rnum for most lightweight ping method
21
+ #
22
+ # -- Tuning
23
+ # http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parameters.html
24
+ # http://capttofu.livejournal.com/23381.html
25
+ # http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
26
+ # opts "l" of large option (the size of the database can be larger than 2GB by using 64-bit bucket array.), "d" of Deflate option (each record is compressed with Deflate encoding), "b" of BZIP2 option, "t" of TCBS option
27
+ # bnum number of elements of the bucket array. If it is not more than 0, the default value is specified. The default value is 131071 (128K). Suggested size of the bucket array is about from 0.5 to 4 times of the number of all records to be stored.
28
+ # rcnum maximum number of records to be cached. If it is not more than 0, the record cache is disabled. It is disabled by default.
29
+ # xmsiz size of the extra mapped memory. If it is not more than 0, the extra mapped memory is disabled. The default size is 67108864 (64MB).
30
+ # apow size of record alignment by power of 2. If it is negative, the default value is specified. The default value is 4 standing for 2^4=16.
31
+ # fpow maximum number of elements of the free block pool by power of 2. If it is negative, the default value is specified. The default value is 10 standing for 2^10=1024.
32
+ # dfunit unit step number of auto defragmentation. If it is not more than 0, the auto defragmentation is disabled. It is disabled by default.
33
+ # mode "w" of writer, "r" of reader,"c" of creating,"t" of truncating ,"e" of no locking,"f" of non-blocking lock
34
+ #
35
+ # -- Links
36
+ # http://1978th.net/tokyocabinet/spex-en.html
37
+ # http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
38
+ # Performance limits: http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/3bd2a93322c09eec#
39
+
40
+
41
+ h2. Tyrant: ttserver
42
+
43
+ ttdb="test"
44
+ ttserver -port 12009 -thnum 96 \
45
+ -dmn -pid /var/run/tyrant-${ttdb}.pid \
46
+ -ulog /mnt/tmp/ttyrant/tyrant-${ttdb}.ulog -ulim 268435456 -uas \
47
+ -log /var/log/ttyrant/tyrant-${ttdb}.log \
48
+ "/data/db/ttyrant/${ttdb}.tch#bnum=200000000#opts=l#rcnum=100000#xmsiz=536870912"
49
+
50
+ You can also add -host, and use -mask to forbid update commands so that the server is read-only.
51
+
52
+ * -host name : specify the host name or the address of the server. By default, every network address is bound.
53
+ * -port num : specify the port number. By default, it is 1978.
54
+ * -thnum num : specify the number of worker threads. By default, it is 8.
55
+ * -tout num : specify the timeout of each session in seconds. By default, no timeout is specified.
56
+ * -dmn : work as a daemon process.
57
+ * -pid path : output the process ID into the file.
58
+ * -kl : kill the existing process if the process ID file is detected.
59
+ * -log path : output log messages into the file.
60
+ * -ld : log debug messages also.
61
+ * -le : log error messages only.
62
+ * -ulog path : specify the update log directory.
63
+ * -ulim num : specify the limit size of each update log file.
64
+ * -uas : use asynchronous I/O for the update log.
65
+ * -sid num : specify the server ID.
66
+ * -mhost name : specify the host name of the replication master server.
67
+ * -mport num : specify the port number of the replication master server.
68
+ * -rts path : specify the replication time stamp file.
69
+ * -rcc : check consistency of replication.
70
+ * -skel name : specify the name of the skeleton database library.
71
+ * -mul num : specify the division number of the multiple database mechanism.
72
+ * -ext path : specify the script language extension file.
73
+ * -extpc name period : specify the function name and the calling period of a periodic command.
74
+ * -mask expr : specify the names of forbidden commands.
75
+ * -unmask expr : specify the names of allowed commands.
76
+
77
+
78
+ h2. From "Wolfgang Gassler":http://groups.google.com/group/tokyocabinet-users/browse_thread/thread/5a46ee04006a791c#
79
+
80
+ On Sat, Dec 05, 2009 at 09:32:20PM +0100, Wolfgang Gassler wrote:
81
+ > Hi,
82
+
83
+ > did anybody look up some of the following parameters in the code or can
84
+ > explain them in detail? I just have a guess what they really mean and
85
+ > the short description at the docu homepage
86
+ > http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parame...
87
+ > explain them very roughly. Also the already posted blog post
88
+ > http://korrespondence.blogspot.com/2009/09/tokyo-tyrant-tuning-parame...
89
+ > couldn't help.
90
+
91
+ this is what I gleaned from reading the source code for the hash database
92
+ format ( tchdb.c and tchdb.h ).
93
+
94
+ > xmsiz
95
+
96
+ On a TC Hash database, from the beginning of the file, to the end of the bucket
97
+ section, all of that space is mmap'd. Setting 'xmsiz' sets the minimum amount
98
+ of space that is mmap'd. Since 67108864 is the default, this means that, at a
99
+ minimum, the first 64MiB of the file will be mmap'd.
100
+
101
+ If the header size, plus the bucket region is greater than 'xmsiz', then xmsiz
102
+ appears to have no effect.
103
+
104
+ > apow
105
+
106
+ On a TC Hash database, 'apow' determines on what byte alignment each record will
107
+ sit. 'apow' is a power of 2. This means that when apow is 4 ( the default for
108
+ hash databases) all records in the database are aligned on a 16 byte boundary,
109
+ in the database file.
110
+
111
+ This means that every record will take up at a minimum 16 bytes of space, and
112
+ all records are padded to a length that is a multiple of 16.
113
+
114
+ > fpow
115
+
116
+ On a TC Hash database, 'fpow' determines the maximum number of free blocks that
117
+ can exist in the free block pool. This is also a power-of-2 parameter so with
118
+ the default in a Hash database of 10, this means that there can be a maximum
119
+ of 2^10, or 1024 free blocks in the database.
120
+
121
+ Free blocks come into existence when records are deleted from the database
122
+ and their space in the db file is up for reuse. If you never delete an
123
+ item from the database, you will never have any free blocks.
124
+
125
+ > dfunit
126
+
127
+ On a TC Hash database, 'dfunit' describes how defragmentation takes place.
128
+ Every time a free block is created a 'dfcnt' is incremented. When 'dfcnt'
129
+ is greater than 'dfunit' and 'dfunit' is greater than 0, defragmentation
130
+ takes place.
131
+
132
+ I don't know precisely what defragmentation does in TC. A cursory look
133
+ at 'tchdbdefragimpl', the function implementing defragmentation for hash
134
+ databases, it looks like it moves records around filling up free blocks
135
+ in the hash db with real records from the end of the file and then making the
136
+ file smaller if possible.
137
+
138
+ Basically it moves records around minimizing dead space in the file.
139
+
140
+ Again, defragmentation will only take place if 'dfunit' has a positive
141
+ value and you remove records from the db creating free blocks.
142
+
143
+ enjoy,
144
+
145
+ -jeremy
@@ -0,0 +1,57 @@
1
+ Settings.define :log_interval, :default => 1000, :type => Integer, :description => 'How many iterations between log statements'
2
+
3
+ #
4
+ # Periodic monitor
5
+ #
6
+ #
7
+ # This is very much a work in progress
8
+ #
9
#
# Counts iterations and, once every +interval+ calls to #periodically,
# either yields to the caller's block or writes a tab-separated progress
# line to $stderr.
#
# This is very much a work in progress
#
class PeriodicMonitor
  attr_reader :iter, :start_time, :options, :interval

  #
  # extra_options -- hash merged into #options; :log_interval sets the
  #                  interval, falling back to Settings[:log_interval].
  #
  def initialize extra_options={}
    @options = {}
    @options.deep_merge!( extra_options || {} )
    @iter       = 0
    @start_time = now
    self.interval = (options[:log_interval] || Settings[:log_interval])
  end

  #
  # Set how many iterations pass between reports. The value is converted
  # with to_i, and anything below 1 falls back to 1000 -- the same rule the
  # constructor applies -- so #ready? can never divide by zero.
  #
  def interval= new_interval
    count = new_interval.to_i
    @interval = (count >= 1) ? count : 1000
  end

  #
  # Count one iteration. On every interval-th call, yield (iter, *args) to
  # the block if one was given, otherwise print #progress to $stderr.
  #
  # Returns the block's value when it ran, nil otherwise.
  #
  def periodically *args, &block
    incr!
    return unless ready?
    if block
      block.call(iter, *args)
    else
      $stderr.puts progress(*args)
    end
  end

  # Advance the iteration counter by one.
  def incr!
    @iter += 1
  end

  # True when the iteration count is a multiple of the interval.
  def ready?
    iter % @interval == 0
  end

  #
  # Tab-separated progress line: iteration count, elapsed seconds, rate per
  # second, the current time (via to_flat), then any extra values passed in.
  #
  def progress *stuff
    [
      "%15d" % iter,
      "%7.1f"% elapsed_time, "sec",
      "%7.1f"%(iter.to_f / elapsed_time), "/sec",
      now.to_flat,
      *stuff
    ].flatten.join("\t")
  end

  # Seconds since the monitor was created.
  def elapsed_time
    now - start_time
  end

  # Current time in UTC.
  def now
    Time.now.utc
  end
end
@@ -27,6 +27,8 @@ module Wukong
27
27
  Settings.define :timeout, :jobconf => true, :description => 'mapred.task.timeout', :wukong => true
28
28
  Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
29
29
  Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
30
+ Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent', :wukong => true
31
+ Settings.define :io_sort_mb, :jobconf => true, :description => 'io.sort.mb', :wukong => true
30
32
  Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
31
33
  Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
32
34
  # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
@@ -98,7 +100,7 @@ module Wukong
98
100
  # root of your config install.
99
101
  [
100
102
  hadoop_runner,
101
- "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*-streaming.jar",
103
+ "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
102
104
  hadoop_partition_args,
103
105
  hadoop_sort_args,
104
106
  hadoop_num_tasks_args,
@@ -83,6 +83,7 @@
83
83
  super
84
84
  # don't finalize if we never saw any field at all
85
85
  finalize(){|record| emit record } unless (self.key == :__first_pass__)
86
+ after_stream
86
87
  end
87
88
  end
88
89
 
@@ -0,0 +1,61 @@
1
+ # Defines a base class for streaming data into a cassandra db connection.
2
+ require 'cassandra' ; include Cassandra::Constants
3
module Wukong
  module Streamer

    #
    # Base class for streamers that push records into a Cassandra connection
    # in batches of up to +batch_size+ records. Subclasses must override
    # #process to do the actual insert.
    #
    class CassandraStreamer < Wukong::Streamer::Base
      attr_accessor :batch_count, :batch_record_count, :batch_size, :column_space, :db_seeds, :cassandra_db

      #
      # Resets the batch counters and fills in any setting still unset after
      # the superclass constructor has run: column_space 'Twitter',
      # batch_size 100, a built-in list of seed hosts on port 9160, and a
      # Cassandra client built from column_space and db_seeds.
      #
      def initialize *args
        super(*args)
        self.batch_count        = 0
        self.batch_record_count = 0
        self.column_space ||= 'Twitter'
        self.batch_size   ||= 100
        self.db_seeds     ||= %w[10.244.191.178 10.243.19.223 10.243.17.219 10.245.70.85 10.244.206.241].map{ |s| s.to_s+':9160'}
        self.cassandra_db ||= Cassandra.new(self.column_space, self.db_seeds)
      end

      #
      # Reads stdin to the end, handling the lines in batches: each batch
      # runs inside #start_batch and ends once batch_size records have been
      # processed or the input runs out.
      #
      def stream
        while still_lines? do
          start_batch do
            while still_lines? && batch_not_full? do
              line = get_line
              # lines that recordize to nothing or to a blank record are
              # skipped and do not count toward the batch size
              record = recordize(line.chomp) or next
              next if record.blank?
              process(*record) do |output_record|
                emit output_record
              end
              self.batch_record_count += 1
            end
          end
        end
      end

      #
      # Hook for subclasses: insert one record into the cassandra db.
      #
      # Raises RuntimeError unless overridden.
      #
      def process *args, &blk
        raise "Overwrite this method to insert into cassandra db"
      end

      #
      # Begins a new batch: zeroes the per-batch record counter, bumps the
      # batch counter, and hands the block to cassandra_db.batch.
      #
      def start_batch &blk
        self.batch_record_count = 0
        self.batch_count += 1
        self.cassandra_db.batch(&blk)
      end

      # Next line from stdin.
      def get_line
        $stdin.gets
      end

      # True while stdin has more input.
      def still_lines?
        !$stdin.eof?
      end

      # True while the current batch holds fewer than batch_size records.
      def batch_not_full?
        self.batch_record_count < self.batch_size
      end

    end
  end

end
61
+
@@ -1,16 +1,18 @@
1
1
  module Wukong
2
2
  module Streamer
3
- autoload :Base, 'wukong/streamer/base'
4
- autoload :LineStreamer, 'wukong/streamer/line_streamer'
5
- autoload :RecordStreamer, 'wukong/streamer/record_streamer'
6
- autoload :StructStreamer, 'wukong/streamer/struct_streamer'
7
- autoload :StructRecordizer, 'wukong/streamer/struct_streamer'
3
+ autoload :Base, 'wukong/streamer/base'
4
+ autoload :LineStreamer, 'wukong/streamer/line_streamer'
5
+ autoload :RecordStreamer, 'wukong/streamer/record_streamer'
6
+ autoload :StructStreamer, 'wukong/streamer/struct_streamer'
7
+ autoload :StructRecordizer, 'wukong/streamer/struct_streamer'
8
+ # cassandra goodies
9
+ autoload :CassandraStreamer, 'wukong/streamer/cassandra_streamer'
8
10
  #
9
- autoload :Filter, 'wukong/streamer/filter'
11
+ autoload :Filter, 'wukong/streamer/filter'
10
12
  #
11
- autoload :AccumulatingReducer, 'wukong/streamer/accumulating_reducer'
12
- autoload :ListReducer, 'wukong/streamer/list_reducer'
13
- autoload :UniqByLastReducer, 'wukong/streamer/uniq_by_last_reducer'
14
- autoload :CountingReducer, 'wukong/streamer/counting_reducer'
13
+ autoload :AccumulatingReducer, 'wukong/streamer/accumulating_reducer'
14
+ autoload :ListReducer, 'wukong/streamer/list_reducer'
15
+ autoload :UniqByLastReducer, 'wukong/streamer/uniq_by_last_reducer'
16
+ autoload :CountingReducer, 'wukong/streamer/counting_reducer'
15
17
  end
16
18
  end
data/wukong.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "1.4.9"
8
+ s.version = "1.4.10"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2010-06-06}
12
+ s.date = %q{2010-07-19}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
@@ -29,6 +29,7 @@ Gem::Specification.new do |s|
29
29
  "INSTALL.textile",
30
30
  "LICENSE.textile",
31
31
  "README.textile",
32
+ "TODO.textile",
32
33
  "bin/cutc",
33
34
  "bin/cuttab",
34
35
  "bin/greptrue",
@@ -38,6 +39,7 @@ Gem::Specification.new do |s|
38
39
  "bin/hdp-du",
39
40
  "bin/hdp-get",
40
41
  "bin/hdp-kill",
42
+ "bin/hdp-kill-task",
41
43
  "bin/hdp-ls",
42
44
  "bin/hdp-mkdir",
43
45
  "bin/hdp-mv",
@@ -62,6 +64,7 @@ Gem::Specification.new do |s|
62
64
  "bin/wu-sum",
63
65
  "docpages/INSTALL.textile",
64
66
  "docpages/LICENSE.textile",
67
+ "docpages/README-performance.textile",
65
68
  "docpages/README-wulign.textile",
66
69
  "docpages/UsingWukong-part1-get_ready.textile",
67
70
  "docpages/UsingWukong-part2-ThinkingBigData.textile",
@@ -118,14 +121,17 @@ Gem::Specification.new do |s|
118
121
  "docpages/usage.textile",
119
122
  "docpages/wutils.textile",
120
123
  "examples/README.txt",
124
+ "examples/binning_percentile_estimator.rb",
121
125
  "examples/contrib/jeans/README.markdown",
122
126
  "examples/contrib/jeans/data/normalized_sizes",
123
127
  "examples/contrib/jeans/data/orders.tsv",
124
128
  "examples/contrib/jeans/data/sizes",
125
129
  "examples/contrib/jeans/normalize.rb",
126
130
  "examples/contrib/jeans/sizes.rb",
131
+ "examples/corpus/words_to_bigrams.rb",
127
132
  "examples/count_keys.rb",
128
133
  "examples/count_keys_at_mapper.rb",
134
+ "examples/keystore/cassandra_batch_test.rb",
129
135
  "examples/keystore/conditional_outputter_example.rb",
130
136
  "examples/network_graph/adjacency_list.rb",
131
137
  "examples/network_graph/breadth_first_search.rb",
@@ -160,6 +166,7 @@ Gem::Specification.new do |s|
160
166
  "lib/wukong/extensions/class.rb",
161
167
  "lib/wukong/extensions/date_time.rb",
162
168
  "lib/wukong/extensions/emittable.rb",
169
+ "lib/wukong/extensions/enumerable.rb",
163
170
  "lib/wukong/extensions/hash.rb",
164
171
  "lib/wukong/extensions/hash_keys.rb",
165
172
  "lib/wukong/extensions/hash_like.rb",
@@ -170,8 +177,12 @@ Gem::Specification.new do |s|
170
177
  "lib/wukong/extensions/struct.rb",
171
178
  "lib/wukong/extensions/symbol.rb",
172
179
  "lib/wukong/keystore/cassandra_conditional_outputter.rb",
180
+ "lib/wukong/keystore/redis_db.rb",
181
+ "lib/wukong/keystore/tyrant_db.rb",
182
+ "lib/wukong/keystore/tyrant_notes.textile",
173
183
  "lib/wukong/logger.rb",
174
184
  "lib/wukong/models/graph.rb",
185
+ "lib/wukong/periodic_monitor.rb",
175
186
  "lib/wukong/rdf.rb",
176
187
  "lib/wukong/schema.rb",
177
188
  "lib/wukong/script.rb",
@@ -180,6 +191,7 @@ Gem::Specification.new do |s|
180
191
  "lib/wukong/streamer.rb",
181
192
  "lib/wukong/streamer/accumulating_reducer.rb",
182
193
  "lib/wukong/streamer/base.rb",
194
+ "lib/wukong/streamer/cassandra_streamer.rb",
183
195
  "lib/wukong/streamer/count_keys.rb",
184
196
  "lib/wukong/streamer/count_lines.rb",
185
197
  "lib/wukong/streamer/counting_reducer.rb",
@@ -208,51 +220,56 @@ Gem::Specification.new do |s|
208
220
  s.homepage = %q{http://mrflip.github.com/wukong}
209
221
  s.rdoc_options = ["--charset=UTF-8"]
210
222
  s.require_paths = ["lib"]
211
- s.rubygems_version = %q{1.3.6}
223
+ s.rubygems_version = %q{1.3.7}
212
224
  s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
213
225
  s.test_files = [
214
226
  "spec/spec_helper.rb",
215
227
  "spec/wukong/encoding_spec.rb",
216
228
  "spec/wukong/script_spec.rb",
217
- "examples/contrib/jeans/normalize.rb",
218
- "examples/contrib/jeans/sizes.rb",
219
- "examples/count_keys.rb",
220
- "examples/count_keys_at_mapper.rb",
221
- "examples/keystore/conditional_outputter_example.rb",
222
- "examples/network_graph/adjacency_list.rb",
223
- "examples/network_graph/breadth_first_search.rb",
224
- "examples/network_graph/gen_2paths.rb",
225
- "examples/network_graph/gen_multi_edge.rb",
226
- "examples/network_graph/gen_symmetric_links.rb",
227
229
  "examples/pagerank/pagerank.rb",
228
230
  "examples/pagerank/pagerank_initialize.rb",
229
- "examples/rank_and_bin.rb",
230
231
  "examples/sample_records.rb",
231
232
  "examples/server_logs/apache_log_parser.rb",
232
233
  "examples/server_logs/breadcrumbs.rb",
233
234
  "examples/server_logs/user_agent.rb",
235
+ "examples/corpus/words_to_bigrams.rb",
236
+ "examples/count_keys.rb",
237
+ "examples/rank_and_bin.rb",
238
+ "examples/binning_percentile_estimator.rb",
234
239
  "examples/size.rb",
240
+ "examples/network_graph/breadth_first_search.rb",
241
+ "examples/network_graph/gen_symmetric_links.rb",
242
+ "examples/network_graph/gen_multi_edge.rb",
243
+ "examples/network_graph/adjacency_list.rb",
244
+ "examples/network_graph/gen_2paths.rb",
245
+ "examples/keystore/cassandra_batch_test.rb",
246
+ "examples/keystore/conditional_outputter_example.rb",
235
247
  "examples/stats/avg_value_frequency.rb",
248
+ "examples/contrib/jeans/sizes.rb",
249
+ "examples/contrib/jeans/normalize.rb",
250
+ "examples/word_count.rb",
236
251
  "examples/stupidly_simple_filter.rb",
237
- "examples/word_count.rb"
252
+ "examples/count_keys_at_mapper.rb"
238
253
  ]
239
254
 
240
255
  if s.respond_to? :specification_version then
241
256
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
242
257
  s.specification_version = 3
243
258
 
244
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
259
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
245
260
  s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
246
261
  s.add_development_dependency(%q<yard>, [">= 0"])
247
262
  s.add_runtime_dependency(%q<addressable>, [">= 0"])
248
263
  s.add_runtime_dependency(%q<extlib>, [">= 0"])
249
264
  s.add_runtime_dependency(%q<htmlentities>, [">= 0"])
265
+ s.add_runtime_dependency(%q<configliere>, [">= 0"])
250
266
  else
251
267
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
252
268
  s.add_dependency(%q<yard>, [">= 0"])
253
269
  s.add_dependency(%q<addressable>, [">= 0"])
254
270
  s.add_dependency(%q<extlib>, [">= 0"])
255
271
  s.add_dependency(%q<htmlentities>, [">= 0"])
272
+ s.add_dependency(%q<configliere>, [">= 0"])
256
273
  end
257
274
  else
258
275
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
@@ -260,6 +277,7 @@ Gem::Specification.new do |s|
260
277
  s.add_dependency(%q<addressable>, [">= 0"])
261
278
  s.add_dependency(%q<extlib>, [">= 0"])
262
279
  s.add_dependency(%q<htmlentities>, [">= 0"])
280
+ s.add_dependency(%q<configliere>, [">= 0"])
263
281
  end
264
282
  end
265
283