gandalf 0.0.2 → 0.0.3

data/lib/gandalf.rb CHANGED
@@ -4,13 +4,17 @@ require 'feedzirra'
  require 'activesupport'
  require 'rufus/scheduler'
  require 'dm-core'
- #has a bug
+ # Has a bug
  #require 'dm-aggregates'

  require 'digest/md5'
  require 'json'

+ require 'redis_ext/redis'
  require 'redis_ext/redis_queue'
+ require 'redis_ext/pipeline'
+ require 'redis_ext/dist_redis'
+ require 'redis_ext/hash_ring'

  require 'gandalf/scheduler'
  require 'gandalf/worker'

data/lib/gandalf/models.rb CHANGED
@@ -18,7 +18,6 @@ module Gandalf
  class Post
  include DataMapper::Resource

- storage_names[:default] = 'posts'
  property :id, Serial, :field => 'psid'
  property :channel_id, Integer, :length => 11, :index => true
  property :link, Text, :length => 255
@@ -35,6 +34,8 @@ module Gandalf
  self.title = self.title[0,255]
  if self.description
  self.description.gsub!(/\<[^\>]+\>|\n|&nbsp;/,' ')
+ self.description.gsub!(/&gt;/,'<')
+ self.description.gsub!(/&lt;/,'>')
  self.description.gsub!(/\s{2,}/,' ')
  end
  end

data/lib/gandalf/scheduler.rb CHANGED
@@ -17,9 +17,13 @@ module Gandalf
  attr_accessor :redis

  # Sets workers' queues with Redis connection object.
- def setup(options = {:seed_class => Seed})
+ def setup(options = {})
  @redis = Redis.new(:host => self.redis_host, :db => self.redis_db_id)
- @Seed = options[:seed_class]
+ if options[:seed_class]
+ @Seed = options[:seed_class]
+ else
+ @Seed = Seed
+ end
  workers.each { |worker| worker.setup(:redis => @redis) }
  end

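With the change above, setup falls back to the built-in Seed model whenever :seed_class is absent, instead of leaving @Seed nil when any options hash is passed. A minimal sketch of both call styles, assuming the class is Gandalf::Scheduler, that its redis_host/redis_db_id attributes can be set at construction, and that a Redis server is reachable; CustomSeed is a hypothetical stand-in:

    require 'gandalf'

    scheduler = Gandalf::Scheduler.new(:redis_host => 'localhost', :redis_db_id => 0)

    # No :seed_class given: @Seed falls back to the built-in Seed model.
    scheduler.setup

    # Injecting a custom, Seed-compatible model (hypothetical).
    scheduler.setup(:seed_class => CustomSeed)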

data/lib/gandalf/worker.rb CHANGED
@@ -12,15 +12,19 @@ module Gandalf

  belongs_to :scheduler

- def setup(options = {:post_class => Post})
+ def setup(options = {})
  @queue = RedisQueue.new(:key => self.id, :redis => options[:redis]) unless @queue
- @Post = (options[:post_class])
+ if options[:post_class]
+ @Post = options[:post_class]
+ else
+ @Post = Post
+ end
  end

  def run
  @crawl_scheduler = Rufus::Scheduler.start_new unless @crawl_scheduler
  @crawl_scheduler.every interval do
- crawl new_jobs(max_jobs)
+ crawl new_jobs(max_jobs) if jobs_to_do > 0
  end
  end

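The worker gets the same treatment for :post_class, and run now skips a crawl cycle entirely when jobs_to_do reports nothing queued. A rough sketch of wiring one up by hand, assuming the class is Gandalf::Worker, that interval and max_jobs are attributes on the worker record, and that a local Redis server is running:

    require 'gandalf'

    worker = Gandalf::Worker.new(:interval => 60, :max_jobs => 10)

    # Post is used unless a :post_class override is supplied.
    worker.setup(:redis => Redis.new(:host => 'localhost'))

    # Schedules the recurring crawl; with the new guard an empty queue
    # no longer triggers crawl calls with zero jobs.
    worker.run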

data/lib/redis_ext/dist_redis.rb ADDED
@@ -0,0 +1,124 @@
+ require 'redis'
+ require 'hash_ring'
+ class DistRedis
+ attr_reader :ring
+ def initialize(opts={})
+ hosts = []
+
+ db = opts[:db] || nil
+ timeout = opts[:timeout] || nil
+
+ raise Error, "No hosts given" unless opts[:hosts]
+
+ opts[:hosts].each do |h|
+ host, port = h.split(':')
+ hosts << Redis.new(:host => host, :port => port, :db => db, :timeout => timeout)
+ end
+
+ @ring = HashRing.new hosts
+ end
+
+ def node_for_key(key)
+ key = $1 if key =~ /\{(.*)?\}/
+ @ring.get_node(key)
+ end
+
+ def add_server(server)
+ server, port = server.split(':')
+ @ring.add_node Redis.new(:host => server, :port => port)
+ end
+
+ def method_missing(sym, *args, &blk)
+ if redis = node_for_key(args.first.to_s)
+ redis.send sym, *args, &blk
+ else
+ super
+ end
+ end
+
+ def keys(glob)
+ @ring.nodes.map do |red|
+ red.keys(glob)
+ end
+ end
+
+ def save
+ on_each_node :save
+ end
+
+ def bgsave
+ on_each_node :bgsave
+ end
+
+ def quit
+ on_each_node :quit
+ end
+
+ def flush_all
+ on_each_node :flush_all
+ end
+ alias_method :flushall, :flush_all
+
+ def flush_db
+ on_each_node :flush_db
+ end
+ alias_method :flushdb, :flush_db
+
+ def delete_cloud!
+ @ring.nodes.each do |red|
+ red.keys("*").each do |key|
+ red.delete key
+ end
+ end
+ end
+
+ def on_each_node(command, *args)
+ @ring.nodes.each do |red|
+ red.send(command, *args)
+ end
+ end
+
+ end
+
+
+ if __FILE__ == $0
+
+ r = DistRedis.new 'localhost:6379', 'localhost:6380', 'localhost:6381', 'localhost:6382'
+ r['urmom'] = 'urmom'
+ r['urdad'] = 'urdad'
+ r['urmom1'] = 'urmom1'
+ r['urdad1'] = 'urdad1'
+ r['urmom2'] = 'urmom2'
+ r['urdad2'] = 'urdad2'
+ r['urmom3'] = 'urmom3'
+ r['urdad3'] = 'urdad3'
+ p r['urmom']
+ p r['urdad']
+ p r['urmom1']
+ p r['urdad1']
+ p r['urmom2']
+ p r['urdad2']
+ p r['urmom3']
+ p r['urdad3']
+
+ r.push_tail 'listor', 'foo1'
+ r.push_tail 'listor', 'foo2'
+ r.push_tail 'listor', 'foo3'
+ r.push_tail 'listor', 'foo4'
+ r.push_tail 'listor', 'foo5'
+
+ p r.pop_tail('listor')
+ p r.pop_tail('listor')
+ p r.pop_tail('listor')
+ p r.pop_tail('listor')
+ p r.pop_tail('listor')
+
+ puts "key distribution:"
+
+ r.ring.nodes.each do |red|
+ p [red.port, red.keys("*")]
+ end
+ r.delete_cloud!
+ p r.keys('*')
+
+ end
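DistRedis fans commands out over a HashRing of plain Redis clients: method_missing hashes the first argument (the key) to pick a node, and node_for_key honours a {...} hash-tag so related keys can be forced onto one node. A small sketch, assuming two local servers on the listed ports and the gem's lib directory on the load path:

    require 'redis_ext/dist_redis'

    r = DistRedis.new(:hosts => ['localhost:6379', 'localhost:6380'])

    # Plain keys may land on different nodes...
    r['user:1:name'] = 'frodo'
    r['user:2:name'] = 'sam'

    # ...while keys sharing a {tag} always hash to the same node.
    r['{session:42}:cart']  = 'ring'
    r['{session:42}:owner'] = 'frodo'

    p r.ring.nodes.map { |node| [node.server, node.keys('*')] }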

data/lib/redis_ext/hash_ring.rb ADDED
@@ -0,0 +1,128 @@
+ require 'zlib'
+
+ class HashRing
+
+ POINTS_PER_SERVER = 160 # this is the default in libmemcached
+
+ attr_reader :ring, :sorted_keys, :replicas, :nodes
+
+ # nodes is a list of objects that have a proper to_s representation.
+ # replicas indicates how many virtual points should be used pr. node,
+ # replicas are required to improve the distribution.
+ def initialize(nodes=[], replicas=POINTS_PER_SERVER)
+ @replicas = replicas
+ @ring = {}
+ @nodes = []
+ @sorted_keys = []
+ nodes.each do |node|
+ add_node(node)
+ end
+ end
+
+ # Adds a `node` to the hash ring (including a number of replicas).
+ def add_node(node)
+ @nodes << node
+ @replicas.times do |i|
+ key = Zlib.crc32("#{node}:#{i}")
+ @ring[key] = node
+ @sorted_keys << key
+ end
+ @sorted_keys.sort!
+ end
+
+ def remove_node(node)
+ @nodes.reject!{|n| n.to_s == node.to_s}
+ @replicas.times do |i|
+ key = Zlib.crc32("#{node}:#{i}")
+ @ring.delete(key)
+ @sorted_keys.reject! {|k| k == key}
+ end
+ end
+
+ # get the node in the hash ring for this key
+ def get_node(key)
+ get_node_pos(key)[0]
+ end
+
+ def get_node_pos(key)
+ return [nil,nil] if @ring.size == 0
+ crc = Zlib.crc32(key)
+ idx = HashRing.binary_search(@sorted_keys, crc)
+ return [@ring[@sorted_keys[idx]], idx]
+ end
+
+ def iter_nodes(key)
+ return [nil,nil] if @ring.size == 0
+ node, pos = get_node_pos(key)
+ @sorted_keys[pos..-1].each do |k|
+ yield @ring[k]
+ end
+ end
+
+ class << self
+
+ # gem install RubyInline to use this code
+ # Native extension to perform the binary search within the hashring.
+ # There's a pure ruby version below so this is purely optional
+ # for performance. In testing 20k gets and sets, the native
+ # binary search shaved about 12% off the runtime (9sec -> 8sec).
+ begin
+ require 'inline'
+ inline do |builder|
+ builder.c <<-EOM
+ int binary_search(VALUE ary, unsigned int r) {
+ int upper = RARRAY_LEN(ary) - 1;
+ int lower = 0;
+ int idx = 0;
+
+ while (lower <= upper) {
+ idx = (lower + upper) / 2;
+
+ VALUE continuumValue = RARRAY_PTR(ary)[idx];
+ unsigned int l = NUM2UINT(continuumValue);
+ if (l == r) {
+ return idx;
+ }
+ else if (l > r) {
+ upper = idx - 1;
+ }
+ else {
+ lower = idx + 1;
+ }
+ }
+ return upper;
+ }
+ EOM
+ end
+ rescue Exception => e
+ # Find the closest index in HashRing with value <= the given value
+ def binary_search(ary, value, &block)
+ upper = ary.size - 1
+ lower = 0
+ idx = 0
+
+ while(lower <= upper) do
+ idx = (lower + upper) / 2
+ comp = ary[idx] <=> value
+
+ if comp == 0
+ return idx
+ elsif comp > 0
+ upper = idx - 1
+ else
+ lower = idx + 1
+ end
+ end
+ return upper
+ end
+
+ end
+ end
+
+ end
+
+ # ring = HashRing.new ['server1', 'server2', 'server3']
+ # p ring
+ # #
+ # p ring.get_node "kjhjkjlkjlkkh"
+ #
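The ring hashes each node to `replicas` points (CRC32 of "node:i"), and a key belongs to the node whose point most closely precedes the key's CRC32, wrapping around the ring; adding or removing a node therefore only remaps the keys that lived on it. A quick sketch of that property with plain string nodes, assuming the gem's lib directory is on the load path:

    require 'redis_ext/hash_ring'

    ring = HashRing.new(['cache1', 'cache2', 'cache3'])

    before = (1..1000).map { |i| ring.get_node("key#{i}") }
    ring.remove_node('cache2')
    after = (1..1000).map { |i| ring.get_node("key#{i}") }

    # Only keys previously mapped to cache2 should have moved.
    moved = before.zip(after).count { |(b, a)| b != a }
    puts "#{moved} of 1000 keys remapped"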

data/lib/redis_ext/pipeline.rb ADDED
@@ -0,0 +1,22 @@
+ require "redis"
+
+ class Redis
+ class Pipeline < Redis
+ BUFFER_SIZE = 50_000
+
+ def initialize(redis)
+ @redis = redis
+ @commands = []
+ end
+
+ def call_command(command)
+ @commands << command
+ end
+
+ def execute
+ @redis.call_command(@commands)
+ @commands.clear
+ end
+
+ end
+ end
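Pipeline simply buffers each command array instead of writing it to the socket; execute then hands the whole batch to the wrapped client, whose call_command (in the client below) accepts an array of commands and issues them in a single write. A hedged usage sketch via the pipelined helper defined in that client, assuming a local Redis server and the gem's lib directory on the load path:

    require 'redis_ext/redis'
    require 'redis_ext/pipeline'

    redis = Redis.new(:host => 'localhost')

    redis.pipelined do |pipe|
      pipe.set('greeting', 'hello')
      pipe.incr('visits')
      pipe.push_tail('log', 'first entry')   # alias for RPUSH
    end
    # All three commands leave in one socket write when the block returns.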

data/lib/redis_ext/redis.rb ADDED
@@ -0,0 +1,316 @@
+ require 'socket'
+ require File.join(File.dirname(__FILE__),'pipeline')
+
+ begin
+ if RUBY_VERSION >= '1.9'
+ require 'timeout'
+ RedisTimer = Timeout
+ else
+ require 'system_timer'
+ RedisTimer = SystemTimer
+ end
+ rescue LoadError
+ RedisTimer = nil
+ end
+
+ class Redis
+ OK = "OK".freeze
+ MINUS = "-".freeze
+ PLUS = "+".freeze
+ COLON = ":".freeze
+ DOLLAR = "$".freeze
+ ASTERISK = "*".freeze
+
+ BULK_COMMANDS = {
+ "set" => true,
+ "setnx" => true,
+ "rpush" => true,
+ "lpush" => true,
+ "lset" => true,
+ "lrem" => true,
+ "sadd" => true,
+ "srem" => true,
+ "sismember" => true,
+ "echo" => true,
+ "getset" => true,
+ "smove" => true
+ }
+
+ BOOLEAN_PROCESSOR = lambda{|r| r == 1 }
+
+ REPLY_PROCESSOR = {
+ "exists" => BOOLEAN_PROCESSOR,
+ "sismember" => BOOLEAN_PROCESSOR,
+ "sadd" => BOOLEAN_PROCESSOR,
+ "srem" => BOOLEAN_PROCESSOR,
+ "smove" => BOOLEAN_PROCESSOR,
+ "move" => BOOLEAN_PROCESSOR,
+ "setnx" => BOOLEAN_PROCESSOR,
+ "del" => BOOLEAN_PROCESSOR,
+ "renamenx" => BOOLEAN_PROCESSOR,
+ "expire" => BOOLEAN_PROCESSOR,
+ "keys" => lambda{|r| r.split(" ")},
+ "info" => lambda{|r|
+ info = {}
+ r.each_line {|kv|
+ k,v = kv.split(":",2).map{|x| x.chomp}
+ info[k.to_sym] = v
+ }
+ info
+ }
+ }
+
+ ALIASES = {
+ "flush_db" => "flushdb",
+ "flush_all" => "flushall",
+ "last_save" => "lastsave",
+ "key?" => "exists",
+ "delete" => "del",
+ "randkey" => "randomkey",
+ "list_length" => "llen",
+ "push_tail" => "rpush",
+ "push_head" => "lpush",
+ "pop_tail" => "rpop",
+ "pop_head" => "lpop",
+ "list_set" => "lset",
+ "list_range" => "lrange",
+ "list_trim" => "ltrim",
+ "list_index" => "lindex",
+ "list_rm" => "lrem",
+ "set_add" => "sadd",
+ "set_delete" => "srem",
+ "set_count" => "scard",
+ "set_member?" => "sismember",
+ "set_members" => "smembers",
+ "set_intersect" => "sinter",
+ "set_intersect_store" => "sinterstore",
+ "set_inter_store" => "sinterstore",
+ "set_union" => "sunion",
+ "set_union_store" => "sunionstore",
+ "set_diff" => "sdiff",
+ "set_diff_store" => "sdiffstore",
+ "set_move" => "smove",
+ "set_unless_exists" => "setnx",
+ "rename_unless_exists" => "renamenx",
+ "type?" => "type"
+ }
+
+ DISABLED_COMMANDS = {
+ "monitor" => true,
+ "sync" => true
+ }
+
+ def initialize(options = {})
+ @host = options[:host] || '127.0.0.1'
+ @port = (options[:port] || 6379).to_i
+ @db = (options[:db] || 0).to_i
+ @timeout = (options[:timeout] || 5).to_i
+ @password = options[:password]
+ @logger = options[:logger]
+
+ @logger.info { self.to_s } if @logger
+ connect_to_server
+ end
+
+ def to_s
+ "Redis Client connected to #{server} against DB #{@db}"
+ end
+
+ def server
+ "#{@host}:#{@port}"
+ end
+
+ def connect_to_server
+ @sock = connect_to(@host, @port, @timeout == 0 ? nil : @timeout)
+ call_command(["auth",@password]) if @password
+ call_command(["select",@db]) unless @db == 0
+ end
+
+ def connect_to(host, port, timeout=nil)
+ # We support connect() timeout only if system_timer is availabe
+ # or if we are running against Ruby >= 1.9
+ # Timeout reading from the socket instead will be supported anyway.
+ if @timeout != 0 and RedisTimer
+ begin
+ sock = TCPSocket.new(host, port)
+ rescue Timeout::Error
+ @sock = nil
+ raise Timeout::Error, "Timeout connecting to the server"
+ end
+ else
+ sock = TCPSocket.new(host, port)
+ end
+ sock.setsockopt Socket::IPPROTO_TCP, Socket::TCP_NODELAY, 1
+
+ # If the timeout is set we set the low level socket options in order
+ # to make sure a blocking read will return after the specified number
+ # of seconds. This hack is from memcached ruby client.
+ if timeout
+ secs = Integer(timeout)
+ usecs = Integer((timeout - secs) * 1_000_000)
+ optval = [secs, usecs].pack("l_2")
+ sock.setsockopt Socket::SOL_SOCKET, Socket::SO_RCVTIMEO, optval
+ sock.setsockopt Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, optval
+ end
+ sock
+ end
+
+ def method_missing(*argv)
+ call_command(argv)
+ end
+
+ def call_command(argv)
+ @logger.debug { argv.inspect } if @logger
+
+ # this wrapper to raw_call_command handle reconnection on socket
+ # error. We try to reconnect just one time, otherwise let the error
+ # araise.
+ connect_to_server if !@sock
+
+ begin
+ raw_call_command(argv.dup)
+ rescue Errno::ECONNRESET, Errno::EPIPE
+ @sock.close
+ @sock = nil
+ connect_to_server
+ raw_call_command(argv.dup)
+ end
+ end
+
+ def raw_call_command(argvp)
+ pipeline = argvp[0].is_a?(Array)
+
+ unless pipeline
+ argvv = [argvp]
+ else
+ argvv = argvp
+ end
+
+ command = ''
+
+ argvv.each do |argv|
+ bulk = nil
+ argv[0] = argv[0].to_s.downcase
+ argv[0] = ALIASES[argv[0]] if ALIASES[argv[0]]
+ raise "#{argv[0]} command is disabled" if DISABLED_COMMANDS[argv[0]]
+ if BULK_COMMANDS[argv[0]] and argv.length > 1
+ bulk = argv[-1].to_s
+ argv[-1] = bulk.respond_to?(:bytesize) ? bulk.bytesize : bulk.size
+ end
+ command << "#{argv.join(' ')}\r\n"
+ command << "#{bulk}\r\n" if bulk
+ end
+
+ @sock.write(command)
+
+ results = argvv.map do |argv|
+ processor = REPLY_PROCESSOR[argv[0]]
+ processor ? processor.call(read_reply) : read_reply
+ end
+
+ return pipeline ? results : results[0]
+ end
+
+ def select(*args)
+ raise "SELECT not allowed, use the :db option when creating the object"
+ end
+
+ def [](key)
+ self.get(key)
+ end
+
+ def []=(key,value)
+ set(key,value)
+ end
+
+ def set(key, value, expiry=nil)
+ s = call_command([:set, key, value]) == OK
+ expire(key, expiry) if s && expiry
+ s
+ end
+
+ def sort(key, options = {})
+ cmd = ["SORT"]
+ cmd << key
+ cmd << "BY #{options[:by]}" if options[:by]
+ cmd << "GET #{[options[:get]].flatten * ' GET '}" if options[:get]
+ cmd << "#{options[:order]}" if options[:order]
+ cmd << "LIMIT #{options[:limit].join(' ')}" if options[:limit]
+ call_command(cmd)
+ end
+
+ def incr(key, increment = nil)
+ call_command(increment ? ["incrby",key,increment] : ["incr",key])
+ end
+
+ def decr(key,decrement = nil)
+ call_command(decrement ? ["decrby",key,decrement] : ["decr",key])
+ end
+
+ # Similar to memcache.rb's #get_multi, returns a hash mapping
+ # keys to values.
+ def mapped_mget(*keys)
+ mget(*keys).inject({}) do |hash, value|
+ key = keys.shift
+ value.nil? ? hash : hash.merge(key => value)
+ end
+ end
+
+ # Ruby defines a now deprecated type method so we need to override it here
+ # since it will never hit method_missing
+ def type(key)
+ call_command(['type', key])
+ end
+
+ def quit
+ call_command(['quit'])
+ rescue Errno::ECONNRESET
+ end
+
+ def pipelined(&block)
+ pipeline = Pipeline.new self
+ yield pipeline
+ pipeline.execute
+ end
+
+ def read_reply
+ # We read the first byte using read() mainly because gets() is
+ # immune to raw socket timeouts.
+ begin
+ rtype = @sock.read(1)
+ rescue Errno::EAGAIN
+ # We want to make sure it reconnects on the next command after the
+ # timeout. Otherwise the server may reply in the meantime leaving
+ # the protocol in a desync status.
+ @sock = nil
+ raise Errno::EAGAIN, "Timeout reading from the socket"
+ end
+
+ raise Errno::ECONNRESET,"Connection lost" if !rtype
+ line = @sock.gets
+ case rtype
+ when MINUS
+ raise MINUS + line.strip
+ when PLUS
+ line.strip
+ when COLON
+ line.to_i
+ when DOLLAR
+ bulklen = line.to_i
+ return nil if bulklen == -1
+ data = @sock.read(bulklen)
+ @sock.read(2) # CRLF
+ data
+ when ASTERISK
+ objects = line.to_i
+ return nil if bulklen == -1
+ res = []
+ objects.times {
+ res << read_reply
+ }
+ res
+ else
+ raise "Protocol error, got '#{rtype}' as initial reply byte"
+ end
+ end
+ end
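The bundled client builds the wire protocol itself: method_missing turns any call into a command array, ALIASES maps the friendlier names onto real commands, BULK_COMMANDS get their payload sent as a trailing bulk line, and REPLY_PROCESSOR coerces selected replies (booleans, the INFO hash). A minimal sketch against a local server on the default port, with the gem's lib directory on the load path:

    require 'redis_ext/redis'

    redis = Redis.new(:host => 'localhost', :port => 6379, :db => 0)

    redis['page:views'] = '0'
    redis.incr('page:views')            # INCR
    redis.push_tail('jobs', 'a')        # alias for RPUSH
    redis.push_tail('jobs', 'b')

    p redis.list_range('jobs', 0, -1)   # alias for LRANGE
    p redis.key?('page:views')          # EXISTS, coerced to true/false
    p redis.info[:redis_version]        # INFO parsed into a symbol-keyed hash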

data/lib/redis_ext/redis_queue.rb CHANGED
@@ -1,4 +1,3 @@
- require 'redis'
  require 'json'

  class RedisQueue
@@ -20,8 +19,8 @@ class RedisQueue
  new_hash = {}
  value.each{|k, v| new_hash[k.to_sym] = v}
  return new_hash
- rescue Exception => ex
- puts ex
+ # Returns nil on any kind of exception
+ rescue Exception
  return nil
  end

@@ -30,9 +29,9 @@ class RedisQueue
  @redis.llen(@key)
  end

- def pop_first(length)
- list = []
- length.times do
+ def pop_first(count)
+ list = []
+ count.times do
  element = self.pop
  break unless element
  list << element
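After the rename, pop_first(count) reads as "pop up to count entries": it repeatedly calls pop (which JSON-decodes an entry into a symbol-keyed hash, or returns nil on a parse error or empty list) and stops as soon as nothing comes back. A rough usage sketch, assuming the queue is built the way Worker#setup does above, a local Redis server, and that the collected list is what the method returns:

    require 'gandalf'

    queue = RedisQueue.new(:key => 'worker:1', :redis => Redis.new(:host => 'localhost'))

    # Drain at most ten jobs; if fewer are queued, pop_first stops early.
    jobs = queue.pop_first(10)
    jobs.each { |job| p job }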
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: gandalf
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  platform: ruby
  authors:
  - Kijun Seo
@@ -62,6 +62,10 @@ extra_rdoc_files: []

  files:
  - lib/redis_ext/redis_queue.rb
+ - lib/redis_ext/redis.rb
+ - lib/redis_ext/dist_redis.rb
+ - lib/redis_ext/pipeline.rb
+ - lib/redis_ext/hash_ring.rb
  - lib/gandalf.rb
  - lib/gandalf/models.rb
  - lib/gandalf/scheduler.rb