gandalf 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/gandalf.rb CHANGED
@@ -4,13 +4,17 @@ require 'feedzirra'
4
4
  require 'activesupport'
5
5
  require 'rufus/scheduler'
6
6
  require 'dm-core'
7
- #has a bug
7
+ # Has a bug
8
8
  #require 'dm-aggregates'
9
9
 
10
10
  require 'digest/md5'
11
11
  require 'json'
12
12
 
13
+ require 'redis_ext/redis'
13
14
  require 'redis_ext/redis_queue'
15
+ require 'redis_ext/pipeline'
16
+ require 'redis_ext/dist_redis'
17
+ require 'redis_ext/hash_ring'
14
18
 
15
19
  require 'gandalf/scheduler'
16
20
  require 'gandalf/worker'
@@ -18,7 +18,6 @@ module Gandalf
18
18
  class Post
19
19
  include DataMapper::Resource
20
20
 
21
- storage_names[:default] = 'posts'
22
21
  property :id, Serial, :field => 'psid'
23
22
  property :channel_id, Integer, :length => 11, :index => true
24
23
  property :link, Text, :length => 255
@@ -35,6 +34,8 @@ module Gandalf
35
34
  self.title = self.title[0,255]
36
35
  if self.description
37
36
  self.description.gsub!(/\<[^\>]+\>|\n|&nbsp;/,' ')
37
+ self.description.gsub!(/&gt;/,'<')
38
+ self.description.gsub!(/&lt;/,'>')
38
39
  self.description.gsub!(/\s{2,}/,' ')
39
40
  end
40
41
  end
@@ -17,9 +17,13 @@ module Gandalf
17
17
  attr_accessor :redis
18
18
 
19
19
  # Sets workers' queues with Redis connection object.
20
- def setup(options = {:seed_class => Seed})
20
+ def setup(options = {})
21
21
  @redis = Redis.new(:host => self.redis_host, :db => self.redis_db_id)
22
- @Seed = options[:seed_class]
22
+ if options[:seed_class]
23
+ @Seed = options[:seed_class]
24
+ else
25
+ @Seed = Seed
26
+ end
23
27
  workers.each { |worker| worker.setup(:redis => @redis) }
24
28
  end
25
29
 
@@ -12,15 +12,19 @@ module Gandalf
12
12
 
13
13
  belongs_to :scheduler
14
14
 
15
- def setup(options = {:post_class => Post})
15
+ def setup(options = {})
16
16
  @queue = RedisQueue.new(:key => self.id, :redis => options[:redis]) unless @queue
17
- @Post = (options[:post_class])
17
+ if options[:post_class]
18
+ @Post = options[:post_class]
19
+ else
20
+ @Post = Post
21
+ end
18
22
  end
19
23
 
20
24
  def run
21
25
  @crawl_scheduler = Rufus::Scheduler.start_new unless @crawl_scheduler
22
26
  @crawl_scheduler.every interval do
23
- crawl new_jobs(max_jobs)
27
+ crawl new_jobs(max_jobs) if jobs_to_do > 0
24
28
  end
25
29
  end
26
30
 
@@ -0,0 +1,124 @@
1
+ require 'redis'
2
+ require 'hash_ring'
3
+ class DistRedis
4
+ attr_reader :ring
5
+ def initialize(opts={})
6
+ hosts = []
7
+
8
+ db = opts[:db] || nil
9
+ timeout = opts[:timeout] || nil
10
+
11
+ raise Error, "No hosts given" unless opts[:hosts]
12
+
13
+ opts[:hosts].each do |h|
14
+ host, port = h.split(':')
15
+ hosts << Redis.new(:host => host, :port => port, :db => db, :timeout => timeout)
16
+ end
17
+
18
+ @ring = HashRing.new hosts
19
+ end
20
+
21
+ def node_for_key(key)
22
+ key = $1 if key =~ /\{(.*)?\}/
23
+ @ring.get_node(key)
24
+ end
25
+
26
+ def add_server(server)
27
+ server, port = server.split(':')
28
+ @ring.add_node Redis.new(:host => server, :port => port)
29
+ end
30
+
31
+ def method_missing(sym, *args, &blk)
32
+ if redis = node_for_key(args.first.to_s)
33
+ redis.send sym, *args, &blk
34
+ else
35
+ super
36
+ end
37
+ end
38
+
39
+ def keys(glob)
40
+ @ring.nodes.map do |red|
41
+ red.keys(glob)
42
+ end
43
+ end
44
+
45
+ def save
46
+ on_each_node :save
47
+ end
48
+
49
+ def bgsave
50
+ on_each_node :bgsave
51
+ end
52
+
53
+ def quit
54
+ on_each_node :quit
55
+ end
56
+
57
+ def flush_all
58
+ on_each_node :flush_all
59
+ end
60
+ alias_method :flushall, :flush_all
61
+
62
+ def flush_db
63
+ on_each_node :flush_db
64
+ end
65
+ alias_method :flushdb, :flush_db
66
+
67
+ def delete_cloud!
68
+ @ring.nodes.each do |red|
69
+ red.keys("*").each do |key|
70
+ red.delete key
71
+ end
72
+ end
73
+ end
74
+
75
+ def on_each_node(command, *args)
76
+ @ring.nodes.each do |red|
77
+ red.send(command, *args)
78
+ end
79
+ end
80
+
81
+ end
82
+
83
+
84
+ if __FILE__ == $0
85
+
86
+ r = DistRedis.new 'localhost:6379', 'localhost:6380', 'localhost:6381', 'localhost:6382'
87
+ r['urmom'] = 'urmom'
88
+ r['urdad'] = 'urdad'
89
+ r['urmom1'] = 'urmom1'
90
+ r['urdad1'] = 'urdad1'
91
+ r['urmom2'] = 'urmom2'
92
+ r['urdad2'] = 'urdad2'
93
+ r['urmom3'] = 'urmom3'
94
+ r['urdad3'] = 'urdad3'
95
+ p r['urmom']
96
+ p r['urdad']
97
+ p r['urmom1']
98
+ p r['urdad1']
99
+ p r['urmom2']
100
+ p r['urdad2']
101
+ p r['urmom3']
102
+ p r['urdad3']
103
+
104
+ r.push_tail 'listor', 'foo1'
105
+ r.push_tail 'listor', 'foo2'
106
+ r.push_tail 'listor', 'foo3'
107
+ r.push_tail 'listor', 'foo4'
108
+ r.push_tail 'listor', 'foo5'
109
+
110
+ p r.pop_tail('listor')
111
+ p r.pop_tail('listor')
112
+ p r.pop_tail('listor')
113
+ p r.pop_tail('listor')
114
+ p r.pop_tail('listor')
115
+
116
+ puts "key distribution:"
117
+
118
+ r.ring.nodes.each do |red|
119
+ p [red.port, red.keys("*")]
120
+ end
121
+ r.delete_cloud!
122
+ p r.keys('*')
123
+
124
+ end
@@ -0,0 +1,128 @@
1
+ require 'zlib'
2
+
3
# Consistent-hash ring: every node is placed on the ring at `replicas`
# points (CRC32 of "node:i"), and a key is served by the node owning the
# closest point at or below the key's own CRC32.
class HashRing

  POINTS_PER_SERVER = 160 # this is the default in libmemcached

  attr_reader :ring, :sorted_keys, :replicas, :nodes

  # nodes is a list of objects that have a proper to_s representation.
  # replicas indicates how many virtual points should be used per node;
  # replicas are required to improve the distribution.
  def initialize(nodes=[], replicas=POINTS_PER_SERVER)
    @replicas = replicas
    @ring = {}
    @nodes = []
    @sorted_keys = []
    nodes.each do |node|
      add_node(node)
    end
  end

  # Adds a `node` to the hash ring (including a number of replicas).
  def add_node(node)
    @nodes << node
    @replicas.times do |i|
      key = Zlib.crc32("#{node}:#{i}")
      @ring[key] = node
      @sorted_keys << key
    end
    @sorted_keys.sort!
  end

  # Removes `node` (matched by its to_s value) and all of its ring points.
  def remove_node(node)
    @nodes.reject!{|n| n.to_s == node.to_s}
    @replicas.times do |i|
      key = Zlib.crc32("#{node}:#{i}")
      @ring.delete(key)
      @sorted_keys.reject! {|k| k == key}
    end
  end

  # get the node in the hash ring for this key
  def get_node(key)
    get_node_pos(key)[0]
  end

  # Returns [node, index into sorted_keys] for `key`, or [nil, nil] when
  # the ring is empty.
  def get_node_pos(key)
    return [nil,nil] if @ring.size == 0
    # Zlib.crc32 only accepts strings; to_s lets symbols and numbers work.
    crc = Zlib.crc32(key.to_s)
    # binary_search yields -1 when crc is below every point; indexing with
    # -1 then selects the last point of the ring.
    idx = HashRing.binary_search(@sorted_keys, crc)
    return [@ring[@sorted_keys[idx]], idx]
  end

  # Yields the node of every ring point from the key's position up to the
  # end of sorted_keys. Returns [nil, nil] when the ring is empty.
  def iter_nodes(key)
    return [nil,nil] if @ring.size == 0
    node, pos = get_node_pos(key)
    @sorted_keys[pos..-1].each do |k|
      yield @ring[k]
    end
  end

  class << self

    # gem install RubyInline to use this code
    # Native extension to perform the binary search within the hashring.
    # There's a pure ruby version below so this is purely optional
    # for performance. In testing 20k gets and sets, the native
    # binary search shaved about 12% off the runtime (9sec -> 8sec).
    begin
      require 'inline'
      inline do |builder|
        builder.c <<-EOM
        int binary_search(VALUE ary, unsigned int r) {
          int upper = RARRAY_LEN(ary) - 1;
          int lower = 0;
          int idx = 0;

          while (lower <= upper) {
            idx = (lower + upper) / 2;

            VALUE continuumValue = RARRAY_PTR(ary)[idx];
            unsigned int l = NUM2UINT(continuumValue);
            if (l == r) {
              return idx;
            }
            else if (l > r) {
              upper = idx - 1;
            }
            else {
              lower = idx + 1;
            }
          }
          return upper;
        }
        EOM
      end
    # LoadError: RubyInline is not installed. StandardError: the native
    # build failed. Signals and SystemExit are deliberately not caught.
    rescue LoadError, StandardError
      # Find the closest index in HashRing with value <= the given value.
      # Returns -1 when every entry is greater than `value`.
      def binary_search(ary, value, &block)
        upper = ary.size - 1
        lower = 0
        idx = 0

        while(lower <= upper) do
          idx = (lower + upper) / 2
          comp = ary[idx] <=> value

          if comp == 0
            return idx
          elsif comp > 0
            upper = idx - 1
          else
            lower = idx + 1
          end
        end
        return upper
      end

    end
  end

end
123
+
124
+ # ring = HashRing.new ['server1', 'server2', 'server3']
125
+ # p ring
126
+ # #
127
+ # p ring.get_node "kjhjkjlkjlkkh"
128
+ #
@@ -0,0 +1,22 @@
1
+ require "redis"
2
+
3
class Redis
  # Collects commands instead of sending them, then hands the whole batch
  # to the wrapped connection in one call.
  class Pipeline < Redis
    BUFFER_SIZE = 50_000

    # redis - the connection object that will receive the batch; it must
    #         respond to call_command.
    def initialize(redis)
      @redis = redis
      @commands = []
    end

    # Queues `command` (an argv-style array) rather than executing it.
    def call_command(command)
      @commands << command
    end

    # Sends all queued commands through the wrapped connection, empties the
    # buffer and returns whatever the connection returned for the batch.
    # With nothing queued it returns [] without contacting the connection:
    # an empty batch would otherwise be passed on as an empty command.
    def execute
      return [] if @commands.empty?
      replies = @redis.call_command(@commands)
      @commands.clear
      replies
    end

  end
end
@@ -0,0 +1,316 @@
1
+ require 'socket'
2
+ require File.join(File.dirname(__FILE__),'pipeline')
3
+
4
+ begin
5
+ if RUBY_VERSION >= '1.9'
6
+ require 'timeout'
7
+ RedisTimer = Timeout
8
+ else
9
+ require 'system_timer'
10
+ RedisTimer = SystemTimer
11
+ end
12
+ rescue LoadError
13
+ RedisTimer = nil
14
+ end
15
+
16
# Minimal Redis client speaking the text protocol over a TCP socket.
# Commands without an explicit method are sent through method_missing.
class Redis
  OK      = "OK".freeze
  MINUS    = "-".freeze
  PLUS     = "+".freeze
  COLON    = ":".freeze
  DOLLAR   = "$".freeze
  ASTERISK = "*".freeze

  # Commands whose last argument is sent as a length-prefixed bulk payload.
  BULK_COMMANDS = {
    "set"       => true,
    "setnx"     => true,
    "rpush"     => true,
    "lpush"     => true,
    "lset"      => true,
    "lrem"      => true,
    "sadd"      => true,
    "srem"      => true,
    "sismember" => true,
    "echo"      => true,
    "getset"    => true,
    "smove"     => true
  }

  BOOLEAN_PROCESSOR = lambda{|r| r == 1 }

  # Post-processing applied to the raw reply of specific commands.
  REPLY_PROCESSOR = {
    "exists"    => BOOLEAN_PROCESSOR,
    "sismember" => BOOLEAN_PROCESSOR,
    "sadd"      => BOOLEAN_PROCESSOR,
    "srem"      => BOOLEAN_PROCESSOR,
    "smove"     => BOOLEAN_PROCESSOR,
    "move"      => BOOLEAN_PROCESSOR,
    "setnx"     => BOOLEAN_PROCESSOR,
    "del"       => BOOLEAN_PROCESSOR,
    "renamenx"  => BOOLEAN_PROCESSOR,
    "expire"    => BOOLEAN_PROCESSOR,
    "keys"      => lambda{|r| r.split(" ")},
    "info"      => lambda{|r|
      info = {}
      r.each_line {|kv|
        k,v = kv.split(":",2).map{|x| x.chomp}
        # Ignore blank lines and anything that is not a "key:value" pair.
        next if v.nil? || k.empty?
        info[k.to_sym] = v
      }
      info
    }
  }

  # Friendly method names mapped to the protocol command they send.
  ALIASES = {
    "flush_db"             => "flushdb",
    "flush_all"            => "flushall",
    "last_save"            => "lastsave",
    "key?"                 => "exists",
    "delete"               => "del",
    "randkey"              => "randomkey",
    "list_length"          => "llen",
    "push_tail"            => "rpush",
    "push_head"            => "lpush",
    "pop_tail"             => "rpop",
    "pop_head"             => "lpop",
    "list_set"             => "lset",
    "list_range"           => "lrange",
    "list_trim"            => "ltrim",
    "list_index"           => "lindex",
    "list_rm"              => "lrem",
    "set_add"              => "sadd",
    "set_delete"           => "srem",
    "set_count"            => "scard",
    "set_member?"          => "sismember",
    "set_members"          => "smembers",
    "set_intersect"        => "sinter",
    "set_intersect_store"  => "sinterstore",
    "set_inter_store"      => "sinterstore",
    "set_union"            => "sunion",
    "set_union_store"      => "sunionstore",
    "set_diff"             => "sdiff",
    "set_diff_store"       => "sdiffstore",
    "set_move"             => "smove",
    "set_unless_exists"    => "setnx",
    "rename_unless_exists" => "renamenx",
    "type?"                => "type"
  }

  # Commands this client refuses to send.
  DISABLED_COMMANDS = {
    "monitor" => true,
    "sync"    => true
  }

  # options: :host (default '127.0.0.1'), :port (6379), :db (0),
  # :timeout in seconds (5, 0 disables it), :password, :logger.
  # Connects to the server immediately.
  def initialize(options = {})
    @host    =  options[:host]    || '127.0.0.1'
    @port    = (options[:port]    || 6379).to_i
    @db      = (options[:db]      || 0).to_i
    @timeout = (options[:timeout] || 5).to_i
    @password = options[:password]
    @logger  =  options[:logger]

    @logger.info { self.to_s } if @logger
    connect_to_server
  end

  def to_s
    "Redis Client connected to #{server} against DB #{@db}"
  end

  # "host:port" of the configured server.
  def server
    "#{@host}:#{@port}"
  end

  # Opens the socket, then authenticates and selects the db when configured.
  def connect_to_server
    @sock = connect_to(@host, @port, @timeout == 0 ? nil : @timeout)
    call_command(["auth",@password]) if @password
    call_command(["select",@db]) unless @db == 0
  end

  # Opens a TCP socket to host:port with TCP_NODELAY set and, when
  # `timeout` is given, send/receive timeouts of that many seconds.
  def connect_to(host, port, timeout=nil)
    # We support connect() timeout only if system_timer is available
    # or if we are running against Ruby >= 1.9
    # Timeout reading from the socket instead will be supported anyway.
    # NOTE(review): the connect below is not wrapped in RedisTimer, so the
    # rescue only fires if TCPSocket.new itself raises Timeout::Error.
    if @timeout != 0 && RedisTimer
      begin
        sock = TCPSocket.new(host, port)
      rescue Timeout::Error
        @sock = nil
        raise Timeout::Error, "Timeout connecting to the server"
      end
    else
      sock = TCPSocket.new(host, port)
    end
    sock.setsockopt Socket::IPPROTO_TCP, Socket::TCP_NODELAY, 1

    # If the timeout is set we set the low level socket options in order
    # to make sure a blocking read will return after the specified number
    # of seconds. This hack is from memcached ruby client.
    if timeout
      secs   = Integer(timeout)
      usecs  = Integer((timeout - secs) * 1_000_000)
      optval = [secs, usecs].pack("l_2")
      sock.setsockopt Socket::SOL_SOCKET, Socket::SO_RCVTIMEO, optval
      sock.setsockopt Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, optval
    end
    sock
  end

  # Any unknown method is sent as a command: [name, *arguments].
  def method_missing(*argv)
    call_command(argv)
  end

  # Sends one command (argv array) or a pipeline (array of argv arrays)
  # and returns the processed reply or list of replies.
  def call_command(argv)
    @logger.debug { argv.inspect } if @logger

    # This wrapper around raw_call_command handles reconnection on socket
    # error. We try to reconnect just one time, otherwise let the error
    # be raised.
    connect_to_server if !@sock

    begin
      raw_call_command(argv.dup)
    rescue Errno::ECONNRESET, Errno::EPIPE
      @sock.close
      @sock = nil
      connect_to_server
      raw_call_command(argv.dup)
    end
  end

  # Serializes the command(s), writes them in a single socket write and
  # reads one reply per command. Raises if a command is disabled.
  def raw_call_command(argvp)
    pipeline = argvp[0].is_a?(Array)

    # Work on copies: the loop below rewrites argv[0] and argv[-1], and the
    # caller's arrays must stay intact (call_command resends them on retry).
    argvv = (pipeline ? argvp : [argvp]).map { |argv| argv.dup }

    command = ''

    argvv.each do |argv|
      bulk = nil
      argv[0] = argv[0].to_s.downcase
      argv[0] = ALIASES[argv[0]] if ALIASES[argv[0]]
      raise "#{argv[0]} command is disabled" if DISABLED_COMMANDS[argv[0]]
      if BULK_COMMANDS[argv[0]] && argv.length > 1
        # The payload goes on its own line; the command line carries its size.
        bulk = argv[-1].to_s
        argv[-1] = bulk.respond_to?(:bytesize) ? bulk.bytesize : bulk.size
      end
      command << "#{argv.join(' ')}\r\n"
      command << "#{bulk}\r\n" if bulk
    end

    @sock.write(command)

    results = argvv.map do |argv|
      processor = REPLY_PROCESSOR[argv[0]]
      processor ? processor.call(read_reply) : read_reply
    end

    return pipeline ? results : results[0]
  end

  def select(*args)
    raise "SELECT not allowed, use the :db option when creating the object"
  end

  def [](key)
    self.get(key)
  end

  def []=(key,value)
    set(key,value)
  end

  # Sets key to value; when `expiry` is given and the SET succeeded, also
  # issues EXPIRE. Returns true if the server answered OK.
  def set(key, value, expiry=nil)
    s = call_command([:set, key, value]) == OK
    expire(key, expiry) if s && expiry
    s
  end

  # Builds a SORT command from options :by, :get (one or many), :order
  # and :limit (array).
  def sort(key, options = {})
    cmd = ["SORT"]
    cmd << key
    cmd << "BY #{options[:by]}" if options[:by]
    cmd << "GET #{[options[:get]].flatten * ' GET '}" if options[:get]
    cmd << "#{options[:order]}" if options[:order]
    cmd << "LIMIT #{options[:limit].join(' ')}" if options[:limit]
    call_command(cmd)
  end

  # INCR, or INCRBY when an increment is given.
  def incr(key, increment = nil)
    call_command(increment ? ["incrby",key,increment] : ["incr",key])
  end

  # DECR, or DECRBY when a decrement is given.
  def decr(key,decrement = nil)
    call_command(decrement ? ["decrby",key,decrement] : ["decr",key])
  end

  # Similar to memcache.rb's #get_multi, returns a hash mapping
  # keys to values. Keys whose value is nil are left out.
  def mapped_mget(*keys)
    found = {}
    mget(*keys).each_with_index do |value, i|
      found[keys[i]] = value unless value.nil?
    end
    found
  end

  # Ruby defines a now deprecated type method so we need to override it here
  # since it will never hit method_missing
  def type(key)
    call_command(['type', key])
  end

  # Sends QUIT; a connection reset while doing so is ignored.
  def quit
    call_command(['quit'])
  rescue Errno::ECONNRESET
  end

  # Yields a Pipeline that buffers commands, then executes the batch.
  def pipelined(&block)
    pipeline = Pipeline.new self
    yield pipeline
    pipeline.execute
  end

  # Reads and decodes one reply from the socket: error (raised), status
  # line, integer, bulk string (nil for length -1) or multi-bulk array
  # (nil for count -1).
  def read_reply
    # We read the first byte using read() mainly because gets() is
    # immune to raw socket timeouts.
    begin
      rtype = @sock.read(1)
    rescue Errno::EAGAIN
      # We want to make sure it reconnects on the next command after the
      # timeout. Otherwise the server may reply in the meantime leaving
      # the protocol in a desync status.
      begin
        @sock.close
      rescue StandardError
        # Best effort: the socket is being discarded either way.
      end
      @sock = nil
      raise Errno::EAGAIN, "Timeout reading from the socket"
    end

    raise Errno::ECONNRESET,"Connection lost" if !rtype
    line = @sock.gets
    # The peer closed the connection in the middle of a reply.
    raise Errno::ECONNRESET,"Connection lost" if !line
    case rtype
    when MINUS
      raise MINUS + line.strip
    when PLUS
      line.strip
    when COLON
      line.to_i
    when DOLLAR
      bulklen = line.to_i
      return nil if bulklen == -1
      data = @sock.read(bulklen)
      @sock.read(2) # CRLF
      data
    when ASTERISK
      objects = line.to_i
      # A count of -1 is the nil multi-bulk reply.
      return nil if objects == -1
      res = []
      objects.times {
        res << read_reply
      }
      res
    else
      raise "Protocol error, got '#{rtype}' as initial reply byte"
    end
  end
end
@@ -1,4 +1,3 @@
1
- require 'redis'
2
1
  require 'json'
3
2
 
4
3
  class RedisQueue
@@ -20,8 +19,8 @@ class RedisQueue
20
19
  new_hash = {}
21
20
  value.each{|k, v| new_hash[k.to_sym] = v}
22
21
  return new_hash
23
- rescue Exception => ex
24
- puts ex
22
+ # Returns nil on any kind of exception
23
+ rescue Exception
25
24
  return nil
26
25
  end
27
26
  end
@@ -30,9 +29,9 @@ class RedisQueue
30
29
  @redis.llen(@key)
31
30
  end
32
31
 
33
- def pop_first(length)
34
- list = []
35
- length.times do
32
+ def pop_first(count)
33
+ list = []
34
+ count.times do
36
35
  element = self.pop
37
36
  break unless element
38
37
  list << element
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gandalf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kijun Seo
@@ -62,6 +62,10 @@ extra_rdoc_files: []
62
62
 
63
63
  files:
64
64
  - lib/redis_ext/redis_queue.rb
65
+ - lib/redis_ext/redis.rb
66
+ - lib/redis_ext/dist_redis.rb
67
+ - lib/redis_ext/pipeline.rb
68
+ - lib/redis_ext/hash_ring.rb
65
69
  - lib/gandalf.rb
66
70
  - lib/gandalf/models.rb
67
71
  - lib/gandalf/scheduler.rb