gandalf 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/gandalf/models.rb +55 -0
- data/lib/gandalf/scheduler.rb +89 -0
- data/lib/gandalf/worker.rb +93 -0
- data/lib/gandalf.rb +21 -0
- data/lib/redis_ext/redis_queue.rb +42 -0
- metadata +98 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
module Gandalf
|
2
|
+
# A feed source record. It is the default seed class the Scheduler queries
# for crawl jobs (see the default argument of Scheduler#setup).
class Seed
  include DataMapper::Resource

  property :id, Serial
  property :include_update, Boolean, :index => true
  property :interval_update, Integer
  property :url, String, :length => 255, :unique_index => true

  # Serializes id, include_update, interval_update and url as a JSON object.
  #
  # args - optional generator arguments. The json library passes its
  #        generator state to #to_json when this object is nested inside an
  #        Array or Hash being serialized; a zero-arity #to_json raises
  #        ArgumentError in that case, so they are accepted and forwarded.
  #
  # Returns the JSON String.
  def to_json(*args)
    {:id => id,
     :include_update => include_update,
     :interval_update => interval_update,
     :url => url}.to_json(*args)
  end
end
|
17
|
+
|
18
|
+
# A single feed entry, stored in the 'posts' table and linked to its Seed
# through channel_id.
class Post
  include DataMapper::Resource

  storage_names[:default] = 'posts'
  property :id, Serial, :field => 'psid'
  property :channel_id, Integer, :length => 11, :index => true
  property :link, Text, :length => 255
  property :title, String, :length => 255
  property :author, String, :lazy => true
  property :pub_date, DateTime, :field => 'pubDate'
  # The default is a callable so the timestamp is taken when each record is
  # built. A bare DateTime.now here would be evaluated once, when the class
  # is loaded, and every post would share that single value.
  property :update_date, DateTime, :field => 'updateDate',
                                   :default => lambda { |_resource, _property| DateTime.now }
  property :description, Text
  property :cache_link, String, :length => 32, :unique_index => true

  belongs_to :seed, :child_key => [:channel_id]

  # Normalizes the post before saving: truncates the title to the column
  # length (255) and strips markup/newlines from the description, collapsing
  # runs of whitespace into a single space.
  #
  # A nil title or description is left untouched. New strings are assigned
  # instead of mutating in place, so the string objects handed over by the
  # feed parser are not modified.
  def clean!
    self.title = title[0, 255] if title
    if description
      # NOTE(review): the third alternative of this pattern is a plain space
      # as it appears here; it may originally have been an HTML entity such
      # as a non-breaking space -- confirm against the upstream source.
      without_markup = description.gsub(/\<[^\>]+\>|\n| /, ' ')
      self.description = without_markup.gsub(/\s{2,}/, ' ')
    end
  end

  # Builds one unsaved Post per entry of a parsed feed.
  #
  # feed - an object responding to #entries; each entry must respond to
  #        title, url, author, summary and published.
  #
  # Entries without a published date get a nil pub_date instead of raising.
  # cache_link is the MD5 hex digest of the entry URL.
  #
  # Returns an Array of Post instances.
  def self.parse(feed)
    feed.entries.map do |entry|
      published = entry.published
      new(:title => entry.title,
          :link => entry.url,
          :author => entry.author,
          :description => entry.summary,
          :pub_date => (published && published.to_datetime),
          :cache_link => Digest::MD5.hexdigest(entry.url))
    end
  end
end
|
55
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module Gandalf
|
2
|
+
# A magical scheduler
|
3
|
+
class Scheduler
  include DataMapper::Resource

  # Seconds in a day. The per-interval batch size is obtained by spreading
  # seed_count evenly over this period.
  SECONDS_PER_DAY = 1440 * 60

  property :id, Serial
  property :redis_host, String, :default => nil
  property :redis_db_id, Integer, :default => 0
  property :seed_table, String
  property :seed_count, Integer, :default => 0
  property :last_job_id, Integer, :default => 0
  # Sleep length in seconds, initialized to 60
  property :interval, Integer, :length => 5, :default => 60

  has n, :workers

  attr_accessor :redis

  # Opens the Redis connection and hands it to every worker's queue.
  #
  # options[:seed_class] - model queried for jobs. Falls back to Seed when
  #                        the key is absent, so passing an options hash
  #                        without it no longer leaves the class nil.
  def setup(options = {:seed_class => Seed})
    @redis = Redis.new(:host => redis_host, :db => redis_db_id)
    @seed_class = options[:seed_class] || Seed
    workers.each { |worker| worker.setup(:redis => @redis) }
  end

  # Starts a Rufus scheduler with two recurring tasks: distribute a batch of
  # jobs every `interval` seconds, and refresh seed_count every 10 intervals.
  def run
    scheduler = Rufus::Scheduler.start_new
    scheduler.every(interval) { execute }

    scheduler.every(10 * interval) do
      # TODO Use dm-aggregates when the bug gets fixed.
      # seed_table is interpolated as an SQL identifier and cannot be bound
      # as a parameter; it must only ever come from trusted configuration.
      counts = repository.adapter.query("SELECT COUNT(*) FROM #{seed_table} WHERE include_update = 1")
      # The adapter returns a result set, not a bare number; take the single
      # value out of it before assigning to the Integer property.
      self.seed_count = Array(counts).first.to_i
      save
    end
  end

  # Fetches the next batch of seeds, records the last scheduled id and
  # pushes the batch onto the workers' queues. Does nothing when the batch
  # is empty (e.g. before seed_count has been computed).
  def execute
    jobs = new_jobs
    return if jobs.empty?

    self.last_job_id = jobs.last.id
    save

    new_loads = job_distribution(current_workload, jobs.length)
    push_jobs(jobs, new_loads)
  end

  # Number of seeds to schedule per interval so that seed_count seeds are
  # covered within one day. Rounded up, so a small seed_count still yields
  # at least one job per interval instead of flooring to zero; returns 0
  # when seed_count or interval is not positive.
  def jobs_per_interval
    return 0 unless seed_count.to_i > 0 && interval.to_i > 0

    (seed_count * interval + SECONDS_PER_DAY - 1) / SECONDS_PER_DAY
  end

  # Returns the next batch of updatable seeds as an Array, starting after
  # last_job_id and wrapping around to the lowest ids when the end of the
  # table is reached.
  def new_jobs
    limit = jobs_per_interval
    return [] if limit <= 0

    # Strictly greater than last_job_id: the seed with that id was already
    # scheduled by the previous batch.
    jobs = @seed_class.all(:id.gt => last_job_id,
                           :include_update => true,
                           :order => [:id.asc],
                           :limit => limit).to_a
    if jobs.length < limit
      # Wrap around; seeds excluded from updates must stay excluded here too.
      jobs += @seed_class.all(:include_update => true,
                              :order => [:id.asc],
                              :limit => limit - jobs.length).to_a
    end
    jobs
  end

  # Returns a Hash mapping each worker id to the length of its queue.
  def current_workload
    workers.inject({}) do |workload, worker|
      workload[worker.id] = worker.jobs_to_do
      workload
    end
  end

  # Hands each worker its share of jobs.
  #
  # jobs     - Array of jobs; consumed from the front (it is mutated).
  # workload - Hash of worker id => number of jobs to give that worker.
  def push_jobs(jobs, workload)
    workload.each do |worker_id, wload|
      worker = workers.get(worker_id)
      worker.push(jobs.slice!(0, wload))
    end
  end

  # Greedily calculates the next job distribution: each job goes to the
  # worker whose projected queue is currently the shortest.
  # TODO Find a formula
  #
  # workload - Hash of worker id => current queue length (not modified).
  # jobs     - Integer number of jobs to distribute.
  #
  # Returns a Hash of worker id => number of new jobs (default 0); it is
  # empty when there are no workers.
  def job_distribution(workload, jobs)
    distribution = Hash.new(0)
    return distribution if workload.empty?

    projected = workload.clone
    jobs.times do
      least_loaded = projected.min_by { |_worker_id, load| load }.first
      projected[least_loaded] += 1
      distribution[least_loaded] += 1
    end
    distribution
  end
end
|
89
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module Gandalf
|
2
|
+
# A magical slave
|
3
|
+
class Worker
  include DataMapper::Resource

  property :id, Serial
  # Maximum number of jobs per interval
  property :max_jobs, Integer, :default => 30
  # Sleep length in seconds, initialized to 15
  property :interval, Integer, :default => 15
  property :max_errors, Integer, :length => 1, :default => 2

  belongs_to :scheduler

  # Prepares the worker's job queue and post model.
  #
  # options[:redis]      - Redis connection used by the queue (the queue is
  #                        keyed by this worker's id and created only once).
  # options[:post_class] - model used to parse and store entries. Falls back
  #                        to Post when the key is absent, which is the case
  #                        when the scheduler calls setup(:redis => ...).
  def setup(options = {:post_class => Post})
    @queue ||= RedisQueue.new(:key => id, :redis => options[:redis])
    @post_class = options[:post_class] || Post
  end

  # Starts (once) a Rufus scheduler that crawls up to max_jobs queued jobs
  # every `interval` seconds.
  def run
    @crawl_scheduler ||= Rufus::Scheduler.start_new
    @crawl_scheduler.every(interval) { crawl(new_jobs(max_jobs)) }
  end

  # Stops the crawl scheduler.
  def stop
    @crawl_scheduler.stop
  end

  # Starts the crawl scheduler again. This method was previously declared
  # as a second `stop`, which silently replaced the real one.
  def start
    @crawl_scheduler.start
  end

  # Fetches and parses every job URL, saving posts for feeds parsed as RSS
  # and routing everything else to handle_error.
  #
  # jobs - Hash of url => job Hash (as returned by new_jobs).
  def crawl(jobs)
    return if jobs.empty? # nothing queued: skip the fetch entirely

    feeds = Feedzirra::Feed.fetch_and_parse(jobs.keys)
    jobs.each do |url, job|
      feed = feeds[url]
      if feed.is_a? Feedzirra::Parser::RSS
        save_posts(feed, job[:id])
      else
        handle_error(job)
      end
    end
  end

  # Parses a feed into posts and saves them under the given channel id.
  # Stops at the first MysqlError.
  # NOTE(review): presumably that error signals an already-stored entry
  # (cache_link is unique) -- confirm.
  def save_posts(feed, channel_id)
    @post_class.parse(feed).each do |post|
      post.channel_id = channel_id
      post.clean!
      begin
        post.save
      rescue MysqlError
        break
      end
    end
  end

  # Increments the job's error counter, then either reports the job (once
  # max_errors is reached) or puts it back on the queue for a retry.
  def handle_error(job)
    # Integer rather than Fixnum: Fixnum no longer exists in current Rubies.
    job[:errors] = job[:errors].is_a?(Integer) ? job[:errors] + 1 : 1

    if job[:errors] >= max_errors
      puts job
    else
      @queue.push(job)
    end
  end

  # Returns the number of jobs waiting in this worker's queue.
  def jobs_to_do
    @queue.length
  end

  # Appends each of the given jobs to the queue.
  def push(jobs)
    jobs.each { |job| @queue.push(job) }
  end

  # Pops up to `count` jobs from the queue.
  #
  # Returns a Hash of job url => job.
  def new_jobs(count)
    by_url = {}
    @queue.pop_first(count).each { |job| by_url[job[:url]] = job }
    by_url
  end
end
|
93
|
+
end
|
data/lib/gandalf.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'feedzirra'
|
4
|
+
require 'activesupport'
|
5
|
+
require 'rufus/scheduler'
|
6
|
+
require 'dm-core'
|
7
|
+
#has a bug
|
8
|
+
#require 'dm-aggregates'
|
9
|
+
|
10
|
+
require 'digest/md5'
|
11
|
+
require 'json'
|
12
|
+
|
13
|
+
require 'redis_ext/redis_queue'
|
14
|
+
|
15
|
+
require 'gandalf/scheduler'
|
16
|
+
require 'gandalf/worker'
|
17
|
+
require 'gandalf/models'
|
18
|
+
|
19
|
+
# Top-level namespace of the gandalf gem.
module Gandalf
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'redis'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# A FIFO queue of JSON-serialized values kept in a Redis list: values are
# appended with RPUSH and taken from the front with LPOP.
class RedisQueue
  attr_accessor :key, :redis

  # options[:key]   - name of the Redis list backing the queue.
  # options[:redis] - connection to reuse; when absent, a new Redis
  #                   connection is created from the options hash.
  def initialize(options = {})
    @key = options[:key]
    @redis = (options[:redis] || Redis.new(options))
  end

  # Appends a value, serialized with #to_json, to the end of the queue.
  def push(value)
    @redis.rpush(@key, value.to_json)
  end

  # Removes the first element and returns it as a Hash with Symbol keys.
  #
  # Returns nil when the queue is empty. An empty queue is a normal
  # condition, so it is handled up front instead of letting JSON.parse fail
  # on nil and print an error on every idle poll.
  # Also returns nil, after printing the error, when the element cannot be
  # fetched or decoded. Only StandardError is rescued so that interrupts and
  # exit requests still propagate.
  def pop
    raw = @redis.lpop(@key)
    return nil if raw.nil?

    symbolized = {}
    JSON.parse(raw).each { |name, value| symbolized[name.to_sym] = value }
    symbolized
  rescue StandardError => ex
    puts ex
    nil
  end

  # Returns the number of elements currently in the queue.
  def length
    @redis.llen(@key)
  end

  # Pops up to `length` elements, stopping early as soon as #pop returns nil.
  #
  # Returns an Array of the popped elements (possibly empty).
  def pop_first(length)
    list = []
    length.times do
      element = pop
      break unless element
      list << element
    end
    list
  end
end
|
metadata
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gandalf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kijun Seo
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-09-30 00:00:00 +09:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: feedzirra
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activesupport
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: rufus-scheduler
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: dm-core
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "0"
|
54
|
+
version:
|
55
|
+
description: Scheduler and Worker make a unit. You can also make your own worker.
|
56
|
+
email: kijun@kijunseo.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files: []
|
62
|
+
|
63
|
+
files:
|
64
|
+
- lib/redis_ext/redis_queue.rb
|
65
|
+
- lib/gandalf.rb
|
66
|
+
- lib/gandalf/models.rb
|
67
|
+
- lib/gandalf/scheduler.rb
|
68
|
+
- lib/gandalf/worker.rb
|
69
|
+
has_rdoc: true
|
70
|
+
homepage: http://github.com/kijun/gandalf
|
71
|
+
licenses: []
|
72
|
+
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options: []
|
75
|
+
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: "0"
|
83
|
+
version:
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: "0"
|
89
|
+
version:
|
90
|
+
requirements: []
|
91
|
+
|
92
|
+
rubyforge_project:
|
93
|
+
rubygems_version: 1.3.5
|
94
|
+
signing_key:
|
95
|
+
specification_version: 3
|
96
|
+
summary: A magical distributed web crawler
|
97
|
+
test_files: []
|
98
|
+
|