gandalf 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/gandalf/models.rb +55 -0
- data/lib/gandalf/scheduler.rb +89 -0
- data/lib/gandalf/worker.rb +93 -0
- data/lib/gandalf.rb +21 -0
- data/lib/redis_ext/redis_queue.rb +42 -0
- metadata +98 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
module Gandalf
|
2
|
+
# A feed URL that can be crawled, persisted through DataMapper.
#
# The scheduler in this gem selects seeds whose +include_update+ flag is true
# and pushes them onto worker queues, which serialise them with +to_json+.
class Seed
  include DataMapper::Resource

  property :id, Serial
  property :include_update, Boolean, :index => true
  property :interval_update, Integer
  property :url, String, :length => 255, :unique_index => true

  # Serialises the persisted attributes of this seed as a JSON object string.
  #
  # Any arguments are forwarded to Hash#to_json. The JSON generator passes its
  # state object when a seed is nested inside an Array or Hash that is being
  # serialised; a zero-argument +to_json+ raises ArgumentError in that case.
  #
  # Returns a String.
  def to_json(*args)
    {
      :id => id,
      :include_update => include_update,
      :interval_update => interval_update,
      :url => url
    }.to_json(*args)
  end
end
|
17
|
+
|
18
|
+
# A single feed entry stored in the 'posts' table.
#
# Column names that differ from the property names ('psid', 'pubDate',
# 'updateDate') are mapped explicitly through :field.
class Post
  include DataMapper::Resource

  storage_names[:default] = 'posts'

  property :id, Serial, :field => 'psid'
  property :channel_id, Integer, :length => 11, :index => true
  property :link, Text, :length => 255
  property :title, String, :length => 255
  property :author, String, :lazy => true
  property :pub_date, DateTime, :field => 'pubDate'
  # The default is a lambda so the timestamp is taken when each resource is
  # created; a bare DateTime.now here would be evaluated once, when the class
  # is loaded, and shared by every post.
  property :update_date, DateTime, :field => 'updateDate',
                                   :default => lambda { |_resource, _property| DateTime.now }
  property :description, Text
  property :cache_link, String, :length => 32, :unique_index => true

  belongs_to :seed, :child_key => [:channel_id]

  # Normalises the post before saving: truncates the title to 255 characters
  # and, when a description is present, replaces markup tags, newlines and the
  # whitespace alternative below with a space, then collapses whitespace runs.
  #
  # A nil title is left untouched. The description is reassigned rather than
  # mutated in place, because the string is the same object that was handed
  # over from the feed entry in Post.parse.
  def clean!
    self.title = title[0, 255] if title

    if description
      # NOTE(review): the last alternative is a single whitespace character as
      # it appears in the published diff; confirm whether upstream intended an
      # HTML non-breaking-space entity here.
      stripped = description.gsub(/\<[^\>]+\>|\n| /, ' ')
      self.description = stripped.gsub(/\s{2,}/, ' ')
    end
  end

  # Builds one unsaved Post per entry of a parsed feed.
  #
  # feed - an object whose +entries+ respond to title, url, author, summary
  #        and published.
  #
  # Entries without a published date get a nil pub_date, and entries without
  # a url get a nil cache_link, instead of raising while the batch is built.
  #
  # Returns an Array of Post instances.
  def self.parse(feed)
    feed.entries.map do |entry|
      url = entry.url
      published = entry.published

      new(
        :title => entry.title,
        :link => url,
        :author => entry.author,
        :description => entry.summary,
        :pub_date => (published ? published.to_datetime : nil),
        # MD5 of the url, used as a fixed-width unique key for the entry.
        :cache_link => (url ? Digest::MD5.hexdigest(url) : nil)
      )
    end
  end
end
|
55
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module Gandalf
|
2
|
+
# A magical scheduler
#
# Periodically selects a batch of seeds and distributes them over the queues
# of its workers, giving each new job to the currently least-loaded worker.
class Scheduler
  include DataMapper::Resource

  # 1440 minutes of 60 seconds each.
  SECONDS_PER_DAY = 1440 * 60

  property :id, Serial
  property :redis_host, String, :default => nil
  property :redis_db_id, Integer, :default => 0
  property :seed_table, String
  property :seed_count, Integer, :default => 0
  property :last_job_id, Integer, :default => 0
  # Sleep length in seconds, initialized to 60
  property :interval, Integer, :length => 5, :default => 60

  has n, :workers

  attr_accessor :redis

  # Opens the Redis connection and hands it to every worker's queue.
  #
  # options[:seed_class] - the model queried for jobs. Falls back to Seed when
  #                        the key is missing, so a hash carrying only other
  #                        options no longer leaves the seed class nil.
  def setup(options = {:seed_class => Seed})
    @redis = Redis.new(:host => redis_host, :db => redis_db_id)
    @Seed = options[:seed_class] || Seed
    workers.each { |worker| worker.setup(:redis => @redis) }
  end

  # Starts a Rufus scheduler with two recurring tasks: +execute+ every
  # +interval+, and a refresh of +seed_count+ every ten intervals.
  def run
    scheduler = Rufus::Scheduler.start_new
    scheduler.every(interval) { execute }

    scheduler.every(10 * interval) do
      # TODO Use dm-aggregates when the bug gets fixed.
      # NOTE(review): seed_table is interpolated straight into the SQL; it is
      # read from this record's own column - confirm it is never user-supplied.
      rows = repository.adapter.query("SELECT COUNT(*) FROM #{seed_table} WHERE include_update = 1")
      # The adapter returns a list of result values; store the count itself
      # rather than the whole list.
      self.seed_count = Array(rows).first.to_i
      save
    end
  end

  # Picks the next batch of jobs, records how far the scan has advanced and
  # pushes the batch to the workers. Does nothing when no jobs are due.
  def execute
    jobs = new_jobs
    return if jobs.empty?

    self.last_job_id = jobs.last.id
    save

    new_loads = job_distribution(current_workload, jobs.length)
    push_jobs(jobs, new_loads)
  end

  # Number of seeds to schedule per interval so that +seed_count+ seeds are
  # covered in one day.
  #
  # Rounded up: rounding down yields 0 whenever there are fewer seeds than
  # intervals in a day, and nothing would ever be scheduled.
  #
  # Returns an Integer, 0 when there are no seeds or no positive interval.
  def jobs_per_interval
    return 0 unless seed_count.to_i > 0 && interval.to_i > 0

    (seed_count * interval + SECONDS_PER_DAY - 1) / SECONDS_PER_DAY
  end

  # Fetches the next batch of updatable seeds after +last_job_id+, wrapping
  # around to the beginning of the table when the end is reached.
  #
  # Returns a collection of seeds (empty when the batch size is 0).
  def new_jobs
    limit = jobs_per_interval
    return [] if limit.zero?

    # Strictly greater than: last_job_id was already scheduled by the
    # previous batch.
    jobs = @Seed.all(:id.gt => last_job_id,
                     :include_update => true,
                     :limit => limit)
    if jobs.length < limit
      jobs += @Seed.all(:include_update => true,
                        :limit => limit - jobs.length)
    end
    jobs
  end

  # Returns a Hash mapping each worker id to the length of its queue.
  def current_workload
    workload = {}
    workers.each { |worker| workload[worker.id] = worker.jobs_to_do }
    workload
  end

  # Hands each worker its share of +jobs+.
  #
  # jobs     - the batch; consumed from the front as shares are sliced off.
  # workload - Hash of worker id => number of jobs to give that worker.
  def push_jobs(jobs, workload)
    workload.each do |worker_id, share|
      workers.get(worker_id).push(jobs.slice!(0, share))
    end
  end

  # Iteratively calculates the next job distribution: each new job goes to the
  # worker with the smallest load so far.
  # TODO Find a formula
  #
  # workload - Hash of worker id => current queue length (not modified).
  # jobs     - Integer number of jobs to distribute.
  #
  # Returns a Hash of worker id => number of new jobs; empty when there are
  # no workers.
  def job_distribution(workload, jobs)
    distribution = Hash.new(0)
    return distribution if workload.empty?

    loads = workload.clone
    jobs.times do
      least_loaded = loads.min_by { |_id, load| load }.first
      loads[least_loaded] += 1
      distribution[least_loaded] += 1
    end
    distribution
  end
end
|
89
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module Gandalf
|
2
|
+
# A magical worker
#
# Pops crawl jobs from its own Redis-backed queue, fetches the feeds and
# stores their entries as posts. Failed jobs are re-queued until they have
# failed +max_errors+ times.
class Worker
  include DataMapper::Resource

  property :id, Serial
  # Maximum number of jobs per interval
  property :max_jobs, Integer, :default => 30
  # Sleep length in seconds, initialized to 15
  property :interval, Integer, :default => 15
  property :max_errors, Integer, :length => 1, :default => 2

  belongs_to :scheduler

  # Prepares the job queue (keyed by this worker's id) and the post model.
  #
  # options[:redis]      - Redis connection handed to the queue.
  # options[:post_class] - model used to parse and store entries. Falls back
  #                        to Post when the key is missing; Scheduler#setup
  #                        calls this with only :redis.
  def setup(options = {:post_class => Post})
    @queue ||= RedisQueue.new(:key => id, :redis => options[:redis])
    @Post = options[:post_class] || Post
  end

  # Starts crawling up to +max_jobs+ queued jobs every +interval+.
  def run
    @crawl_scheduler ||= Rufus::Scheduler.start_new
    @crawl_scheduler.every(interval) { crawl(new_jobs(max_jobs)) }
  end

  # Stops the crawl scheduler, if one has been created by +run+.
  def stop
    @crawl_scheduler.stop if @crawl_scheduler
  end

  # Restarts the crawl scheduler, if one has been created by +run+.
  # (Previously this body was a second definition of +stop+, which replaced
  # the real one.)
  def start
    @crawl_scheduler.start if @crawl_scheduler
  end

  # Fetches every job's feed and stores its posts, or records an error.
  #
  # jobs - Hash of url => job Hash, as returned by +new_jobs+.
  def crawl(jobs)
    return if jobs.empty?

    feeds = Feedzirra::Feed.fetch_and_parse(jobs.keys)
    jobs.each do |url, job|
      feed = feeds[url]
      # NOTE(review): any result that is not an RSS parser instance - including
      # feeds parsed by other Feedzirra parser classes - is treated as an
      # error; confirm that this is intended.
      if feed.is_a? Feedzirra::Parser::RSS
        save_posts(feed, job[:id])
      else
        handle_error(job)
      end
    end
  end

  # Saves the entries of +feed+ as posts belonging to +channel_id+.
  # Stops at the first entry whose save raises MysqlError.
  def save_posts(feed, channel_id)
    @Post.parse(feed).each do |post|
      post.channel_id = channel_id
      post.clean!
      begin
        post.save
      rescue MysqlError
        # presumably a duplicate cache_link, i.e. an entry stored by an earlier
        # crawl - TODO confirm
        break
      end
    end
  end

  # Counts a failure for +job+ and either re-queues it or, once it has failed
  # +max_errors+ times, prints it and drops it.
  def handle_error(job)
    # Integer rather than Fixnum: Fixnum no longer exists on current Rubies,
    # and Integer matches the same values on old ones.
    job[:errors] = job[:errors].is_a?(Integer) ? job[:errors] + 1 : 1

    if job[:errors] >= max_errors
      puts job
    else
      @queue.push(job)
    end
  end

  # Returns the number of jobs waiting in this worker's queue.
  def jobs_to_do
    @queue.length
  end

  # Appends every job in +jobs+ to this worker's queue.
  def push(jobs)
    jobs.each { |job| @queue.push(job) }
  end

  # Pops up to +count+ jobs from the queue.
  #
  # Returns a Hash of url => job Hash.
  def new_jobs(count)
    by_url = {}
    @queue.pop_first(count).each { |job| by_url[job[:url]] = job }
    by_url
  end
end
|
93
|
+
end
|
data/lib/gandalf.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'feedzirra'
|
4
|
+
require 'activesupport'
|
5
|
+
require 'rufus/scheduler'
|
6
|
+
require 'dm-core'
|
7
|
+
#has a bug
|
8
|
+
#require 'dm-aggregates'
|
9
|
+
|
10
|
+
require 'digest/md5'
|
11
|
+
require 'json'
|
12
|
+
|
13
|
+
require 'redis_ext/redis_queue'
|
14
|
+
|
15
|
+
require 'gandalf/scheduler'
|
16
|
+
require 'gandalf/worker'
|
17
|
+
require 'gandalf/models'
|
18
|
+
|
19
|
+
module Gandalf
  # Version of the gandalf gem; frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'redis'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# A first-in, first-out queue stored in a Redis list.
#
# Values are serialised with +to_json+ when pushed and parsed back into a Hash
# with symbol top-level keys when popped.
class RedisQueue
  attr_accessor :key, :redis

  # options[:key]   - name of the Redis list backing this queue.
  # options[:redis] - connection to reuse; when absent, a new Redis client is
  #                   created from +options+.
  def initialize(options = {})
    @key = options[:key]
    @redis = options[:redis] || Redis.new(options)
  end

  # Appends +value+, serialised as JSON, to the end of the queue.
  def push(value)
    @redis.rpush(@key, value.to_json)
  end

  # Removes and returns the element at the front of the queue.
  #
  # An empty queue is the normal idle case and returns nil silently. Any other
  # StandardError (malformed JSON, a non-Hash payload, a connection failure)
  # is printed and also yields nil. Interrupt, SystemExit and other
  # non-StandardError exceptions are no longer swallowed.
  #
  # Returns a Hash whose top-level keys are symbols, or nil.
  def pop
    raw = @redis.lpop(@key)
    return nil if raw.nil?

    element = {}
    JSON.parse(raw).each { |name, value| element[name.to_sym] = value }
    element
  rescue StandardError => ex
    puts ex
    nil
  end

  # Returns the number of elements currently in the queue.
  def length
    @redis.llen(@key)
  end

  # Pops up to +length+ elements, stopping early when the queue runs dry or
  # an element cannot be read.
  #
  # Returns an Array of Hashes, in queue order.
  def pop_first(length)
    list = []
    length.times do
      element = pop
      break unless element
      list << element
    end
    list
  end
end
|
metadata
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gandalf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kijun Seo
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-09-30 00:00:00 +09:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: feedzirra
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activesupport
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: rufus-scheduler
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: dm-core
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "0"
|
54
|
+
version:
|
55
|
+
description: Scheduler and Worker makes a unit. You can also make your own worker as well.
|
56
|
+
email: kijun@kijunseo.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files: []
|
62
|
+
|
63
|
+
files:
|
64
|
+
- lib/redis_ext/redis_queue.rb
|
65
|
+
- lib/gandalf.rb
|
66
|
+
- lib/gandalf/models.rb
|
67
|
+
- lib/gandalf/scheduler.rb
|
68
|
+
- lib/gandalf/worker.rb
|
69
|
+
has_rdoc: true
|
70
|
+
homepage: http://github.com/kijun/gandalf
|
71
|
+
licenses: []
|
72
|
+
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options: []
|
75
|
+
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: "0"
|
83
|
+
version:
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: "0"
|
89
|
+
version:
|
90
|
+
requirements: []
|
91
|
+
|
92
|
+
rubyforge_project:
|
93
|
+
rubygems_version: 1.3.5
|
94
|
+
signing_key:
|
95
|
+
specification_version: 3
|
96
|
+
summary: A magical distributed web crawler
|
97
|
+
test_files: []
|
98
|
+
|