gandalf 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,55 @@
1
+ module Gandalf
2
# A feed source ("seed") that the crawler polls for new posts.
class Seed
  include DataMapper::Resource

  property :id, Serial
  # Scheduler#new_jobs filters on this flag when picking seeds to crawl.
  property :include_update, Boolean, :index => true
  property :interval_update, Integer
  property :url, String, :length => 255, :unique_index => true

  # Serializes the seed's four properties as a JSON object.
  #
  # Accepts (and forwards) the optional generator arguments so that the
  # seed can also be serialized when nested inside an Array or Hash: the
  # JSON generator calls +to_json(state)+ on nested values, which a
  # zero-argument definition would reject with ArgumentError.
  #
  # Returns a String.
  def to_json(*args)
    {:id => id,
     :include_update => include_update,
     :interval_update => interval_update,
     :url => url}.to_json(*args)
  end
end
17
+
18
# A single feed entry, stored in the 'posts' table and linked to its Seed
# through channel_id.
class Post
  include DataMapper::Resource

  storage_names[:default] = 'posts'
  property :id, Serial, :field => 'psid'
  property :channel_id, Integer, :length => 11, :index => true
  property :link, Text, :length => 255
  property :title, String, :length => 255
  property :author, String, :lazy => true
  property :pub_date, DateTime, :field => 'pubDate'
  # The default is a callable so the timestamp is taken when each record is
  # built. A bare DateTime.now here would be evaluated only once, when the
  # class is loaded, and every post would share that stale value.
  property :update_date, DateTime, :field => 'updateDate',
                                   :default => lambda { |_resource, _property| DateTime.now }
  property :description, Text
  # MD5 hex digest of the entry URL (see Post.parse).
  property :cache_link, String, :length => 32, :unique_index => true

  belongs_to :seed, :child_key => [:channel_id]

  # Normalizes the post before saving:
  # * truncates the title to 255 characters (the column length);
  # * replaces HTML tags, newlines and "&nbsp;" in the description with a
  #   space and collapses runs of whitespace.
  #
  # Values are assigned through the property writers rather than mutated
  # in place, and a missing title or description is left untouched.
  def clean!
    self.title = title[0, 255] if title
    return unless description

    stripped = description.gsub(/\<[^\>]+\>|\n|&nbsp;/, ' ')
    self.description = stripped.gsub(/\s{2,}/, ' ')
  end

  # Builds one unsaved Post per entry of a parsed feed.
  #
  # feed - an object whose +entries+ respond to title, url, author,
  #        summary and published.
  #
  # Entries without a publication date get a nil pub_date instead of
  # raising NoMethodError.
  #
  # Returns an Array of Post instances.
  def self.parse(feed)
    feed.entries.map do |entry|
      published = entry.published
      new(:title => entry.title,
          :link => entry.url,
          :author => entry.author,
          :description => entry.summary,
          :pub_date => (published.to_datetime if published),
          :cache_link => Digest::MD5.hexdigest(entry.url))
    end
  end
end
55
+ end
@@ -0,0 +1,89 @@
1
+ module Gandalf
2
+ # A magical scheduler
3
# Persists scheduling state and, on every tick, hands the next batch of
# seeds to the least-loaded workers.
class Scheduler
  include DataMapper::Resource

  # Length of the window over which all seeds are spread.
  SECONDS_PER_DAY = 1440 * 60

  property :id, Serial
  property :redis_host, String, :default => nil
  property :redis_db_id, Integer, :default => 0
  property :seed_table, String
  property :seed_count, Integer, :default => 0
  property :last_job_id, Integer, :default => 0
  # Sleep length in seconds, initialized to 60
  property :interval, Integer, :length => 5, :default => 60

  has n, :workers

  attr_accessor :redis

  # Opens the Redis connection and passes it to every worker's setup.
  #
  # options[:seed_class] - model used to look up seeds. Falls back to Seed
  #                        when the key is missing, so passing an options
  #                        hash without it no longer leaves the class nil.
  def setup(options = {:seed_class => Seed})
    @redis = Redis.new(:host => redis_host, :db => redis_db_id)
    @seed_class = options[:seed_class] || Seed
    workers.each { |worker| worker.setup(:redis => @redis) }
  end

  # Starts a Rufus scheduler that calls #execute every +interval+ seconds
  # and refreshes seed_count every ten intervals.
  def run
    scheduler = Rufus::Scheduler.start_new
    scheduler.every(interval) { execute }

    scheduler.every(10 * interval) do
      # TODO Use dm-aggregates when the bug gets fixed.
      # The raw query returns a list of result values, not a number, so
      # take the single COUNT(*) value before assigning the property.
      rows = repository.adapter.query("SELECT COUNT(*) FROM #{seed_table} WHERE include_update = 1")
      self.seed_count = rows.first.to_i
      save
    end
  end

  # Fetches the next batch of seeds, records how far scheduling has got and
  # distributes the batch over the workers. Does nothing when there are no
  # seeds to schedule.
  def execute
    jobs = new_jobs
    return if jobs.empty?

    self.last_job_id = jobs.last.id
    save

    new_loads = job_distribution(current_workload, jobs.length)
    push_jobs(jobs, new_loads)
  end

  # Number of seeds to schedule per tick so that all seed_count seeds are
  # covered within one day. Rounds up, so a small seed count still yields
  # one job per tick instead of truncating to zero, and treats intervals
  # longer than a day as a single tick per day (avoids division by zero).
  def jobs_per_interval
    ticks_per_day = [SECONDS_PER_DAY / interval, 1].max
    (seed_count + ticks_per_day - 1) / ticks_per_day
  end

  # Returns the next batch of updatable seeds: those with an id strictly
  # greater than last_job_id (so the last scheduled seed is not repeated),
  # topped up from the start of the table when the end is reached.
  def new_jobs
    batch_size = jobs_per_interval
    return [] if batch_size <= 0

    jobs = @seed_class.all(:id.gt => last_job_id,
                           :include_update => true,
                           :limit => batch_size)
    if jobs.length < batch_size
      # Wrap around; keep the same include_update filter as above.
      jobs += @seed_class.all(:include_update => true,
                              :limit => batch_size - jobs.length)
    end
    jobs
  end

  # Returns a Hash of worker id => number of jobs waiting in its queue.
  def current_workload
    workload = {}
    workers.each { |worker| workload[worker.id] = worker.jobs_to_do }
    workload
  end

  # Pushes jobs to workers according to +workload+ (worker id => number of
  # jobs to hand over). Consumes +jobs+ from the front via slice!.
  def push_jobs(jobs, workload)
    workload.each do |worker_id, wload|
      worker = workers.get(worker_id)
      worker.push(jobs.slice!(0, wload))
    end
  end

  # Greedily calculates the next job distribution: each job in turn goes to
  # the worker with the smallest projected load.
  # TODO Find a formula
  #
  # workload - Hash of worker id => current queue length (not modified).
  # jobs     - Integer number of jobs to distribute.
  #
  # Returns a Hash of worker id => number of new jobs; empty when there are
  # no workers.
  def job_distribution(workload, jobs)
    distribution = Hash.new(0)
    return distribution if workload.empty?

    projected = workload.dup
    jobs.times do
      worker_id = projected.min_by { |_id, pending| pending }.first
      projected[worker_id] += 1
      distribution[worker_id] += 1
    end
    distribution
  end
end
89
+ end
@@ -0,0 +1,93 @@
1
+ module Gandalf
2
+ # A magical slave
3
# Pulls crawl jobs from its own Redis queue on a timer, fetches the feeds
# and stores the resulting posts.
class Worker
  include DataMapper::Resource

  property :id, Serial
  # Maximum number of jobs per interval
  property :max_jobs, Integer, :default => 30
  # Sleep length in seconds, initialized to 15
  property :interval, Integer, :default => 15
  property :max_errors, Integer, :length => 1, :default => 2

  belongs_to :scheduler

  # Prepares the job queue (keyed by this worker's id) and the post model.
  #
  # options[:redis]      - Redis connection handed to the queue.
  # options[:post_class] - model used to parse and store posts. Falls back
  #                        to Post when the key is missing, which is the
  #                        case when Scheduler#setup calls
  #                        setup(:redis => ...).
  def setup(options = {:post_class => Post})
    @queue ||= RedisQueue.new(:key => id, :redis => options[:redis])
    @post_class = options[:post_class] || Post
  end

  # Starts (once) a Rufus scheduler that crawls up to max_jobs queued jobs
  # every +interval+ seconds.
  def run
    @crawl_scheduler ||= Rufus::Scheduler.start_new
    @crawl_scheduler.every(interval) { crawl new_jobs(max_jobs) }
  end

  # Stops the crawl scheduler, if #run has created one.
  def stop
    @crawl_scheduler.stop if @crawl_scheduler
  end

  # Restarts the crawl scheduler, if #run has created one. This method was
  # previously a second definition of #stop, which silently replaced the
  # real #stop above.
  def start
    @crawl_scheduler.start if @crawl_scheduler
  end

  # Fetches all job URLs in one call and saves the posts of every feed that
  # parsed; any other result for a URL is counted as an error for that job.
  #
  # jobs - Hash of url => job hash (as built by #new_jobs).
  def crawl(jobs)
    return if jobs.empty?

    feeds = Feedzirra::Feed.fetch_and_parse(jobs.keys)
    jobs.each do |url, job|
      feed = feeds[url]
      # NOTE(review): only RSS parser results are accepted here, so feeds
      # parsed by another Feedzirra parser take the error path - confirm
      # this is intended.
      if feed.is_a? Feedzirra::Parser::RSS
        save_posts(feed, job[:id])
      else
        handle_error(job)
      end
    end
  end

  # Parses the feed into posts, tags them with the channel id, cleans and
  # saves them. Stops at the first MysqlError.
  def save_posts(feed, channel_id)
    @post_class.parse(feed).each do |post|
      post.channel_id = channel_id
      post.clean!
      begin
        post.save
      rescue MysqlError
        # NOTE(review): presumably a unique cache_link violation, meaning
        # the remaining entries were already stored - confirm.
        break
      end
    end
  end

  # Increments the job's error counter; re-queues the job until the counter
  # reaches max_errors, after which the job is printed and dropped.
  def handle_error(job)
    previous = job[:errors]
    job[:errors] = previous.is_a?(Integer) ? previous + 1 : 1

    if job[:errors] >= max_errors
      puts job
    else
      @queue.push(job)
    end
  end

  # Number of jobs currently waiting in this worker's queue.
  def jobs_to_do
    @queue.length
  end

  # Appends each of the given jobs to this worker's queue.
  def push(jobs)
    jobs.each { |job| @queue.push(job) }
  end

  # Pops up to +count+ jobs from the queue.
  #
  # Returns a Hash of job url => job hash.
  def new_jobs(count)
    @queue.pop_first(count).inject({}) do |by_url, job|
      by_url[job[:url]] = job
      by_url
    end
  end
end
93
+ end
data/lib/gandalf.rb ADDED
@@ -0,0 +1,21 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ require 'feedzirra'
4
+ require 'activesupport'
5
+ require 'rufus/scheduler'
6
+ require 'dm-core'
7
+ #has a bug
8
+ #require 'dm-aggregates'
9
+
10
+ require 'digest/md5'
11
+ require 'json'
12
+
13
+ require 'redis_ext/redis_queue'
14
+
15
+ require 'gandalf/scheduler'
16
+ require 'gandalf/worker'
17
+ require 'gandalf/models'
18
+
19
module Gandalf
  # Gem version string; frozen so that callers cannot mutate the constant.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,42 @@
1
+ require 'redis'
2
+ require 'json'
3
+
4
# A FIFO queue of JSON-serialized hashes stored in a Redis list.
class RedisQueue

  attr_accessor :key, :redis

  # options[:key]   - name of the Redis list backing the queue.
  # options[:redis] - an existing connection to use. When absent, a new
  #                   connection is created from the remaining options
  #                   (the queue's own :key/:redis entries are not passed
  #                   on to Redis.new).
  def initialize(options = {})
    @key = options[:key]
    @redis = options[:redis] || Redis.new(connection_options(options))
  end

  # Appends +value+, serialized with to_json, to the end of the queue.
  def push(value)
    @redis.rpush(@key, value.to_json)
  end

  # Removes the element at the front of the queue.
  #
  # Returns the decoded Hash with its top-level keys converted to symbols,
  # or nil when the queue is empty. An element that cannot be fetched or
  # decoded is reported on stdout and also yields nil (best effort), but
  # only StandardError is rescued so signals and exits are not swallowed.
  def pop
    raw = @redis.lpop(@key)
    # Empty queue: not an error, so return quietly.
    return nil if raw.nil?

    JSON.parse(raw).inject({}) do |job, (name, val)|
      job[name.to_sym] = val
      job
    end
  rescue StandardError => ex
    puts ex
    nil
  end

  # Number of elements currently in the queue.
  def length
    @redis.llen(@key)
  end

  # Pops up to +length+ elements, stopping early at the first nil returned
  # by #pop.
  #
  # Returns an Array of Hashes in queue order.
  def pop_first(length)
    list = []
    length.times do
      element = pop
      break unless element
      list << element
    end
    list
  end

  private

  # Options meant for Redis.new, i.e. everything except the queue's own.
  def connection_options(options)
    options.reject { |name, _value| name == :key || name == :redis }
  end
end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gandalf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Kijun Seo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-09-30 00:00:00 +09:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: feedzirra
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: activesupport
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: rufus-scheduler
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: dm-core
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ description: Scheduler and Worker makes a unit. You can also make your own worker as well.
56
+ email: kijun@kijunseo.com
57
+ executables: []
58
+
59
+ extensions: []
60
+
61
+ extra_rdoc_files: []
62
+
63
+ files:
64
+ - lib/redis_ext/redis_queue.rb
65
+ - lib/gandalf.rb
66
+ - lib/gandalf/models.rb
67
+ - lib/gandalf/scheduler.rb
68
+ - lib/gandalf/worker.rb
69
+ has_rdoc: true
70
+ homepage: http://github.com/kijun/gandalf
71
+ licenses: []
72
+
73
+ post_install_message:
74
+ rdoc_options: []
75
+
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: "0"
83
+ version:
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: "0"
89
+ version:
90
+ requirements: []
91
+
92
+ rubyforge_project:
93
+ rubygems_version: 1.3.5
94
+ signing_key:
95
+ specification_version: 3
96
+ summary: A magical distributed web crawler
97
+ test_files: []
98
+