gandalf 0.0.1

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between package versions.
data/lib/gandalf/models.rb ADDED
@@ -0,0 +1,55 @@
+ module Gandalf
+   class Seed
+     include DataMapper::Resource
+
+     property :id, Serial
+     property :include_update, Boolean, :index => true
+     property :interval_update, Integer
+     property :url, String, :length => 255, :unique_index => true
+
+     def to_json
+       {:id => id,
+        :include_update => include_update,
+        :interval_update => interval_update,
+        :url => url}.to_json
+     end
+   end
+
+   class Post
+     include DataMapper::Resource
+
+     storage_names[:default] = 'posts'
+     property :id, Serial, :field => 'psid'
+     property :channel_id, Integer, :length => 11, :index => true
+     property :link, Text, :length => 255
+     property :title, String, :length => 255
+     property :author, String, :lazy => true
+     property :pub_date, DateTime, :field => 'pubDate'
+     property :update_date, DateTime, :field => 'updateDate', :default => DateTime.now
+     property :description, Text
+     property :cache_link, String, :length => 32, :unique_index => true
+
+     belongs_to :seed, :child_key => [:channel_id]
+
+     def clean!
+       self.title = self.title[0, 255]
+       if self.description
+         self.description.gsub!(/\<[^\>]+\>|\n|&nbsp;/, ' ')
+         self.description.gsub!(/\s{2,}/, ' ')
+       end
+     end
+
+     def Post.parse(feed)
+       feed.entries.map do |entry|
+         post = self.new({
+           :title => entry.title,
+           :link => entry.url,
+           :author => entry.author,
+           :description => entry.summary,
+           :pub_date => entry.published.to_datetime,
+           :cache_link => Digest::MD5.hexdigest(entry.url)
+         })
+       end
+     end
+   end
+ end
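For context, Post.parse expects a parsed Feedzirra feed object and returns unsaved Post resources. A minimal sketch of how it might be driven by hand, assuming a reachable feed URL and an already-configured DataMapper connection (the URL and channel id below are placeholders):

    feed  = Feedzirra::Feed.fetch_and_parse('http://example.com/feed.xml')  # placeholder URL
    posts = Gandalf::Post.parse(feed)
    posts.each do |post|
      post.channel_id = 1   # placeholder Seed id linking the post to its feed
      post.clean!           # truncate the title and strip markup from the description
      post.save
    end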
data/lib/gandalf/scheduler.rb ADDED
@@ -0,0 +1,89 @@
+ module Gandalf
+   # A magical scheduler
+   class Scheduler
+     include DataMapper::Resource
+
+     property :id, Serial
+     property :redis_host, String, :default => nil
+     property :redis_db_id, Integer, :default => 0
+     property :seed_table, String
+     property :seed_count, Integer, :default => 0
+     property :last_job_id, Integer, :default => 0
+     # Sleep length in seconds, initialized to 60
+     property :interval, Integer, :length => 5, :default => 60
+
+     has n, :workers
+
+     attr_accessor :redis
+
+     # Sets workers' queues with Redis connection object.
+     def setup(options = {:seed_class => Seed})
+       @redis = Redis.new(:host => self.redis_host, :db => self.redis_db_id)
+       @Seed = options[:seed_class]
+       workers.each { |worker| worker.setup(:redis => @redis) }
+     end
+
+     # Executes jobs using a scheduler
+     def run
+       scheduler = Rufus::Scheduler.start_new
+       scheduler.every interval do
+         execute
+       end
+
+       scheduler.every 10 * interval do
+         # TODO Use dm-aggregates when the bug gets fixed.
+         self.seed_count = repository.adapter.query("SELECT COUNT(*) FROM #{seed_table} WHERE include_update = 1")
+         save
+       end
+     end
+
+     def execute
+       jobs = new_jobs
+       self.last_job_id = jobs.last.id
+       save
+
+       new_loads = job_distribution(current_workload, jobs.count)
+       push_jobs(jobs, new_loads)
+     end
+
+     def jobs_per_interval
+       seed_count / (1440 * 60 / interval)
+     end
+
+     def new_jobs
+       jobs = @Seed.all(:id.gte => last_job_id,
+                        :include_update => true,
+                        :limit => jobs_per_interval)
+       if jobs.length < jobs_per_interval
+         jobs += @Seed.all(:limit => jobs_per_interval - jobs.length)
+       end
+       jobs
+     end
+
+     def current_workload
+       workload = {}
+       workers.each { |worker| workload[worker.id] = worker.jobs_to_do }
+       workload
+     end
+
+     def push_jobs(jobs, workload)
+       workload.each do |worker_id, wload|
+         worker = workers.get(worker_id)
+         worker.push(jobs.slice!(0, wload))
+       end
+     end
+
+     # Greedily assigns each new job to the least-loaded worker
+     # TODO Find a formula
+     def job_distribution(workload, jobs)
+       workload = workload.clone
+       distribution = Hash.new(0)
+       jobs.times do
+         min_index = workload.min_by { |k, v| v }.first
+         workload[min_index] += 1
+         distribution[min_index] += 1
+       end
+       distribution
+     end
+   end
+ end
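The job_distribution helper is a greedy balancer: each new job goes to whichever worker currently has the smallest load. A standalone restatement of the same logic, runnable outside the DataMapper resource, with an assumed workload of three workers:

    # Same greedy least-loaded assignment as Scheduler#job_distribution,
    # lifted out of the class so it can run on its own.
    def job_distribution(workload, jobs)
      workload = workload.clone
      distribution = Hash.new(0)
      jobs.times do
        min_index = workload.min_by { |_worker_id, load| load }.first
        workload[min_index] += 1
        distribution[min_index] += 1
      end
      distribution
    end

    # Four new jobs over workers 1..3 that already have 2, 0 and 1 queued jobs;
    # ties go to whichever worker is enumerated first.
    job_distribution({1 => 2, 2 => 0, 3 => 1}, 4)
    # => {2 => 2, 3 => 1, 1 => 1}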
data/lib/gandalf/worker.rb ADDED
@@ -0,0 +1,93 @@
+ module Gandalf
+   # A magical slave
+   class Worker
+     include DataMapper::Resource
+
+     property :id, Serial
+     # Maximum number of jobs per interval
+     property :max_jobs, Integer, :default => 30
+     # Sleep length in seconds, initialized to 15
+     property :interval, Integer, :default => 15
+     property :max_errors, Integer, :length => 1, :default => 2
+
+     belongs_to :scheduler
+
+     def setup(options = {:post_class => Post})
+       @queue = RedisQueue.new(:key => self.id, :redis => options[:redis]) unless @queue
+       @Post = options[:post_class]
+     end
+
+     def run
+       @crawl_scheduler = Rufus::Scheduler.start_new unless @crawl_scheduler
+       @crawl_scheduler.every interval do
+         crawl new_jobs(max_jobs)
+       end
+     end
+
+     def stop
+       @crawl_scheduler.stop
+     end
+
+     def start
+       @crawl_scheduler.start
+     end
+
+     def crawl(jobs)
+       urls = jobs.keys
+       feeds = Feedzirra::Feed.fetch_and_parse(urls)
+       jobs.each do |url, job|
+         if feeds[url].is_a? Feedzirra::Parser::RSS
+           save_posts(feeds[url], job[:id])
+         else
+           handle_error(job)
+         end
+       end
+     end
+
+     def save_posts(feed, channel_id)
+       posts = @Post.parse(feed)
+       posts.each do |p|
+         p.channel_id = channel_id
+         p.clean!
+         begin
+           p.save
+         rescue MysqlError => err
+           break
+         end
+       end
+     end
+
+     def handle_error(job)
+       if job[:errors].is_a? Fixnum
+         job[:errors] += 1
+       else
+         job[:errors] = 1
+       end
+
+       if job[:errors] >= max_errors
+         puts job
+       else
+         @queue.push(job)
+       end
+     end
+
+     def jobs_to_do
+       @queue.length
+     end
+
+     def push(jobs)
+       jobs.each do |job|
+         @queue.push(job)
+       end
+     end
+
+     def new_jobs(count)
+       jobs = @queue.pop_first(count)
+       hash = {}
+       jobs.each do |job|
+         hash[job[:url]] = job
+       end
+       hash
+     end
+   end
+ end
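The jobs moving between Scheduler and Worker are plain hashes: Scheduler#push_jobs pushes Seed records, Seed#to_json decides what gets serialized, and RedisQueue#pop symbolizes the keys on the way back out. An illustrative payload (all values are placeholders) as the Worker sees it in new_jobs and handle_error:

    job = {
      :id              => 7,                              # Seed id, used as channel_id when saving posts
      :include_update  => true,
      :interval_update => 3600,
      :url             => 'http://example.com/feed.xml',  # key used to match the Feedzirra result
      :errors          => 1                               # added by handle_error after a failed fetch
    }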
data/lib/gandalf.rb ADDED
@@ -0,0 +1,21 @@
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+
+ require 'feedzirra'
+ require 'activesupport'
+ require 'rufus/scheduler'
+ require 'dm-core'
+ # has a bug
+ # require 'dm-aggregates'
+
+ require 'digest/md5'
+ require 'json'
+
+ require 'redis_ext/redis_queue'
+
+ require 'gandalf/scheduler'
+ require 'gandalf/worker'
+ require 'gandalf/models'
+
+ module Gandalf
+   VERSION = "0.0.1"
+ end
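None of the files above show how the pieces are wired together, so here is a hypothetical boot script (not shipped with the gem). The database URL, seed table name, and worker settings are assumptions, auto_migrate! is destructive, and a Redis server is expected on localhost:

    require 'rubygems'
    require 'gandalf'

    DataMapper.setup(:default, 'mysql://localhost/gandalf')   # assumed connection string
    DataMapper.auto_migrate!                                   # assumed; drops and recreates tables

    scheduler = Gandalf::Scheduler.create(
      :redis_host => 'localhost',
      :seed_table => 'gandalf_seeds'                           # assumed storage name for Seed
    )
    scheduler.workers.create(:max_jobs => 30, :interval => 15)

    scheduler.setup                                            # opens Redis and hands it to each worker
    scheduler.workers.each { |worker| worker.run }
    scheduler.run

    sleep                                                      # keep the process alive for the background schedulers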
data/lib/redis_ext/redis_queue.rb ADDED
@@ -0,0 +1,42 @@
+ require 'redis'
+ require 'json'
+
+ class RedisQueue
+
+   attr_accessor :key, :redis
+
+   def initialize(options = {})
+     @key = options[:key]
+     @redis = (options[:redis] || Redis.new(options))
+   end
+
+   def push(value)
+     @redis.rpush(@key, value.to_json)
+   end
+
+   def pop
+     begin
+       value = JSON.parse(@redis.lpop(@key))
+       new_hash = {}
+       value.each { |k, v| new_hash[k.to_sym] = v }
+       return new_hash
+     rescue Exception => ex
+       puts ex
+       return nil
+     end
+   end
+
+   def length
+     @redis.llen(@key)
+   end
+
+   def pop_first(length)
+     list = []
+     length.times do
+       element = self.pop
+       break unless element
+       list << element
+     end
+     list
+   end
+ end
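RedisQueue is a thin JSON-serialising wrapper over a Redis list. A small usage sketch, assuming a Redis server on the default port (the key name and job values are placeholders):

    queue = RedisQueue.new(:key => 'gandalf:worker:1')   # falls back to Redis.new(options) when no :redis is given
    queue.push(:id => 42, :url => 'http://example.com/feed.xml')
    queue.length   # => 1
    queue.pop      # => {:id => 42, :url => "http://example.com/feed.xml"}
    queue.pop      # lpop returns nil, JSON.parse raises, pop rescues and returns nil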
metadata ADDED
@@ -0,0 +1,98 @@
+ --- !ruby/object:Gem::Specification
+ name: gandalf
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Kijun Seo
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-09-30 00:00:00 +09:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: feedzirra
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ - !ruby/object:Gem::Dependency
+   name: activesupport
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ - !ruby/object:Gem::Dependency
+   name: rufus-scheduler
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ - !ruby/object:Gem::Dependency
+   name: dm-core
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ description: Scheduler and Worker makes a unit. You can also make your own worker as well.
+ email: kijun@kijunseo.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/redis_ext/redis_queue.rb
+ - lib/gandalf.rb
+ - lib/gandalf/models.rb
+ - lib/gandalf/scheduler.rb
+ - lib/gandalf/worker.rb
+ has_rdoc: true
+ homepage: http://github.com/kijun/gandalf
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: A magical distributed web crawler
+ test_files: []
+