mrflip-edamame 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +8 -0
- data/.gitignore +31 -0
- data/LICENSE.textile +21 -0
- data/README.textile +178 -0
- data/Rakefile +75 -0
- data/VERSION +1 -0
- data/app/edamame_san/config.ru +4 -0
- data/app/edamame_san/config.yml +17 -0
- data/app/edamame_san/config/.gitignore +1 -0
- data/app/edamame_san/edamame_san.rb +71 -0
- data/app/edamame_san/public/favicon.ico +0 -0
- data/app/edamame_san/public/images/edamame_logo.icns +0 -0
- data/app/edamame_san/public/images/edamame_logo.ico +0 -0
- data/app/edamame_san/public/images/edamame_logo.png +0 -0
- data/app/edamame_san/public/images/edamame_logo_2.icns +0 -0
- data/app/edamame_san/public/javascripts/application.js +8 -0
- data/app/edamame_san/public/javascripts/jquery/jquery-ui.js +8694 -0
- data/app/edamame_san/public/javascripts/jquery/jquery.js +4376 -0
- data/app/edamame_san/public/stylesheets/application.css +32 -0
- data/app/edamame_san/public/stylesheets/layout.css +88 -0
- data/app/edamame_san/views/layout.haml +13 -0
- data/app/edamame_san/views/load.haml +37 -0
- data/app/edamame_san/views/root.haml +25 -0
- data/bin/edamame-ps +2 -0
- data/bin/empty_all.rb +15 -0
- data/bin/stats.rb +13 -0
- data/bin/sync.rb +15 -0
- data/bin/test_run.rb +14 -0
- data/edamame.gemspec +110 -0
- data/lib/edamame.rb +193 -0
- data/lib/edamame/job.rb +134 -0
- data/lib/edamame/queue.rb +6 -0
- data/lib/edamame/queue/beanstalk.rb +132 -0
- data/lib/edamame/rescheduled.rb +89 -0
- data/lib/edamame/scheduling.rb +69 -0
- data/lib/edamame/store.rb +8 -0
- data/lib/edamame/store/base.rb +62 -0
- data/lib/edamame/store/tyrant_store.rb +50 -0
- data/lib/methods.txt +94 -0
- data/spec/edamame_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- data/utils/god/README-god.textile +54 -0
- data/utils/god/beanstalkd_god.rb +34 -0
- data/utils/god/edamame.god +30 -0
- data/utils/god/god-etc-init-dot-d-example +40 -0
- data/utils/god/god_email.rb +45 -0
- data/utils/god/god_process.rb +140 -0
- data/utils/god/god_site_config.rb +4 -0
- data/utils/god/sinatra_god.rb +36 -0
- data/utils/god/tyrant_god.rb +67 -0
- data/utils/simulation/Add Percent Variation.vi +0 -0
- data/utils/simulation/Harmonic Average.vi +0 -0
- data/utils/simulation/Rescheduling Simulation.aliases +3 -0
- data/utils/simulation/Rescheduling Simulation.lvlps +3 -0
- data/utils/simulation/Rescheduling Simulation.lvproj +22 -0
- data/utils/simulation/Rescheduling.vi +0 -0
- data/utils/simulation/Weighted Average.vi +0 -0
- metadata +135 -0
data/lib/edamame/job.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
module Edamame
|
2
|
+
|
3
|
+
#
|
4
|
+
#
|
5
|
+
# id, name, body, timeouts, time-left, age, state, delay, pri, ttr
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# * A job, pulled from the queue: it is connected to its beanstalk presence
|
9
|
+
# body contains
|
10
|
+
# ** obj
|
11
|
+
# ** scheduling
|
12
|
+
# ** stats
|
13
|
+
#
|
14
|
+
# * A DB job
|
15
|
+
# body contains
|
16
|
+
# ** tube, priority, ttr, state
|
17
|
+
# ** obj
|
18
|
+
# ** scheduling
|
19
|
+
# ** stats
|
20
|
+
# Behaviour shared by the two job flavours in this file: Beanstalk::Job (a job
# pulled off the queue) and Edamame::Job (the struct-backed job).
#
# The including class must provide #tube, #obj and #scheduling.
module JobCore

  # Identifier for the job: the tube name and the object's key joined by '-'.
  # The key is looked up under the symbol :key first, then the string 'key'.
  def key
    [ tube, obj[:key] || obj['key'] ].join('-')
  end

  # Time elapsed since the job last ran.
  #
  # Delegates to the scheduling object so that this agrees with
  # Scheduling::Base#since_last (Time.now minus the last run time). The
  # previous implementation subtracted in the opposite order, which yielded a
  # negative value, and it could not cope with a last_run stored as a String.
  def since_last
    scheduling.since_last
  end

  # Beanstalk::Job stats:
  #    { "pri"=>65536, "ttr"=>120,
  #      "releases"=>8, "delay"=>5, "kicks"=>0, "buries"=>0, "id"=>202,
  #      "tube"=>"default", "time-left"=>120,
  #      "timeouts"=>0, "age"=>1415, "state"=>"reserved"}
  #
  # [ "id",
  #   "tube", "pri", "ttr", "state",
  #   "delay",
  #   "releases", "kicks", "buries",
  #   "time-left", "timeouts", "age", ]
end
|
43
|
+
|
44
|
+
|
45
|
+
# Reopens Beanstalk::Job so that a job reserved from the queue exposes the
# same accessors (tube, priority, scheduling, obj, to_hash) as Edamame::Job.
Beanstalk::Job.class_eval do
  include JobCore

  # Scheduling object built from the 'scheduling' entry of the job's YAML
  # body; built on first use and memoized afterwards.
  def scheduling
    @scheduling ||= Edamame::Scheduling.from_hash(ybody['scheduling'])
  end

  # The 'obj' entry of the job's YAML body.
  def obj
    ybody['obj']
  end

  # Alias for the job's #pri.
  def priority
    pri
  end

  # Name of the tube the job lives in, read from its stats.
  def tube
    stats['tube']
  end

  # Override this for rescheduling
  #
  # Adds the job's 'releases' stat to the scheduling's run count and stamps
  # the scheduling's last_run with the current time.
  def update!
    runs_so_far = scheduling.total_runs.to_i
    scheduling.total_runs = runs_so_far + stats['releases']
    scheduling.last_run   = Time.now
  end

  # Hash representation of the job.
  #
  # flatten -- when true (the default) the nested 'scheduling' and 'obj'
  #            hashes are replaced by their YAML text.
  def to_hash(flatten = true)
    hsh = {
      "tube"       => tube,
      "priority"   => priority,
      "ttr"        => ttr,
      "state"      => state,
      "scheduling" => scheduling.to_hash,
      'key'        => key,
      "obj"        => obj.to_hash,
    }
    return hsh unless flatten
    %w[scheduling obj].each { |field| hsh[field] = hsh[field].to_yaml }
    hsh
  end
end
|
87
|
+
|
88
|
+
# A job described by plain values (tube, priority, ttr, state, scheduling,
# obj) rather than by a live beanstalk connection.
class Job < Struct.new(
    :tube, :priority, :ttr, :state,
    :scheduling, :obj
    )
  # Job.class_eval do
  include JobCore

  # Values filled in for any member left nil by the constructor.
  # NOTE(review): the 'scheduling' default is a single Once instance shared by
  # every job that falls back to it -- confirm nothing mutates it per job.
  DEFAULT_OPTIONS = {
    'priority'   => 65536,
    'ttr'        => 120,
    'state'      => 1,
    'scheduling' => Edamame::Scheduling::Once.new()
  }

  # attr_accessor :runs, :failures, :prev_run_at

  # Builds the job from positional member values, then normalizes them:
  #
  # * nil members take their value from DEFAULT_OPTIONS
  # * priority, ttr and state are coerced with #to_i
  # * scheduling may be a YAML String, a Hash, or an already-built scheduling
  #   object (anything responding to #delay); anything else raises
  # * obj, when a String, is parsed as YAML
  #
  # Previously any scheduling that was neither String nor Hash raised --
  # including the DEFAULT_OPTIONS Once instance, so constructing a job without
  # an explicit scheduling always failed.
  def initialize *args
    super(*args)
    DEFAULT_OPTIONS.each{|key,val| self[key] ||= val }
    [:priority, :ttr, :state].each{|key| self[key] = self[key].to_i }
    case scheduling
    when String
      # NOTE(review): YAML.load on stored job text -- confirm the source is trusted.
      # A String that fails to parse is left in place untouched (best effort).
      scheduling_hash = YAML.load(scheduling) rescue nil
      self.scheduling = Scheduling.from_hash(scheduling_hash) if scheduling_hash
    when Hash
      self.scheduling = Scheduling.from_hash(scheduling)
    else
      # Accept a ready-made scheduling object; reject everything else.
      raise "Can't build a Scheduling from #{scheduling}" unless scheduling.respond_to?(:delay)
    end
    if obj.is_a?(String)
      self.obj = YAML.load(obj) rescue nil
    end
  end

  # Delay before the job should run, as decided by its scheduling.
  def delay
    scheduling.delay
  end

  # Hash representation of the job.
  #
  # flatten -- when true (the default) the nested 'scheduling' and 'obj'
  #            hashes are replaced by their YAML text.
  #
  # NOTE(review): super() relies on a #to_hash defined further up the ancestor
  # chain, which is not visible in this file -- confirm where it comes from.
  def to_hash flatten=true
    hsh = super()
    hsh["scheduling"] = scheduling.to_hash
    hsh["obj"]        = obj.to_hash
    if flatten
      hsh["scheduling"] = hsh['scheduling'].to_yaml
      hsh["obj"]        = hsh['obj'].to_yaml
    end
    hsh
  end
end
|
132
|
+
end
|
133
|
+
|
134
|
+
|
@@ -0,0 +1,132 @@
|
|
1
|
+
module Edamame
|
2
|
+
module Queue
|
3
|
+
#
|
4
|
+
# Persistent job queue for periodic requests.
|
5
|
+
#
|
6
|
+
# Jobs are reserved, run, and if successful put back with an updated delay parameter.
|
7
|
+
#
|
8
|
+
# This is useful for mass scraping of timelines (RSS feeds, twitter search
|
9
|
+
# results, etc.). See http://github.com/mrflip/wuclan
|
10
|
+
#
|
11
|
+
# Wrapper around a Beanstalk::Pool that puts, reserves, releases, buries and
# deletes jobs, and can drain one tube or every tube.
class BeanstalkQueue
  DEFAULT_OPTIONS = {
    :priority     => 65536,    # default job queue priority
    :time_to_run  => 60*5,     # 5 minutes to complete a job or assume dead
    :uris         => ['localhost:11300'],
    :default_tube => 'default',
  }
  attr_accessor :options

  #
  # _options -- overrides for DEFAULT_OPTIONS (:priority, :time_to_run, :uris,
  # :default_tube). The hash is passed through #compact before merging;
  # :default_tube is always stored as a String.
  #
  def initialize _options={}
    self.options = DEFAULT_OPTIONS.deep_merge(_options.compact)
    options[:default_tube] = options[:default_tube].to_s
  end

  #
  # Add a new Job to the queue. priority and delay default to the job's own.
  #
  def put job, priority=nil, delay=nil
    beanstalk.yput(job.to_hash(false),
      (priority || job.priority), (delay || job.delay), job.ttr)
  end

  #
  # Remove the job from the queue.
  #
  def delete(job)
    job.delete
  end

  #
  # Returns the job to the queue, to be re-run later.
  #
  # release'ing a job acknowledges it was completed, successfully or not
  #
  def release job, priority=nil, delay=nil
    job.release( (priority || job.priority), (delay || job.delay) )
  end

  #
  # Take the next (highest priority, delay met) job.
  # Set timeout (default is 10s)
  # Returns nil on error or timeout. Interrupt error passes through
  #
  def reserve timeout=10
    beanstalk.reserve(timeout) || nil
  rescue Beanstalk::TimedOut => e
    warn e.to_s ; sleep 0.4 ; nil
  rescue StandardError => e
    warn e.to_s ; sleep 1 ; nil
  end

  #
  # Shelves the job.
  #
  # job      -- the reserved job to bury. (The method previously took no
  #             argument yet referred to `job`, so every call raised NameError.)
  # priority -- priority to bury it with; defaults to the job's own.
  #
  def bury job, priority=nil
    job.bury(priority || job.priority)
  end

  # The beanstalk pool which acts as job queue. Connects on first use and
  # selects the default tube.
  def beanstalk
    return @beanstalk if @beanstalk
    @beanstalk = Beanstalk::Pool.new(options[:uris], options[:default_tube])
    self.tube = options[:default_tube]
    @beanstalk
  end

  # Close the job queue
  def close
    @beanstalk.close if @beanstalk
    @beanstalk = nil
  end

  # uses and watches the given beanstalk tube
  #
  # Goes through #beanstalk so it also works before the pool is connected, and
  # records the tube so the "was ..." part of the message is meaningful.
  def tube= _tube
    pool = beanstalk
    puts "#{self.class} setting tube to #{_tube}, was #{@tube}"
    pool.use   _tube
    pool.watch _tube
    @tube = _tube
  end

  # Stats on job count across the pool: the pool stats whose name mentions "jobs".
  def stats
    beanstalk.stats.select{|k,v| k =~ /jobs/}
  end

  # Total jobs in the queue, whether reserved, ready, buried or delayed.
  # Stats are fetched once; a missing counter counts as zero.
  def total_jobs
    job_stats = stats
    [:reserved, :ready, :buried, :delayed].inject(0) do |sum, type|
      sum + job_stats["current-jobs-#{type}"].to_i
    end
  end

  #
  # Drains jobs: kicks, reserves, yields each job to the block (if one was
  # given) and deletes it, until nothing is left or a reserve comes back empty.
  #
  # tube -- when given, that tube is used and watched for the duration, and
  #         ignored again afterwards unless it was already being watched.
  #
  # NOTE(review): previously watched tubes stay watched while draining, so
  # jobs may also be reserved from them -- confirm that is intended.
  #
  def empty tube=nil, &block
    tube = tube.to_s if tube
    curr_tube    = beanstalk.list_tube_used.values.first
    curr_watches = beanstalk.list_tubes_watched.values.first
    if tube
      beanstalk.use   tube
      beanstalk.watch tube
    end
    p ["emptying", tube, total_jobs]
    loop do
      beanstalk.open_connections.each{|conxn| conxn.kick(20) }
      break if (total_jobs == 0) || (!beanstalk.peek_ready)
      qjob = reserve(5) or break
      yield qjob if block_given?
      qjob.delete
    end
    beanstalk.use curr_tube
    beanstalk.ignore tube if tube && (! curr_watches.include?(tube))
  end

  # Runs #empty on every tube listed by the pool, forwarding the block.
  def empty_all &block
    tubes = beanstalk.list_tubes.values.flatten.uniq
    tubes.each do |tube|
      empty tube, &block
    end
  end

end # class
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module Edamame
|
2
|
+
# Mixin for re-enqueueing scrape jobs with a delay derived from their item
# rate. The including object is expected to supply job_queue, config,
# request_klass, items_goal, min_resched_delay and max_resched_delay.
module Rescheduled

  # ===========================================================================
  #
  # Rescheduling

  #
  # Finish the qjob and re-insert it at the same priority but with the new
  # delay setting.
  #
  def reschedule qjob, scrape_job
    original_priority = qjob.stats['pri']
    qjob.delete
    save(scrape_job, original_priority)
  end

  #
  # Flattens the scrape_job and enqueues it with a delay appropriate for the
  # average item rate so far. You can explicitly supply a +priority+ to
  # override the priority set at instantiation.
  #
  # This doesn't delete the job -- use reschedule if you are putting back an
  # existing qjob.
  #
  # NOTE(review): four arguments are handed to job_queue.put -- confirm the
  # queue object in use accepts a time-to-run as the fourth.
  #
  def save scrape_job, priority=nil, delay=nil
    flat_body  = scrape_job.to_flat.join("\t")
    delay    ||= delay_to_next_scrape(scrape_job)
    priority ||= config[:priority]
    log(scrape_job, priority, delay)
    job_queue.put(flat_body, priority, delay, config[:time_to_run])
  end

  # delegates to #save() -- priority and delay are unchanged.
  def <<(scrape_job)
    save(scrape_job)
  end

  #
  # Delay, in whole seconds, until the job should be scraped again: the items
  # goal divided by the average rate, kept within the min/max bounds.
  # if we can't determine an actual rate, uses max_resched_delay (assumes it
  # is rare)
  #
  def delay_to_next_scrape scrape_job
    rate = scrape_job.avg_rate
    return max_resched_delay unless rate
    wait = items_goal.to_f / rate
    wait.clamp(min_resched_delay, max_resched_delay).to_i
  end

  #
  # A (very prolix) log statement
  #
  def log scrape_job, priority=nil, delay=nil
    delay ||= delay_to_next_scrape(scrape_job)
    rate_str = scrape_job.avg_rate ? "%10.5f/s" % (scrape_job.avg_rate) : " "*12
    parts = []
    parts << "Rescheduling\t#{"%-23s"%scrape_job.query_term}\t"
    parts << ("%6d" % priority) if priority
    parts << "\t#{rate_str}"
    parts << "\t#{"%7d" % (scrape_job.prev_items||0)}"
    parts << "\t#{"%4d"%(scrape_job.new_items||0)} nu"
    parts << "\tin #{"%8.2f" % delay} s"
    parts << "\t#{(Time.now + delay).strftime("%Y-%m-%d %H:%M:%S")}"
    Monkeyshines.logger.info parts.join
  end

  # ===========================================================================
  #
  # Beanstalkd interface
  #

  #
  # De-serialize the scrape job from the queue: the body is split on tabs and
  # everything after the first field is passed to request_klass.new.
  #
  def scrape_job_from_qjob qjob
    fields = qjob.body.split("\t")
    # request_klass = Wukong.class_from_resource(args.shift)
    request_klass.new(*fields[1..-1])
  end

  # Take the next (highest priority, delay met) job.
  # Set timeout (default is 10s)
  # Returns nil on error or timeout. Interrupt error passes through
  def reserve_job! to=10
    job_queue.reserve(to)
  rescue Beanstalk::TimedOut => e
    Monkeyshines.logger.info e.to_s
    sleep 0.4
    nil
  rescue StandardError => e
    Monkeyshines.logger.warn e.to_s
    sleep 1
    nil
  end

end
|
89
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'wukong/extensions/hashlike_class'
|
2
|
+
module Edamame
|
3
|
+
|
4
|
+
# Scheduling policies for a job. Each policy answers #delay, which both
# Edamame::Job and the queue read to decide when the job runs next.
module Scheduling
  extend FactoryModule

  # def type
  #   self.class.to_s
  # end
  # def to_hash
  # end

  # Common state for every policy: when the job last ran and how many times.
  class Base
    include Wukong::HashlikeClass
    has_members :last_run, :total_runs

    # Assigns positional args to the declared members, in order; nil or
    # missing args leave the member unset.
    def initialize *args
      members.zip(args).each do |key, val|
        self[key] = val if val
      end
    end

    # last_run as a time value; a String last_run is parsed with Time.parse.
    # NOTE(review): Time.parse needs the 'time' standard library to be loaded;
    # this file does not require it itself -- confirm it is loaded elsewhere.
    def last_run_time
      last_run.is_a?(String) ? Time.parse(last_run) : last_run
    end

    # Time elapsed since the last run, or nil when there is no last run yet
    # (previously this raised on a schedule that had never run).
    def since_last
      last = last_run_time or return nil
      Time.now - last
    end

  end

  # Runs repeatedly; the delay is the fixed period.
  class Every < Base
    has_member :period
    def delay
      period
    end
  end

  # Runs at a given time; the delay is the time remaining until then.
  class At < Base
    # Declared as a member (like Every's period) so that Base#initialize
    # actually fills it in from the constructor args. As a plain
    # attr_accessor it was never populated: the String check below could
    # never fire and #delay failed on a nil time.
    has_member :time

    def initialize *args
      super(*args)
      self.time = Time.parse(time) if time.is_a?(String)
    end

    # Computed on the first call and memoized, so later calls return the same
    # value rather than a fresh countdown.
    def delay
      @delay ||= time - Time.now
    end
  end

  # Runs a single time; there is no delay.
  class Once < Base
    def delay
      nil
    end
  end

  # Periodic policy that also carries item-count members
  # (total_items, goal_items, prev_max); the delay is the current period.
  class Rescheduling < Base
    has_members :period, :total_items, :goal_items, :prev_max

    cattr_accessor :min_resched_delay, :max_resched_delay
    self.min_resched_delay = 10
    self.max_resched_delay = 24*60*60
    def delay
      period
    end
  end
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# require 'monkeyshines/utils/factory_module'
|
2
|
+
module Edamame
|
3
|
+
module Store
|
4
|
+
class Base
|
5
|
+
# The actual backing store; should respond to #set and #get methods
|
6
|
+
attr_accessor :db
|
7
|
+
|
8
|
+
def initialize options
|
9
|
+
end
|
10
|
+
|
11
|
+
#
|
12
|
+
# Executes block once for each element in the whole DB, in whatever order
|
13
|
+
# the DB thinks you should see it.
|
14
|
+
#
|
15
|
+
# Your block will see |key, val|
|
16
|
+
#
|
17
|
+
# key_store.each do |key, val|
|
18
|
+
# # ... stuff ...
|
19
|
+
# end
|
20
|
+
#
|
21
|
+
def each &block
|
22
|
+
db.iterinit
|
23
|
+
loop do
|
24
|
+
key = db.iternext or break
|
25
|
+
val = db[key]
|
26
|
+
yield key, val
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def each_as klass, &block
|
31
|
+
self.each do |key, hsh|
|
32
|
+
yield [key, klass.from_hash(hsh)]
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Delegate to store
|
37
|
+
def set(key, val)
|
38
|
+
return unless val
|
39
|
+
db.put key, val.to_hash.compact
|
40
|
+
end
|
41
|
+
def save obj
|
42
|
+
return unless obj
|
43
|
+
db.put obj.key, obj.to_hash.compact
|
44
|
+
end
|
45
|
+
|
46
|
+
def get(key) db[key] end
|
47
|
+
def put(key, val) db.put key, val end
|
48
|
+
def [](key) db[key] end
|
49
|
+
def close() db.close end
|
50
|
+
def size() db.size end
|
51
|
+
def delete(key) db.delete(key) end
|
52
|
+
|
53
|
+
#
|
54
|
+
# Load from standard command-line options
|
55
|
+
#
|
56
|
+
# obvs only works when there's just one store
|
57
|
+
#
|
58
|
+
def self.create type, config
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|