mrflip-edamame 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +8 -0
- data/.gitignore +31 -0
- data/LICENSE.textile +21 -0
- data/README.textile +178 -0
- data/Rakefile +75 -0
- data/VERSION +1 -0
- data/app/edamame_san/config.ru +4 -0
- data/app/edamame_san/config.yml +17 -0
- data/app/edamame_san/config/.gitignore +1 -0
- data/app/edamame_san/edamame_san.rb +71 -0
- data/app/edamame_san/public/favicon.ico +0 -0
- data/app/edamame_san/public/images/edamame_logo.icns +0 -0
- data/app/edamame_san/public/images/edamame_logo.ico +0 -0
- data/app/edamame_san/public/images/edamame_logo.png +0 -0
- data/app/edamame_san/public/images/edamame_logo_2.icns +0 -0
- data/app/edamame_san/public/javascripts/application.js +8 -0
- data/app/edamame_san/public/javascripts/jquery/jquery-ui.js +8694 -0
- data/app/edamame_san/public/javascripts/jquery/jquery.js +4376 -0
- data/app/edamame_san/public/stylesheets/application.css +32 -0
- data/app/edamame_san/public/stylesheets/layout.css +88 -0
- data/app/edamame_san/views/layout.haml +13 -0
- data/app/edamame_san/views/load.haml +37 -0
- data/app/edamame_san/views/root.haml +25 -0
- data/bin/edamame-ps +2 -0
- data/bin/empty_all.rb +15 -0
- data/bin/stats.rb +13 -0
- data/bin/sync.rb +15 -0
- data/bin/test_run.rb +14 -0
- data/edamame.gemspec +110 -0
- data/lib/edamame.rb +193 -0
- data/lib/edamame/job.rb +134 -0
- data/lib/edamame/queue.rb +6 -0
- data/lib/edamame/queue/beanstalk.rb +132 -0
- data/lib/edamame/rescheduled.rb +89 -0
- data/lib/edamame/scheduling.rb +69 -0
- data/lib/edamame/store.rb +8 -0
- data/lib/edamame/store/base.rb +62 -0
- data/lib/edamame/store/tyrant_store.rb +50 -0
- data/lib/methods.txt +94 -0
- data/spec/edamame_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- data/utils/god/README-god.textile +54 -0
- data/utils/god/beanstalkd_god.rb +34 -0
- data/utils/god/edamame.god +30 -0
- data/utils/god/god-etc-init-dot-d-example +40 -0
- data/utils/god/god_email.rb +45 -0
- data/utils/god/god_process.rb +140 -0
- data/utils/god/god_site_config.rb +4 -0
- data/utils/god/sinatra_god.rb +36 -0
- data/utils/god/tyrant_god.rb +67 -0
- data/utils/simulation/Add Percent Variation.vi +0 -0
- data/utils/simulation/Harmonic Average.vi +0 -0
- data/utils/simulation/Rescheduling Simulation.aliases +3 -0
- data/utils/simulation/Rescheduling Simulation.lvlps +3 -0
- data/utils/simulation/Rescheduling Simulation.lvproj +22 -0
- data/utils/simulation/Rescheduling.vi +0 -0
- data/utils/simulation/Weighted Average.vi +0 -0
- metadata +135 -0
data/lib/edamame/job.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
module Edamame
|
2
|
+
|
3
|
+
#
|
4
|
+
#
|
5
|
+
# id, name, body, timeouts, time-left, age, state, delay, pri, ttr
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# * A job, pulled from the queue: it is connected to its beanstalk presence
|
9
|
+
# body contains
|
10
|
+
# ** obj
|
11
|
+
# ** scheduling
|
12
|
+
# ** stats
|
13
|
+
#
|
14
|
+
# * A DB job
|
15
|
+
# body contains
|
16
|
+
# ** tube, priority, ttr, state
|
17
|
+
# ** obj
|
18
|
+
# ** scheduling
|
19
|
+
# ** stats
|
20
|
+
module JobCore

  # Identifier for this job: the tube name and the wrapped object's key
  # (looked up under :key first, then under 'key'), joined with a dash.
  def key
    [ tube, obj[:key] || obj['key'] ].join('-')
  end

  #
  # Seconds elapsed since the job was last run.
  #
  # Delegates to the scheduling object so that the answer agrees with
  # Scheduling::Base#since_last (Time.now minus the last run time, with
  # String timestamps parsed first). The previous inline computation
  # subtracted in the opposite order and so produced a negative number.
  #
  def since_last
    scheduling.since_last
  end

  # Sample Beanstalk::Job stats hash:
  #
  #   { "pri"=>65536, "ttr"=>120,
  #     "releases"=>8, "delay"=>5, "kicks"=>0, "buries"=>0, "id"=>202,
  #     "tube"=>"default", "time-left"=>120,
  #     "timeouts"=>0, "age"=>1415, "state"=>"reserved" }
  #
  # Stat names, grouped:
  #
  #   [ "id",
  #     "tube", "pri", "ttr", "state",
  #     "delay",
  #     "releases", "kicks", "buries",
  #     "time-left", "timeouts", "age", ]
end
|
43
|
+
|
44
|
+
|
45
|
+
Beanstalk::Job.class_eval do
  include JobCore

  # The Scheduling object built from the 'scheduling' entry of the job's
  # ybody. Built on first access and cached on the job afterwards.
  def scheduling
    return @scheduling if @scheduling
    @scheduling = Edamame::Scheduling.from_hash(ybody['scheduling'])
  end

  # The 'obj' entry of the job's ybody.
  def obj
    ybody['obj']
  end

  # Alias for the job's +pri+, so queue jobs and Edamame::Job share a name.
  def priority
    pri
  end

  # Name of the tube, as reported under 'tube' in the job's stats.
  def tube
    stats['tube']
  end

  # Override this for rescheduling
  #
  # Adds the job's 'releases' stat to the scheduling's run count and stamps
  # the scheduling's last run with the current time.
  def update!
    runs_so_far = scheduling.total_runs.to_i
    scheduling.total_runs = runs_so_far + stats['releases']
    scheduling.last_run   = Time.now
  end

  # Hash form of the job: tube, priority, ttr, state, scheduling, key and obj.
  #
  # flatten:: when true (the default) the 'scheduling' and 'obj' entries are
  #           replaced by their YAML dumps; when false they stay as hashes.
  def to_hash flatten=true
    hsh = {
      "tube"       => tube,
      "priority"   => priority,
      "ttr"        => ttr,
      "state"      => state,
      "scheduling" => scheduling.to_hash,
      'key'        => key,
      "obj"        => obj.to_hash,
    }
    return hsh unless flatten
    ["scheduling", "obj"].each do |field|
      hsh[field] = hsh[field].to_yaml
    end
    hsh
  end
end
|
87
|
+
|
88
|
+
#
# A job as held outside the queue (for example in the backing store): a
# plain Struct of tube, priority, ttr, state, scheduling and obj.
#
class Job < Struct.new(
    :tube, :priority, :ttr, :state,
    :scheduling, :obj
    )
  # Job.class_eval do
  include JobCore

  # Values applied to any field left nil at construction. The 'scheduling'
  # entry is kept for compatibility, but #initialize gives each job its own
  # Once instance rather than sharing this one.
  DEFAULT_OPTIONS = {
    'priority'   => 65536,
    'ttr'        => 120,
    'state'      => 1,
    'scheduling' => Edamame::Scheduling::Once.new()
  }

  # attr_accessor :runs, :failures, :prev_run_at

  #
  # Takes the struct fields positionally (tube, priority, ttr, state,
  # scheduling, obj), then normalizes them:
  #
  # * nil fields receive their DEFAULT_OPTIONS value
  # * priority, ttr and state are coerced with to_i
  # * scheduling may be a YAML String, a Hash, or an already-built
  #   Scheduling object; anything else raises a RuntimeError
  # * a String obj is parsed as YAML (and becomes nil if parsing fails)
  #
  def initialize *args
    super(*args)
    # Build a fresh Once per job: scheduling objects are mutable (last_run,
    # total_runs), so sharing the DEFAULT_OPTIONS instance would leak state
    # between jobs.
    self.scheduling ||= Scheduling::Once.new
    DEFAULT_OPTIONS.each{|key,val| self[key] ||= val }
    [:priority, :ttr, :state].each{|key| self[key] = self[key].to_i }
    case self.scheduling
    when String
      # NOTE(review): YAML.load on data that may originate outside the
      # process; consider YAML.safe_load if the body is not trusted.
      # An unparseable string is left in place, as before.
      scheduling_hash = YAML.load(self.scheduling) rescue nil
      self.scheduling = Scheduling.from_hash(scheduling_hash) if scheduling_hash
    when Hash
      self.scheduling = Scheduling.from_hash(scheduling)
    when Scheduling::Base
      # Already a Scheduling object (including the default Once assigned
      # above). Without this branch every such job hit the raise below.
    else
      raise "Can't build a Scheduling from #{self.scheduling}"
    end
    if self.obj.is_a?(String)
      # NOTE(review): same YAML.load caveat as for scheduling above.
      self.obj = (YAML.load(self.obj) rescue nil)
    end
  end

  # Delay before the job should run, as decided by its scheduling.
  def delay
    scheduling.delay
  end

  #
  # Hash form of the job, with 'scheduling' and 'obj' as hashes.
  #
  # flatten:: when true (the default) those two entries are replaced by
  #           their YAML dumps.
  #
  # NOTE(review): relies on super() reaching a Struct#to_hash, which is
  # presumably supplied by an extension loaded elsewhere -- confirm.
  #
  def to_hash flatten=true
    hsh = super()
    hsh["scheduling"] = scheduling.to_hash
    hsh["obj"]        = obj.to_hash
    if flatten
      hsh["scheduling"] = hsh['scheduling'].to_yaml
      hsh["obj"]        = hsh['obj'].to_yaml
    end
    hsh
  end
end
|
132
|
+
end
|
133
|
+
|
134
|
+
|
@@ -0,0 +1,132 @@
|
|
1
|
+
module Edamame
|
2
|
+
module Queue
|
3
|
+
#
|
4
|
+
# Persistent job queue for periodic requests.
|
5
|
+
#
|
6
|
+
# Jobs are reserved, run, and if successful put back with an updated delay parameter.
|
7
|
+
#
|
8
|
+
# This is useful for mass scraping of timelines (RSS feeds, twitter search
|
9
|
+
# results, etc. See http://github.com/mrflip/wuclan for )
|
10
|
+
#
|
11
|
+
class BeanstalkQueue
  DEFAULT_OPTIONS = {
    :priority     => 65536,               # default job queue priority
    :time_to_run  => 60*5,                # 5 minutes to complete a job or assume dead
    :uris         => ['localhost:11300'],
    :default_tube => 'default',
  }
  attr_accessor :options

  #
  # _options is merged over DEFAULT_OPTIONS (nil-valued entries are dropped
  # first, so they never mask a default). :default_tube is stored as a String.
  #
  # Leave :uris unset to use the default single-node ['localhost:11300'] pool.
  #
  def initialize _options={}
    self.options = DEFAULT_OPTIONS.deep_merge(_options.compact)
    options[:default_tube] = options[:default_tube].to_s
  end

  #
  # Add a new Job to the queue
  #
  # The job is enqueued as job.to_hash(false) using the job's own ttr.
  # priority and delay fall back to the job's values when not given.
  #
  def put job, priority=nil, delay=nil
    beanstalk.yput(job.to_hash(false),
      (priority || job.priority), (delay || job.delay), job.ttr)
  end

  #
  # Remove the job from the queue.
  #
  def delete(job)
    job.delete
  end

  #
  # Returns the job to the queue, to be re-run later.
  #
  # release'ing a job acknowledges it was completed, successfully or not
  #
  def release job, priority=nil, delay=nil
    job.release( (priority || job.priority), (delay || job.delay) )
  end

  #
  # Take the next (highest priority, delay met) job.
  # Set timeout (default is 10s)
  # Returns nil on error or timeout. Interrupt error passes through
  #
  def reserve timeout=10
    begin
      job = beanstalk.reserve(timeout) or return
    rescue Beanstalk::TimedOut => e ; warn e.to_s ; sleep 0.4 ; return ;
    rescue StandardError       => e ; warn e.to_s ; sleep 1   ; return ; end
    job
  end

  #
  # Shelves the job, keeping its current priority.
  #
  # The job has to be passed in: the earlier zero-argument form referred to
  # a +job+ that was never defined and so always raised NameError.
  #
  def bury job
    job.bury job.priority
  end

  # The beanstalk pool which acts as job queue. Connects on first use and
  # then uses + watches the default tube.
  def beanstalk
    return @beanstalk if @beanstalk
    @beanstalk = Beanstalk::Pool.new(options[:uris], options[:default_tube])
    self.tube= options[:default_tube]
    @beanstalk
  end

  # Close the job queue
  def close
    @beanstalk.close if @beanstalk
    @beanstalk = nil
  end

  # uses and watches the given beanstalk tube
  #
  # Goes through #beanstalk so it also works before the first connection,
  # and remembers the tube so the "was ..." part of the message is accurate.
  def tube= _tube
    puts "#{self.class} setting tube to #{_tube}, was #{@tube}"
    beanstalk.use   _tube
    beanstalk.watch _tube
    @tube = _tube
  end

  # Stats on job count across the pool
  def stats
    beanstalk.stats.select{|k,v| k =~ /jobs/}
  end

  # Total jobs in the queue, whether reserved, ready, buried or delayed.
  #
  # Reads the pool stats once (one round trip rather than four) and indexes
  # the raw stats hash, because Hash#select returns an Array of pairs on
  # Ruby 1.8. A missing counter counts as zero.
  def total_jobs
    pool_stats = beanstalk.stats
    %w[reserved ready buried delayed].inject(0) do |sum, type|
      sum + pool_stats["current-jobs-#{type}"].to_i
    end
  end

  #
  # Drains a tube: kicks buried/delayed jobs, then reserves each ready job,
  # yields it to the block (if one was given) and deletes it.
  #
  # tube:: tube to drain; nil drains whatever is currently used/watched.
  #
  # The tube in use beforehand is restored afterwards -- even if the block
  # raises -- and +tube+ is ignored again unless it was already watched.
  #
  def empty tube=nil, &block
    tube = tube.to_s if tube
    curr_tube    = beanstalk.list_tube_used.values.first
    curr_watches = beanstalk.list_tubes_watched.values.first
    beanstalk.use   tube if tube
    beanstalk.watch tube if tube
    p ["emptying", tube, total_jobs]
    begin
      loop do
        beanstalk.open_connections.map{|conxn| conxn.kick(20) }
        break if (total_jobs == 0) || (!beanstalk.peek_ready)
        qjob = reserve(5) or break
        yield qjob if block_given?
        qjob.delete
      end
    ensure
      beanstalk.use curr_tube
      # Nothing to ignore when no explicit tube was requested.
      beanstalk.ignore tube if tube && (! curr_watches.include?(tube))
    end
  end

  # Runs #empty, with the same block, on every tube known to the pool.
  def empty_all &block
    tubes = beanstalk.list_tubes.values.flatten.uniq
    tubes.each do |tube|
      empty tube, &block
    end
  end

end # class
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module Edamame
|
2
|
+
module Rescheduled

  # ===========================================================================
  #
  # Rescheduling

  #
  # Retires the queue job and enqueues the scrape job again, carrying over
  # the priority reported in the queue job's stats; the delay is recomputed.
  #
  def reschedule qjob, scrape_job
    original_priority = qjob.stats['pri']
    qjob.delete
    save(scrape_job, original_priority)
  end

  #
  # Serializes the scrape_job (its to_flat fields, tab-joined) and puts it
  # on the job queue. A missing +delay+ is derived from the job's average
  # rate; a missing +priority+ comes from config[:priority].
  #
  # Nothing is deleted here -- call reschedule to put back an existing qjob.
  #
  def save scrape_job, priority=nil, delay=nil
    payload  = scrape_job.to_flat.join("\t")
    delay    = delay_to_next_scrape(scrape_job) unless delay
    priority = config[:priority]                unless priority
    log(scrape_job, priority, delay)
    job_queue.put(payload, priority, delay, config[:time_to_run])
  end

  # Shorthand for #save with the default priority and delay.
  def <<(scrape_job)
    save(scrape_job)
  end

  #
  # Whole seconds to wait before the next scrape: items_goal divided by the
  # job's average rate, held between min_resched_delay and max_resched_delay.
  # When the job has no average rate, max_resched_delay is returned as is.
  #
  def delay_to_next_scrape scrape_job
    rate = scrape_job.avg_rate
    return max_resched_delay unless rate
    wait = items_goal.to_f / rate
    wait.clamp(min_resched_delay, max_resched_delay).to_i
  end

  #
  # Writes one (long) tab-separated info line describing the rescheduling:
  # query term, priority (when given), average rate, previous and new item
  # counts, the delay, and the wall-clock time the delay ends.
  #
  def log scrape_job, priority=nil, delay=nil
    delay = delay_to_next_scrape(scrape_job) unless delay
    rate_str =
      if scrape_job.avg_rate then "%10.5f/s" % (scrape_job.avg_rate)
      else                        " "*12
      end
    pieces = ["Rescheduling\t", "%-23s" % scrape_job.query_term, "\t"]
    pieces << ("%6d" % priority) if priority
    pieces << "\t" << rate_str
    pieces << "\t" << ("%7d" % (scrape_job.prev_items||0))
    pieces << "\t" << ("%4d" % (scrape_job.new_items||0)) << " nu"
    pieces << "\tin " << ("%8.2f" % delay) << " s"
    pieces << "\t" << (Time.now + delay).strftime("%Y-%m-%d %H:%M:%S")
    Monkeyshines.logger.info pieces.join
  end

  # ===========================================================================
  #
  # Beanstalkd interface
  #

  #
  # Rebuilds a scrape job from a queue job: the body is split on tabs, the
  # first field is skipped, and the rest are passed to request_klass.new.
  #
  def scrape_job_from_qjob qjob
    fields = qjob.body.split("\t")
    # request_klass = Wukong.class_from_resource(fields.shift)
    request_klass.new(*fields[1..-1])
  end

  # Reserves the next job from job_queue, waiting up to +to+ seconds
  # (default 10). A Beanstalk::TimedOut is logged at info level, any other
  # StandardError at warn level; both pause briefly and return nil.
  def reserve_job! to=10
    job_queue.reserve(to)
  rescue Beanstalk::TimedOut => e
    Monkeyshines.logger.info e.to_s
    sleep 0.4
    nil
  rescue StandardError => e
    Monkeyshines.logger.warn e.to_s
    sleep 1
    nil
  end

end
|
89
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'wukong/extensions/hashlike_class'
|
2
|
+
module Edamame
|
3
|
+
|
4
|
+
module Scheduling
  extend FactoryModule

  # def type
  #   self.class.to_s
  # end
  # def to_hash
  # end

  #
  # Common state for every scheduling policy: when the job last ran and how
  # many times it has run. Subclasses supply #delay.
  #
  class Base
    include Wukong::HashlikeClass
    has_members :last_run, :total_runs

    # Assigns positional args to the members, in member order. nil (or
    # missing) args leave the member unset.
    def initialize *args
      members.zip(args).each do |key, val|
        self[key] = val if val
      end
    end

    # last_run as stored, except that a String value is parsed into a Time.
    # NOTE(review): Time.parse lives in the stdlib 'time' library; this file
    # does not require it itself -- presumably loaded elsewhere, confirm.
    def last_run_time
      last_run.is_a?(String) ? Time.parse(last_run) : last_run
    end

    # Seconds elapsed since the last run, or nil if there is no last run
    # recorded (previously that case raised TypeError).
    def since_last
      ran_at = last_run_time
      ran_at && (Time.now - ran_at)
    end

  end

  # Runs repeatedly; the delay between runs is the fixed +period+.
  class Every < Base
    has_member :period
    def delay
      period
    end
  end

  #
  # Runs at a given +time+.
  #
  # NOTE(review): +time+ is a plain attr_accessor rather than a has_member,
  # so Base#initialize (which walks +members+) never assigns it from the
  # constructor args -- check whether has_member :time was intended.
  #
  class At < Base
    attr_accessor :time
    def initialize *args
      super(*args)
      self.time = Time.parse(time) if time.is_a?(String)
    end

    # Seconds from now until +time+; never negative (a time already past
    # means "run now"), and nil when no time has been set.
    #
    # Computed on every call: caching the first answer would hand back a
    # stale delay when asked again later.
    def delay
      return nil unless time
      [time - Time.now, 0].max
    end
  end

  # Runs a single time; no delay.
  class Once < Base
    def delay
      nil
    end
  end

  # Periodic run whose +period+ is meant to be adjusted between runs; also
  # tracks item counts and class-wide bounds for the rescheduling delay.
  class Rescheduling < Base
    has_members :period, :total_items, :goal_items, :prev_max

    cattr_accessor :min_resched_delay, :max_resched_delay
    self.min_resched_delay = 10          # seconds
    self.max_resched_delay = 24*60*60    # one day, in seconds

    def delay
      period
    end
  end
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# require 'monkeyshines/utils/factory_module'
|
2
|
+
module Edamame
|
3
|
+
module Store
|
4
|
+
class Base
  # The actual backing store; should respond to #set and #get methods
  attr_accessor :db

  # Accepts an options argument and does nothing with it; #db is left unset.
  def initialize options
  end

  #
  # Walks the entire DB, in whatever order the DB iterates, yielding each
  # key together with the value stored under it:
  #
  #   key_store.each do |key, val|
  #     # ... stuff ...
  #   end
  #
  def each &block
    db.iterinit
    while (key = db.iternext)
      yield key, db[key]
    end
  end

  # Like #each, but the stored value is first turned into an object with
  # klass.from_hash. The block receives one [key, object] pair.
  def each_as klass, &block
    each { |key, hsh| yield [key, klass.from_hash(hsh)] }
  end

  # Stores val under key as its compacted hash form. A nil/false val is
  # skipped and nil is returned.
  def set(key, val)
    db.put(key, val.to_hash.compact) if val
  end

  # Stores obj under its own #key as its compacted hash form. A nil/false
  # obj is skipped and nil is returned.
  def save obj
    db.put(obj.key, obj.to_hash.compact) if obj
  end

  # Straight pass-throughs to the backing store.

  def get(key)
    db[key]
  end

  def put(key, val)
    db.put key, val
  end

  def [](key)
    db[key]
  end

  def close()
    db.close
  end

  def size()
    db.size
  end

  def delete(key)
    db.delete(key)
  end

  #
  # Load from standard command-line options
  #
  # obvs only works when there's just one store
  #
  # (Empty here: returns nil.)
  #
  def self.create type, config
  end
end
|
61
|
+
end
|
62
|
+
end
|