mrflip-edamame 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. data/.document +8 -0
  2. data/.gitignore +31 -0
  3. data/LICENSE.textile +21 -0
  4. data/README.textile +178 -0
  5. data/Rakefile +75 -0
  6. data/VERSION +1 -0
  7. data/app/edamame_san/config.ru +4 -0
  8. data/app/edamame_san/config.yml +17 -0
  9. data/app/edamame_san/config/.gitignore +1 -0
  10. data/app/edamame_san/edamame_san.rb +71 -0
  11. data/app/edamame_san/public/favicon.ico +0 -0
  12. data/app/edamame_san/public/images/edamame_logo.icns +0 -0
  13. data/app/edamame_san/public/images/edamame_logo.ico +0 -0
  14. data/app/edamame_san/public/images/edamame_logo.png +0 -0
  15. data/app/edamame_san/public/images/edamame_logo_2.icns +0 -0
  16. data/app/edamame_san/public/javascripts/application.js +8 -0
  17. data/app/edamame_san/public/javascripts/jquery/jquery-ui.js +8694 -0
  18. data/app/edamame_san/public/javascripts/jquery/jquery.js +4376 -0
  19. data/app/edamame_san/public/stylesheets/application.css +32 -0
  20. data/app/edamame_san/public/stylesheets/layout.css +88 -0
  21. data/app/edamame_san/views/layout.haml +13 -0
  22. data/app/edamame_san/views/load.haml +37 -0
  23. data/app/edamame_san/views/root.haml +25 -0
  24. data/bin/edamame-ps +2 -0
  25. data/bin/empty_all.rb +15 -0
  26. data/bin/stats.rb +13 -0
  27. data/bin/sync.rb +15 -0
  28. data/bin/test_run.rb +14 -0
  29. data/edamame.gemspec +110 -0
  30. data/lib/edamame.rb +193 -0
  31. data/lib/edamame/job.rb +134 -0
  32. data/lib/edamame/queue.rb +6 -0
  33. data/lib/edamame/queue/beanstalk.rb +132 -0
  34. data/lib/edamame/rescheduled.rb +89 -0
  35. data/lib/edamame/scheduling.rb +69 -0
  36. data/lib/edamame/store.rb +8 -0
  37. data/lib/edamame/store/base.rb +62 -0
  38. data/lib/edamame/store/tyrant_store.rb +50 -0
  39. data/lib/methods.txt +94 -0
  40. data/spec/edamame_spec.rb +7 -0
  41. data/spec/spec_helper.rb +9 -0
  42. data/utils/god/README-god.textile +54 -0
  43. data/utils/god/beanstalkd_god.rb +34 -0
  44. data/utils/god/edamame.god +30 -0
  45. data/utils/god/god-etc-init-dot-d-example +40 -0
  46. data/utils/god/god_email.rb +45 -0
  47. data/utils/god/god_process.rb +140 -0
  48. data/utils/god/god_site_config.rb +4 -0
  49. data/utils/god/sinatra_god.rb +36 -0
  50. data/utils/god/tyrant_god.rb +67 -0
  51. data/utils/simulation/Add Percent Variation.vi +0 -0
  52. data/utils/simulation/Harmonic Average.vi +0 -0
  53. data/utils/simulation/Rescheduling Simulation.aliases +3 -0
  54. data/utils/simulation/Rescheduling Simulation.lvlps +3 -0
  55. data/utils/simulation/Rescheduling Simulation.lvproj +22 -0
  56. data/utils/simulation/Rescheduling.vi +0 -0
  57. data/utils/simulation/Weighted Average.vi +0 -0
  58. metadata +135 -0
@@ -0,0 +1,134 @@
1
+ module Edamame
2
+
3
+ #
4
+ #
5
+ # id, name, body, timeouts, time-left, age, state, delay, pri, ttr
6
+ #
7
+ #
8
+ # * A job, pulled from the queue: it is connected to its beanstalk presence
9
+ # body contains
10
+ # ** obj
11
+ # ** scheduling
12
+ # ** stats
13
+ #
14
+ # * A DB job
15
+ # body contains
16
+ # ** tube, priority, ttr, state
17
+ # ** obj
18
+ # ** scheduling
19
+ # ** stats
20
#
# Behaviour shared by both kinds of job: the live Beanstalk::Job pulled from
# the queue and the Edamame::Job record.
#
# The including class must respond to +tube+, +obj+ and +scheduling+.
#
module JobCore

  # Identifier for the job: the tube name and the obj's key joined by '-'.
  # The obj's key is looked up under :key first, then under 'key'.
  def key
    [ tube, obj[:key] || obj['key'] ].join('-')
  end

  # Seconds elapsed since the scheduling's last_run.
  #
  # Computed as (now - last_run) so that a run in the past gives a positive
  # number, the same sign convention as Edamame::Scheduling::Base#since_last.
  #
  # assumes scheduling.last_run is a Time here -- TODO confirm no caller
  # leaves it as a String
  def since_last
    Time.now - scheduling.last_run
  end

  # Beanstalk::Job stats:
  #   { "pri"=>65536, "ttr"=>120,
  #     "releases"=>8, "delay"=>5, "kicks"=>0, "buries"=>0, "id"=>202,
  #     "tube"=>"default", "time-left"=>120,
  #     "timeouts"=>0, "age"=>1415, "state"=>"reserved"}
  #
  #   [ "id",
  #     "tube", "pri", "ttr", "state",
  #     "delay",
  #     "releases", "kicks", "buries",
  #     "time-left", "timeouts", "age", ]
end
43
+
44
+
45
#
# Reopen the beanstalk client's job class so that a reserved job answers the
# same questions (tube, priority, obj, scheduling, to_hash) as an Edamame::Job.
# The job body is read through +ybody+ and is expected to carry 'obj' and
# 'scheduling' entries.
#
Beanstalk::Job.class_eval do
  include JobCore

  # The Scheduling built from the body's 'scheduling' entry; built once and
  # then cached on the job.
  def scheduling
    @scheduling ||= Edamame::Scheduling.from_hash(ybody['scheduling'])
  end

  # The payload carried in the job body.
  def obj
    ybody['obj']
  end

  # Long-named reader for the job's +pri+.
  def priority
    pri
  end

  # Name of the tube, as reported by the job's stats.
  def tube
    stats['tube']
  end

  # Fold this run into the scheduling record: add the job's 'releases' stat
  # to total_runs and stamp last_run with the current time.
  #
  # Override this for rescheduling
  def update!
    sched = scheduling
    sched.total_runs = sched.total_runs.to_i + stats['releases']
    sched.last_run   = Time.now
  end

  # Hash form of the job.
  #
  # flatten -- when true (the default) the 'scheduling' and 'obj' entries are
  #            serialized to YAML strings; when false they stay as hashes.
  def to_hash(flatten = true)
    fields = {
      "tube"       => tube,
      "priority"   => priority,
      "ttr"        => ttr,
      "state"      => state,
      "scheduling" => scheduling.to_hash,
      'key'        => key,
      "obj"        => obj.to_hash,
    }
    if flatten
      ["scheduling", "obj"].each { |name| fields[name] = fields[name].to_yaml }
    end
    fields
  end
end
87
+
88
#
# A job record held as a plain Struct:
# tube, priority, ttr, state, scheduling and obj.
#
# +scheduling+ may be given as a Scheduling object, a Hash, or a YAML string;
# +obj+ may be given as a YAML string. Both are converted in #initialize.
#
class Job < Struct.new(
    :tube, :priority, :ttr, :state,
    :scheduling, :obj
    )
  # Job.class_eval do
  include JobCore

  # Values used for any member left nil (or false) by the caller.
  DEFAULT_OPTIONS = {
    'priority'   => 65536,
    'ttr'        => 120,
    'state'      => 1,
    'scheduling' => Edamame::Scheduling::Once.new()
  }

  # attr_accessor :runs, :failures, :prev_run_at

  # Accepts the struct members positionally, then:
  # * fills unset members from DEFAULT_OPTIONS,
  # * coerces priority, ttr and state with to_i,
  # * turns a String (YAML) or Hash scheduling into a Scheduling object,
  # * parses a String obj as YAML.
  #
  # Raises RuntimeError when scheduling is none of: Scheduling object, Hash,
  # String.
  def initialize *args
    super(*args)
    DEFAULT_OPTIONS.each do |attr, default|
      next if self[attr]
      # The default scheduling is one mutable object held in a constant; hand
      # each job its own copy so that updating one job's scheduling cannot
      # leak into every other job that took the default.
      self[attr] = (attr == 'scheduling') ? default.dup : default
    end
    [:priority, :ttr, :state].each{|attr| self[attr] = self[attr].to_i }
    # NOTE(review): YAML.load is applied to strings that come from outside this
    # class -- confirm they are trusted.
    case self.scheduling
    when Scheduling::Base
      # Already a Scheduling (e.g. the default above): keep it as it is.
      # Without this branch every job built with the default scheduling fell
      # through to the raise below.
    when String
      # Best effort: a string that does not parse leaves scheduling untouched.
      scheduling_hash = YAML.load(self.scheduling) rescue nil
      self.scheduling = Scheduling.from_hash(scheduling_hash) if scheduling_hash
    when Hash
      self.scheduling = Scheduling.from_hash(scheduling)
    else raise "Can't build a Scheduling from #{self.scheduling}" ; end
    if self.obj.is_a?(String) then self.obj = YAML.load(self.obj) rescue nil ; end
  end

  # Delay before the job should run, as decided by its scheduling.
  def delay
    scheduling.delay
  end

  # Hash form of the job.
  #
  # flatten -- when true (the default) the 'scheduling' and 'obj' entries are
  #            serialized to YAML strings; when false they stay as hashes.
  #
  # NOTE(review): this starts from super(), i.e. a to_hash supplied further up
  # the ancestry; core Struct only defines to_h, so presumably an extension
  # library provides it -- confirm.
  def to_hash flatten=true
    hsh = super()
    hsh["scheduling"] = scheduling.to_hash
    hsh["obj"]        = obj.to_hash
    if flatten
      hsh["scheduling"] = hsh['scheduling'].to_yaml
      hsh["obj"]        = hsh['obj'].to_yaml
    end
    hsh
  end
end
132
+ end
133
+
134
+
@@ -0,0 +1,6 @@
1
+ module Edamame
2
#
# Namespace for the job-queue backends.
#
module Queue
  extend FactoryModule

  # The beanstalk-backed queue is loaded on first reference to the constant.
  autoload(:BeanstalkQueue, 'edamame/queue/beanstalk')
end
6
+ end
@@ -0,0 +1,132 @@
1
+ module Edamame
2
+ module Queue
3
+ #
4
+ # Persistent job queue for periodic requests.
5
+ #
6
+ # Jobs are reserved, run, and if successful put back with an updated delay parameter.
7
+ #
8
+ # This is useful for mass scraping of timelines (RSS feeds, twitter search
9
+ # results, etc.). See http://github.com/mrflip/wuclan
10
+ #
11
#
# Job queue backed by a Beanstalk::Pool.
#
# The pool is opened lazily on first use (see #beanstalk) and both uses and
# watches the configured :default_tube.
#
class BeanstalkQueue
  DEFAULT_OPTIONS = {
    :priority     => 65536,                 # default job queue priority
    :time_to_run  => 60*5,                  # 5 minutes to complete a job or assume dead
    :uris         => ['localhost:11300'],
    :default_tube => 'default',
  }
  attr_accessor :options

  #
  # _options -- overrides for DEFAULT_OPTIONS. Entries whose value is nil are
  # dropped before merging, so passing nil for an option selects its default
  # (e.g. the single-node ['localhost:11300'] pool for :uris).
  #
  def initialize _options={}
    self.options = DEFAULT_OPTIONS.deep_merge(_options.compact)
    options[:default_tube] = options[:default_tube].to_s
  end

  #
  # Add a new Job to the queue.
  #
  # The job is serialized from job.to_hash(false); priority and delay fall
  # back to the job's own values when not given.
  #
  def put job, priority=nil, delay=nil
    beanstalk.yput(job.to_hash(false),
      (priority || job.priority), (delay || job.delay), job.ttr)
  end

  #
  # Remove the job from the queue.
  #
  def delete(job)
    job.delete
  end

  #
  # Returns the job to the queue, to be re-run later.
  #
  # release'ing a job acknowledges it was completed, successfully or not
  #
  def release job, priority=nil, delay=nil
    job.release( (priority || job.priority), (delay || job.delay) )
  end

  #
  # Take the next (highest priority, delay met) job.
  # Set timeout (default is 10s)
  # Returns nil on error or timeout (after a short sleep). Interrupt error
  # passes through, since only StandardError is rescued.
  #
  def reserve timeout=10
    beanstalk.reserve(timeout) || nil
  rescue Beanstalk::TimedOut => e
    warn e.to_s
    sleep 0.4
    nil
  rescue StandardError => e
    warn e.to_s
    sleep 1
    nil
  end

  #
  # Shelves the job, keeping its current priority.
  #
  # The job has to be passed in (as for #delete and #release); the method
  # previously took no argument and so referred to an undefined +job+.
  #
  def bury job
    job.bury job.priority
  end

  # The beanstalk pool which acts as job queue. Opened on first call.
  def beanstalk
    return @beanstalk if @beanstalk
    @beanstalk = Beanstalk::Pool.new(options[:uris], options[:default_tube])
    self.tube = options[:default_tube]
    @beanstalk
  end

  # Close the job queue
  def close
    @beanstalk.close if @beanstalk
    @beanstalk = nil
  end

  # uses and watches the given beanstalk tube
  #
  # Goes through #beanstalk so it also works before the pool has been opened,
  # and records the tube so the "was ..." part of the message is meaningful.
  def tube= _tube
    puts "#{self.class} setting tube to #{_tube}, was #{@tube}"
    beanstalk.use   _tube
    beanstalk.watch _tube
    @tube = _tube
  end

  # Stats on job count across the pool: the pool stats whose name mentions
  # "jobs".
  #
  # reject (rather than select) is used so the result is a Hash on every Ruby
  # version; Ruby 1.8's Hash#select returns an array of pairs.
  def stats
    beanstalk.stats.reject{|k,v| k !~ /jobs/ }
  end

  # Total jobs in the queue, whether reserved, ready, buried or delayed.
  def total_jobs
    counts = stats  # fetch once instead of once per state
    [:reserved, :ready, :buried, :delayed].inject(0) do |sum, type|
      sum + counts["current-jobs-#{type}"].to_i
    end
  end

  #
  # Drain ready jobs: each reserved job is yielded to the block (if one is
  # given) and then deleted.
  #
  # tube -- when given, that tube is used and watched for the duration; the
  #         previously used tube is restored afterwards and +tube+ is ignored
  #         again unless it was already being watched.
  #
  # Buried jobs are kicked (20 per connection per pass) so they get drained
  # too. Stops when nothing is ready or a reserve comes back empty.
  #
  def empty tube=nil, &block
    tube         = tube.to_s if tube
    curr_tube    = beanstalk.list_tube_used.values.first
    curr_watches = beanstalk.list_tubes_watched.values.first
    if tube
      beanstalk.use   tube
      beanstalk.watch tube
    end
    p ["emptying", tube, total_jobs]
    loop do
      beanstalk.open_connections.each{|conxn| conxn.kick(20) }
      break if (total_jobs == 0) || (! beanstalk.peek_ready)
      qjob = reserve(5) or break
      yield qjob if block_given?
      qjob.delete
    end
    beanstalk.use curr_tube
    # only un-watch a tube we were actually asked to switch to
    beanstalk.ignore tube if tube && (! curr_watches.include?(tube))
  end

  # Runs #empty on every tube the pool knows about.
  def empty_all &block
    tubes = beanstalk.list_tubes.values.flatten.uniq
    tubes.each do |tube|
      empty tube, &block
    end
  end

end # class
130
+ end
131
+ end
132
+
@@ -0,0 +1,89 @@
1
+ module Edamame
2
#
# Mixin for re-enqueueing scrape jobs with a delay derived from how fast
# items have been arriving.
#
# The including object must supply: job_queue, config, items_goal,
# min_resched_delay, max_resched_delay and request_klass.
#
module Rescheduled

  # ===========================================================================
  #
  # Rescheduling

  #
  # Finish the qjob and re-insert it at the same priority but with the new
  # delay setting.
  #
  def reschedule qjob, scrape_job
    priority = qjob.stats['pri']
    qjob.delete
    self.save scrape_job, priority
  end

  #
  # Flattens the scrape_job and enqueues it with a delay appropriate for the
  # average item rate so far. You can explicitly supply a +priority+ to
  # override the priority set at instantiation.
  #
  # This doesn't delete the job -- use reschedule if you are putting back an
  # existing qjob.
  #
  def save scrape_job, priority=nil, delay=nil
    body       = scrape_job.to_flat.join("\t")
    delay    ||= delay_to_next_scrape(scrape_job)
    priority ||= config[:priority]
    log scrape_job, priority, delay
    job_queue.put body, priority, delay, config[:time_to_run]
  end

  # delegates to #save() -- priority and delay take their defaults.
  def <<(scrape_job) save scrape_job end

  #
  # Seconds to wait before the next scrape: items_goal divided by the job's
  # average rate, kept within [min_resched_delay, max_resched_delay] and
  # truncated to an integer.
  #
  # if we can't determine an actual rate, uses max_resched_delay (assumes it
  # is rare). A rate of zero counts as "can't determine": dividing by it
  # would otherwise give NaN when items_goal is also zero, and NaN cannot be
  # clamped.
  #
  def delay_to_next_scrape scrape_job
    rate = scrape_job.avg_rate
    return max_resched_delay if (! rate) || rate.zero?
    delay = items_goal.to_f / rate
    delay = delay.clamp(min_resched_delay, max_resched_delay)
    delay.to_i
  end

  #
  # A (very prolix) log statement
  #
  def log scrape_job, priority=nil, delay=nil
    delay  ||= delay_to_next_scrape(scrape_job)
    rate_str = scrape_job.avg_rate ? "%10.5f/s" % (scrape_job.avg_rate) : " "*12
    ll = "Rescheduling\t#{"%-23s"%scrape_job.query_term}\t"
    ll << "%6d" % priority if priority
    ll << "\t#{rate_str}"
    ll << "\t#{"%7d" % (scrape_job.prev_items||0)}"
    ll << "\t#{"%4d"%(scrape_job.new_items||0)} nu"
    ll << "\tin #{"%8.2f" % delay} s"
    ll << "\t#{(Time.now + delay).strftime("%Y-%m-%d %H:%M:%S")}"
    Monkeyshines.logger.info ll
  end

  # ===========================================================================
  #
  # Beanstalkd interface
  #

  #
  # De-serialize the scrape job from the queue.
  #
  # The body is tab-separated; the first field is skipped and the rest are
  # passed to request_klass.new.
  #
  def scrape_job_from_qjob qjob
    args = qjob.body.split("\t")
    # request_klass = Wukong.class_from_resource(args.shift)
    request_klass.new(*args[1..-1])
  end

  # Take the next (highest priority, delay met) job.
  # Set timeout (default is 10s)
  # Returns nil on error or timeout (after a short sleep). Interrupt error
  # passes through, since only StandardError is rescued.
  def reserve_job! to=10
    job_queue.reserve(to)
  rescue Beanstalk::TimedOut => e
    Monkeyshines.logger.info e.to_s
    sleep 0.4
    nil
  rescue StandardError => e
    Monkeyshines.logger.warn e.to_s
    sleep 1
    nil
  end

end
89
+ end
@@ -0,0 +1,69 @@
1
+ require 'wukong/extensions/hashlike_class'
2
+ module Edamame
3
+
4
#
# Scheduling policies for a job. Every policy answers #delay; what that value
# is depends on the subclass.
#
module Scheduling
  extend FactoryModule

  # def type
  #   self.class.to_s
  # end
  # def to_hash
  # end

  #
  # Common state for all policies: when the job last ran and how many times.
  #
  class Base
    include Wukong::HashlikeClass
    has_members :last_run, :total_runs

    # Positional constructor: arguments are paired with the class's members
    # in order; nil (or false) arguments are skipped.
    def initialize *args
      members.zip(args).each do |key, val|
        self[key] = val if val
      end
    end

    # last_run as a Time; a String value is parsed with Time.parse.
    def last_run_time
      last_run.is_a?(String) ? Time.parse(last_run) : last_run
    end

    # Seconds elapsed since last_run_time.
    def since_last
      Time.now - last_run_time
    end

  end

  # Run repeatedly; the delay is the stored +period+.
  class Every < Base
    has_member :period
    def delay
      period
    end
  end

  # Run at a given time; the delay is the distance from now to +time+.
  class At < Base
    # Declared with has_member (as +period+ is in Every) rather than a bare
    # attr_accessor: Base#initialize only assigns members, so otherwise +time+
    # could never be set through the constructor and #delay would fail on nil.
    has_member :time

    # A String +time+ is parsed with Time.parse.
    def initialize *args
      super(*args)
      self.time = Time.parse(time) if time.is_a?(String)
    end

    # NOTE(review): memoized, so later calls return the delay as computed on
    # the first call rather than a fresh (time - now).
    def delay
      @delay ||= time - Time.now
    end
  end

  # Run a single time: there is no delay to report.
  class Once < Base
    def delay
      nil
    end
  end

  # Periodic scheduling that also carries item counters.
  # The delay is the stored +period+.
  class Rescheduling < Base
    has_members :period, :total_items, :goal_items, :prev_max

    # Class-wide bounds, in seconds: 10 s up to one day.
    cattr_accessor :min_resched_delay, :max_resched_delay
    self.min_resched_delay = 10
    self.max_resched_delay = 24*60*60
    def delay
      period
    end
  end
end
68
+
69
+ end
@@ -0,0 +1,8 @@
1
+ require 'monkeyshines/utils/factory_module'
2
+ module Edamame
3
#
# Namespace for the backing stores.
#
module Store
  extend FactoryModule

  # Store classes are loaded on first reference to the constant.
  autoload(:Base,        'edamame/store/base')
  autoload(:TyrantStore, 'edamame/store/tyrant_store')
end
8
+ end
@@ -0,0 +1,62 @@
1
+ # require 'monkeyshines/utils/factory_module'
2
+ module Edamame
3
+ module Store
4
#
# Base class for key-value stores. Everything is delegated to +db+, which
# this class never sets itself (#initialize is empty).
#
class Base
  # The actual backing store. The methods below call #[], #put, #delete,
  # #size, #close, #iterinit and #iternext on it.
  attr_accessor :db

  # options is accepted but not used here.
  def initialize options
  end

  #
  # Executes block once for each element in the whole DB, in whatever order
  # the DB thinks you should see it.
  #
  # Your block will see |key, val|
  #
  #   key_store.each do |key, val|
  #     # ... stuff ...
  #   end
  #
  # Called without a block, returns an Enumerator over the same pairs.
  #
  def each &block
    return to_enum(:each) unless block_given?
    db.iterinit
    loop do
      key = db.iternext or break
      val = db[key]
      yield key, val
    end
  end

  #
  # Like #each, but every stored hash is first turned into an object with
  # klass.from_hash. The block receives a single [key, object] pair.
  #
  # Called without a block, returns an Enumerator over the same pairs.
  #
  def each_as klass, &block
    return to_enum(:each_as, klass) unless block_given?
    self.each do |key, hsh|
      yield [key, klass.from_hash(hsh)]
    end
  end

  # Store val's hash form (nil entries removed) under key.
  # Does nothing when val is nil or false.
  def set(key, val)
    return unless val
    db.put key, val.to_hash.compact
  end

  # Store obj's hash form (nil entries removed) under obj.key.
  # Does nothing when obj is nil or false.
  def save obj
    return unless obj
    db.put obj.key, obj.to_hash.compact
  end

  # Straight delegation to the backing store
  def get(key)      db[key]           end
  def put(key, val) db.put key, val   end
  def [](key)       db[key]           end
  def close()       db.close          end
  def size()        db.size           end
  def delete(key)   db.delete(key)    end

  #
  # Load from standard command-line options
  #
  # obvs only works when there's just one store
  #
  # NOTE(review): the body is empty, so this currently returns nil.
  #
  def self.create type, config
  end
end
61
+ end
62
+ end