mrflip-edamame 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. data/.document +8 -0
  2. data/.gitignore +31 -0
  3. data/LICENSE.textile +21 -0
  4. data/README.textile +178 -0
  5. data/Rakefile +75 -0
  6. data/VERSION +1 -0
  7. data/app/edamame_san/config.ru +4 -0
  8. data/app/edamame_san/config.yml +17 -0
  9. data/app/edamame_san/config/.gitignore +1 -0
  10. data/app/edamame_san/edamame_san.rb +71 -0
  11. data/app/edamame_san/public/favicon.ico +0 -0
  12. data/app/edamame_san/public/images/edamame_logo.icns +0 -0
  13. data/app/edamame_san/public/images/edamame_logo.ico +0 -0
  14. data/app/edamame_san/public/images/edamame_logo.png +0 -0
  15. data/app/edamame_san/public/images/edamame_logo_2.icns +0 -0
  16. data/app/edamame_san/public/javascripts/application.js +8 -0
  17. data/app/edamame_san/public/javascripts/jquery/jquery-ui.js +8694 -0
  18. data/app/edamame_san/public/javascripts/jquery/jquery.js +4376 -0
  19. data/app/edamame_san/public/stylesheets/application.css +32 -0
  20. data/app/edamame_san/public/stylesheets/layout.css +88 -0
  21. data/app/edamame_san/views/layout.haml +13 -0
  22. data/app/edamame_san/views/load.haml +37 -0
  23. data/app/edamame_san/views/root.haml +25 -0
  24. data/bin/edamame-ps +2 -0
  25. data/bin/empty_all.rb +15 -0
  26. data/bin/stats.rb +13 -0
  27. data/bin/sync.rb +15 -0
  28. data/bin/test_run.rb +14 -0
  29. data/edamame.gemspec +110 -0
  30. data/lib/edamame.rb +193 -0
  31. data/lib/edamame/job.rb +134 -0
  32. data/lib/edamame/queue.rb +6 -0
  33. data/lib/edamame/queue/beanstalk.rb +132 -0
  34. data/lib/edamame/rescheduled.rb +89 -0
  35. data/lib/edamame/scheduling.rb +69 -0
  36. data/lib/edamame/store.rb +8 -0
  37. data/lib/edamame/store/base.rb +62 -0
  38. data/lib/edamame/store/tyrant_store.rb +50 -0
  39. data/lib/methods.txt +94 -0
  40. data/spec/edamame_spec.rb +7 -0
  41. data/spec/spec_helper.rb +9 -0
  42. data/utils/god/README-god.textile +54 -0
  43. data/utils/god/beanstalkd_god.rb +34 -0
  44. data/utils/god/edamame.god +30 -0
  45. data/utils/god/god-etc-init-dot-d-example +40 -0
  46. data/utils/god/god_email.rb +45 -0
  47. data/utils/god/god_process.rb +140 -0
  48. data/utils/god/god_site_config.rb +4 -0
  49. data/utils/god/sinatra_god.rb +36 -0
  50. data/utils/god/tyrant_god.rb +67 -0
  51. data/utils/simulation/Add Percent Variation.vi +0 -0
  52. data/utils/simulation/Harmonic Average.vi +0 -0
  53. data/utils/simulation/Rescheduling Simulation.aliases +3 -0
  54. data/utils/simulation/Rescheduling Simulation.lvlps +3 -0
  55. data/utils/simulation/Rescheduling Simulation.lvproj +22 -0
  56. data/utils/simulation/Rescheduling.vi +0 -0
  57. data/utils/simulation/Weighted Average.vi +0 -0
  58. metadata +135 -0
@@ -0,0 +1,134 @@
1
+ module Edamame
2
+
3
+ #
4
+ #
5
+ # id, name, body, timeouts, time-left, age, state, delay, pri, ttr
6
+ #
7
+ #
8
+ # * A job, pulled from the queue: it is connected to its beanstalk presence
9
+ # body contains
10
+ # ** obj
11
+ # ** scheduling
12
+ # ** stats
13
+ #
14
+ # * A DB job
15
+ # body contains
16
+ # ** tube, priority, ttr, state
17
+ # ** obj
18
+ # ** scheduling
19
+ # ** stats
20
# Behaviour shared by both job representations (a live Beanstalk::Job and a
# stored Edamame::Job).
#
# The including class must provide +tube+, +obj+ and +scheduling+.
module JobCore

  # Identifier for this job: the tube name and the object's key joined by
  # a dash. The object's key may be stored under either :key or 'key'.
  def key
    [ tube, obj[:key]||obj['key'] ].join('-')
  end

  # Seconds elapsed since the job last ran (positive once it has run).
  #
  # Uses the same orientation as Scheduling::Base#since_last
  # (now minus last run); the previous form, last_run - Time.now,
  # produced a negative number.
  #
  # NOTE(review): assumes scheduling.last_run is a Time -- confirm that
  # callers never leave it as an unparsed String.
  def since_last
    Time.now - scheduling.last_run
  end

  # Beanstalk::Job stats:
  #    { "pri"=>65536, "ttr"=>120,
  #    {"releases"=>8, "delay"=>5, "kicks"=>0, "buries"=>0, "id"=>202,
  #    "tube"=>"default", "time-left"=>120,
  #    "timeouts"=>0, "age"=>1415, "state"=>"reserved"}
  #
  #    [ "id",
  #    "tube", "pri", "ttr", "state",
  #    "delay",
  #    "releases", "kicks", "buries",
  #    "time-left", "timeouts", "age", ]
end
43
+
44
+
45
# Reopen the beanstalk client's Job class so that a job pulled from the queue
# exposes the same accessors as an Edamame::Job.
Beanstalk::Job.class_eval do
  include JobCore

  # The Scheduling object rebuilt from the 'scheduling' entry of the job's
  # YAML body; built on first use and cached afterwards.
  def scheduling
    @scheduling ||= Edamame::Scheduling.from_hash(ybody['scheduling'])
  end

  # The 'obj' entry of the job's YAML body.
  def obj
    ybody['obj']
  end

  # Alias for the job's +pri+.
  def priority
    pri
  end

  # The tube name, as reported by the job's stats.
  def tube
    stats['tube']
  end

  # Override this for rescheduling
  #
  # Adds the job's release count to the scheduling's run total and stamps
  # the scheduling with the current time.
  def update!
    sched = scheduling
    sched.total_runs = sched.total_runs.to_i + stats['releases']
    sched.last_run   = Time.now
  end

  # Hash representation of the job. With +flatten+ (the default) the
  # 'scheduling' and 'obj' entries are serialized to YAML strings; otherwise
  # they are left as hashes.
  def to_hash(flatten=true)
    result = {}
    result["tube"]       = tube
    result["priority"]   = priority
    result["ttr"]        = ttr
    result["state"]      = state
    result["scheduling"] = scheduling.to_hash
    result['key']        = key
    result["obj"]        = obj.to_hash
    return result unless flatten

    ["scheduling", "obj"].each do |field|
      result[field] = result[field].to_yaml
    end
    result
  end
end
87
+
88
# A job record held outside the queue, with fields
# tube, priority, ttr, state, scheduling and obj.
#
# +scheduling+ may be supplied as a Scheduling object, a Hash, or a YAML
# string; +obj+ may be supplied as a YAML string. Missing fields take their
# values from DEFAULT_OPTIONS.
class Job < Struct.new(
    :tube, :priority, :ttr, :state,
    :scheduling, :obj
    )
  # Job.class_eval do
  include JobCore

  DEFAULT_OPTIONS = {
    'priority'   => 65536,
    'ttr'        => 120,
    'state'      => 1,
    'scheduling' => Edamame::Scheduling::Once.new()
  }

  # attr_accessor :runs, :failures, :prev_run_at

  # Accepts the struct members positionally, fills in defaults, coerces the
  # numeric fields to integers and normalizes +scheduling+ / +obj+.
  #
  # Raises RuntimeError if +scheduling+ is neither a Scheduling object, a
  # Hash nor a String.
  def initialize *args
    super(*args)
    DEFAULT_OPTIONS.each do |key, val|
      # Each job gets its own copy of the default scheduling; sharing the
      # constant's instance would let one job's update leak into all others.
      self[key] ||= (key == 'scheduling' ? val.dup : val)
    end
    [:priority, :ttr, :state].each{|key| self[key] = self[key].to_i }
    case self.scheduling
    when Scheduling::Base
      # Already a Scheduling object (including the default applied above):
      # nothing to convert. Previously this fell through to the raise below,
      # so a job built without an explicit scheduling could never be created.
    when String
      # NOTE(review): YAML.load on stored data -- make sure the store is trusted.
      # A string that does not parse is left untouched (best effort).
      scheduling_hash = YAML.load(self.scheduling) rescue nil
      self.scheduling = Scheduling.from_hash(scheduling_hash) if scheduling_hash
    when Hash
      self.scheduling = Scheduling.from_hash(scheduling)
    else raise "Can't build a Scheduling from #{self.scheduling}" ; end
    if self.obj.is_a?(String) then self.obj = YAML.load(self.obj) rescue nil ; end
  end

  # Delay before the job should next run, as decided by its scheduling.
  def delay
    scheduling.delay
  end

  # Hash representation of the job. With +flatten+ (the default) the
  # 'scheduling' and 'obj' entries are serialized to YAML strings.
  #
  # NOTE(review): relies on an inherited #to_hash (core Struct does not
  # define one) -- presumably supplied by a Struct extension; confirm.
  def to_hash flatten=true
    hsh = super()
    hsh["scheduling"] = scheduling.to_hash
    hsh["obj"]        = obj.to_hash
    if flatten
      hsh["scheduling"] = hsh['scheduling'].to_yaml
      hsh["obj"]        = hsh['obj'].to_yaml
    end
    hsh
  end
end
132
+ end
133
+
134
+
@@ -0,0 +1,6 @@
1
+ module Edamame
2
+ module Queue
3
+ extend FactoryModule
4
+ autoload :BeanstalkQueue, 'edamame/queue/beanstalk'
5
+ end
6
+ end
@@ -0,0 +1,132 @@
1
+ module Edamame
2
+ module Queue
3
+ #
4
+ # Persistent job queue for periodic requests.
5
+ #
6
+ # Jobs are reserved, run, and if successful put back with an updated delay parameter.
7
+ #
8
+ # This is useful for mass scraping of timelines (RSS feeds, twitter search
9
+ # results, etc.). See http://github.com/mrflip/wuclan for an example.
10
+ #
11
# Thin wrapper around a Beanstalk::Pool used as the job queue.
#
# The pool is opened on first use (see #beanstalk) and the configured
# default tube is then both used and watched.
class BeanstalkQueue
  DEFAULT_OPTIONS = {
    :priority     => 65536,    # default job queue priority
    :time_to_run  => 60*5,     # 5 minutes to complete a job or assume dead
    :uris         => ['localhost:11300'],
    :default_tube => 'default',
  }
  attr_accessor :options

  #
  # _options -- overrides for DEFAULT_OPTIONS; nil-valued entries are
  # dropped so the corresponding default is kept. With no :uris given the
  # single-node ['localhost:11300'] pool is used.
  #
  def initialize _options={}
    self.options = DEFAULT_OPTIONS.deep_merge(_options.compact)
    options[:default_tube] = options[:default_tube].to_s
  end

  #
  # Add a new Job to the queue.
  #
  # +priority+ and +delay+ default to the job's own values. The delay is
  # coerced to an integer number of seconds, so a nil delay (as returned by
  # Scheduling::Once#delay) becomes 0.
  #
  def put job, priority=nil, delay=nil
    beanstalk.yput(job.to_hash(false),
      (priority || job.priority), (delay || job.delay).to_i, job.ttr)
  end

  #
  # Remove the job from the queue.
  #
  def delete(job)
    job.delete
  end

  #
  # Returns the job to the queue, to be re-run later.
  #
  # release'ing a job acknowledges it was completed, successfully or not.
  # The delay is coerced to an integer number of seconds (nil becomes 0).
  #
  def release job, priority=nil, delay=nil
    job.release( (priority || job.priority), (delay || job.delay).to_i )
  end

  #
  # Take the next (highest priority, delay met) job.
  # Set timeout (default is 10s)
  # Returns nil on error or timeout. Interrupt error passes through
  #
  def reserve timeout=10
    begin
      job = beanstalk.reserve(timeout) or return
    rescue Beanstalk::TimedOut => e ; warn e.to_s ; sleep 0.4 ; return ;
    rescue StandardError => e       ; warn e.to_s ; sleep 1   ; return ; end
    job
  end

  #
  # Shelves the job, keeping its current priority.
  #
  # The job is passed explicitly, like #delete and #release; the earlier
  # zero-argument form referenced an undefined +job+ and always raised.
  #
  def bury job
    job.bury job.priority
  end

  # The beanstalk pool which acts as job queue. Opened on first call.
  def beanstalk
    return @beanstalk if @beanstalk
    @beanstalk = Beanstalk::Pool.new(options[:uris], options[:default_tube])
    self.tube= options[:default_tube]
    @beanstalk
  end

  # Close the job queue
  def close
    @beanstalk.close if @beanstalk
    @beanstalk = nil
  end

  # uses and watches the given beanstalk tube
  def tube= _tube
    pool = beanstalk  # opens the pool if needed rather than calling nil.use
    puts "#{self.class} setting tube to #{_tube}, was #{@tube}"
    pool.use   _tube
    pool.watch _tube
    @tube = _tube     # remembered so the next change can report the old tube
  end

  # Stats on job count across the pool: every stats entry whose name
  # mentions "jobs". Always a Hash.
  def stats
    beanstalk.stats.reject{|k,v| k !~ /jobs/ }
  end

  # Total jobs in the queue, whether reserved, ready, buried or delayed.
  # Stats are fetched once; a missing counter counts as zero.
  def total_jobs
    counts = stats
    [:reserved, :ready, :buried, :delayed].inject(0) do |sum, type|
      sum + counts["current-jobs-#{type}"].to_i
    end
  end

  #
  # Drains jobs: kicks buried jobs, then reserves, yields (if a block is
  # given) and deletes each job until none is ready.
  #
  # tube -- optional tube to switch to while draining. Afterwards the
  # previously used tube is restored, and +tube+ is ignored again unless it
  # was already being watched -- even if the block raises.
  #
  def empty tube=nil, &block
    tube = tube.to_s if tube
    curr_tube    = beanstalk.list_tube_used.values.first
    curr_watches = beanstalk.list_tubes_watched.values.first
    if tube
      beanstalk.use   tube
      beanstalk.watch tube
    end
    p ["emptying", tube, total_jobs]
    begin
      loop do
        beanstalk.open_connections.each{|conxn| conxn.kick(20) }
        break if (total_jobs == 0) || (!beanstalk.peek_ready)
        qjob = reserve(5) or break
        yield qjob if block_given?
        qjob.delete
      end
    ensure
      beanstalk.use curr_tube
      beanstalk.ignore tube if tube && (! curr_watches.include?(tube))
    end
  end

  # Runs #empty on every tube known to the pool, passing the block along.
  def empty_all &block
    tubes = beanstalk.list_tubes.values.flatten.uniq
    tubes.each do |tube|
      empty tube, &block
    end
  end

end # class
130
+ end
131
+ end
132
+
@@ -0,0 +1,89 @@
1
+ module Edamame
2
# Mixin for re-enqueueing scrape jobs with a delay derived from their item
# rate.
#
# The including object is expected to supply: config, job_queue,
# request_klass, items_goal, min_resched_delay and max_resched_delay.
module Rescheduled

  # ===========================================================================
  #
  # Rescheduling

  #
  # Deletes the queue job, then saves the scrape job again at the priority
  # the queue job had (delay is recomputed by #save).
  #
  def reschedule(qjob, scrape_job)
    original_priority = qjob.stats['pri']
    qjob.delete
    save(scrape_job, original_priority)
  end

  #
  # Serializes the scrape job (tab-joined #to_flat) and puts it on the job
  # queue. +delay+ falls back to #delay_to_next_scrape and +priority+ to
  # config[:priority]; time-to-run always comes from config[:time_to_run].
  #
  # The existing queue job, if any, is not removed -- that is what
  # #reschedule is for.
  #
  def save(scrape_job, priority=nil, delay=nil)
    payload = scrape_job.to_flat.join("\t")
    wait    = delay    || delay_to_next_scrape(scrape_job)
    pri     = priority || config[:priority]
    log(scrape_job, pri, wait)
    job_queue.put(payload, pri, wait, config[:time_to_run])
  end

  # Shorthand for #save with default priority and delay.
  def <<(scrape_job)
    save(scrape_job)
  end

  #
  # Whole seconds to wait before the next scrape: items_goal divided by the
  # job's average rate, kept within [min_resched_delay, max_resched_delay].
  # When the job has no average rate, max_resched_delay is returned as is.
  #
  def delay_to_next_scrape(scrape_job)
    rate = scrape_job.avg_rate
    return max_resched_delay unless rate

    (items_goal.to_f / rate).clamp(min_resched_delay, max_resched_delay).to_i
  end

  #
  # Writes one tab-separated info line describing the rescheduling:
  # query term, priority (blank when nil), rate, previous and new item
  # counts, the delay and the resulting wall-clock time.
  #
  def log(scrape_job, priority=nil, delay=nil)
    wait = delay || delay_to_next_scrape(scrape_job)
    rate_column =
      if scrape_job.avg_rate
        "%10.5f/s" % (scrape_job.avg_rate)
      else
        " "*12
      end
    columns = [
      "Rescheduling",
      "%-23s" % scrape_job.query_term,
      (priority ? "%6d" % priority : ""),
      rate_column,
      "%7d" % (scrape_job.prev_items || 0),
      "#{"%4d" % (scrape_job.new_items || 0)} nu",
      "in #{"%8.2f" % wait} s",
      (Time.now + wait).strftime("%Y-%m-%d %H:%M:%S"),
    ]
    Monkeyshines.logger.info columns.join("\t")
  end

  # ===========================================================================
  #
  # Beanstalkd interface
  #

  #
  # Rebuilds a scrape job from a queue job's tab-separated body. The first
  # field is skipped; the rest are passed to request_klass.new.
  #
  def scrape_job_from_qjob(qjob)
    # request_klass = Wukong.class_from_resource(args.shift)
    _skipped, *ctor_args = qjob.body.split("\t")
    request_klass.new(*ctor_args)
  end

  # Reserves the next job from the job queue, waiting up to +to+ seconds
  # (default 10). On Beanstalk::TimedOut or any other StandardError the
  # error is logged, the method sleeps briefly and returns nil.
  def reserve_job!(to=10)
    job_queue.reserve(to)
  rescue Beanstalk::TimedOut => e
    Monkeyshines.logger.info e.to_s
    sleep 0.4
    nil
  rescue StandardError => e
    Monkeyshines.logger.warn e.to_s
    sleep 1
    nil
  end

end
89
+ end
@@ -0,0 +1,69 @@
1
+ require 'wukong/extensions/hashlike_class'
2
+ module Edamame
3
+
4
# Scheduling policies for jobs. Each policy answers #delay (how long to wait
# before the job is next run) and carries the bookkeeping fields declared in
# Base.
module Scheduling
  extend FactoryModule

  # def type
  #   self.class.to_s
  # end
  # def to_hash
  # end

  # Common fields and helpers for every scheduling policy.
  class Base
    include Wukong::HashlikeClass
    has_members :last_run, :total_runs

    # Assigns the positional +args+ to the declared members, in member
    # order; nil/false values are skipped.
    def initialize *args
      members.zip(args).each do |key, val|
        self[key] = val if val
      end
    end

    # +last_run+ as a Time, parsing it first when it is held as a String.
    def last_run_time
      last_run.is_a?(String) ? Time.parse(last_run) : last_run
    end

    # Seconds elapsed since the last run.
    def since_last
      Time.now - last_run_time
    end

  end

  # Run repeatedly, +period+ seconds apart.
  class Every < Base
    has_member :period
    def delay
      period
    end
  end

  # Run at a given +time+.
  class At < Base
    # Declared as a member (like Every#period) rather than a bare
    # attr_accessor: Base#initialize only assigns declared members, so with
    # attr_accessor a +time+ passed to the constructor was silently dropped.
    has_member :time
    def initialize *args
      super(*args)
      self.time = Time.parse(time) if time.is_a?(String)
    end
    # Seconds from now until +time+. Computed on the first call and cached,
    # so later calls return that first value.
    def delay
      @delay ||= time - Time.now
    end
  end

  # Run a single time; there is no delay to report.
  class Once < Base
    def delay
      nil
    end
  end

  # Periodic run whose +period+ is meant to be adjusted between runs; the
  # class-level min/max bounds are stored here but not applied by #delay.
  class Rescheduling < Base
    has_members :period, :total_items, :goal_items, :prev_max

    cattr_accessor :min_resched_delay, :max_resched_delay
    self.min_resched_delay = 10
    self.max_resched_delay = 24*60*60
    def delay
      period
    end
  end
end
68
+
69
+ end
@@ -0,0 +1,8 @@
1
+ require 'monkeyshines/utils/factory_module'
2
+ module Edamame
3
+ module Store
4
+ extend FactoryModule
5
+ autoload :Base, 'edamame/store/base'
6
+ autoload :TyrantStore, 'edamame/store/tyrant_store'
7
+ end
8
+ end
@@ -0,0 +1,62 @@
1
+ # require 'monkeyshines/utils/factory_module'
2
+ module Edamame
3
+ module Store
4
# Generic key/value store front-end. All work is delegated to +db+, which
# must be assigned before use.
class Base
  # The actual backing store; should respond to #set and #get methods
  attr_accessor :db

  # Accepts an options argument but does nothing with it.
  def initialize(options)
  end

  #
  # Walks the entire backing store, in the store's own iteration order,
  # yielding each key with its value:
  #
  #   key_store.each do |key, val|
  #     # ... stuff ...
  #   end
  #
  def each(&block)
    db.iterinit
    loop do
      key = db.iternext
      break unless key
      yield key, db[key]
    end
  end

  # Like #each, but every stored value is first converted with
  # klass.from_hash; the block receives [key, object].
  def each_as(klass, &block)
    each { |key, hsh| yield [key, klass.from_hash(hsh)] }
  end

  # Stores val's hash form (nil-valued entries removed) under +key+.
  # Does nothing when +val+ is nil or false.
  def set(key, val)
    db.put(key, val.to_hash.compact) if val
  end

  # Stores obj's hash form (nil-valued entries removed) under obj.key.
  # Does nothing when +obj+ is nil or false.
  def save(obj)
    db.put(obj.key, obj.to_hash.compact) if obj
  end

  # Straight pass-throughs to the backing store.

  def get(key)
    db[key]
  end

  def put(key, val)
    db.put(key, val)
  end

  def [](key)
    db[key]
  end

  def close
    db.close
  end

  def size
    db.size
  end

  def delete(key)
    db.delete(key)
  end

  #
  # Load from standard command-line options
  #
  # obvs only works when there's just one store
  #
  # Currently an empty placeholder that returns nil.
  #
  def self.create(type, config)
  end
end
61
+ end
62
+ end