qless 0.9.3 → 0.10.0

Files changed (65)
  1. data/Gemfile +9 -3
  2. data/README.md +70 -25
  3. data/Rakefile +125 -9
  4. data/exe/install_phantomjs +21 -0
  5. data/lib/qless.rb +115 -76
  6. data/lib/qless/config.rb +11 -9
  7. data/lib/qless/failure_formatter.rb +43 -0
  8. data/lib/qless/job.rb +201 -102
  9. data/lib/qless/job_reservers/ordered.rb +7 -1
  10. data/lib/qless/job_reservers/round_robin.rb +16 -6
  11. data/lib/qless/job_reservers/shuffled_round_robin.rb +9 -2
  12. data/lib/qless/lua/qless-lib.lua +2463 -0
  13. data/lib/qless/lua/qless.lua +2012 -0
  14. data/lib/qless/lua_script.rb +63 -12
  15. data/lib/qless/middleware/memory_usage_monitor.rb +62 -0
  16. data/lib/qless/middleware/metriks.rb +45 -0
  17. data/lib/qless/middleware/redis_reconnect.rb +6 -3
  18. data/lib/qless/middleware/requeue_exceptions.rb +94 -0
  19. data/lib/qless/middleware/retry_exceptions.rb +38 -9
  20. data/lib/qless/middleware/sentry.rb +3 -7
  21. data/lib/qless/middleware/timeout.rb +64 -0
  22. data/lib/qless/queue.rb +90 -55
  23. data/lib/qless/server.rb +177 -130
  24. data/lib/qless/server/views/_job.erb +33 -15
  25. data/lib/qless/server/views/completed.erb +11 -0
  26. data/lib/qless/server/views/layout.erb +70 -11
  27. data/lib/qless/server/views/overview.erb +93 -53
  28. data/lib/qless/server/views/queue.erb +9 -8
  29. data/lib/qless/server/views/queues.erb +18 -1
  30. data/lib/qless/subscriber.rb +37 -22
  31. data/lib/qless/tasks.rb +5 -10
  32. data/lib/qless/test_helpers/worker_helpers.rb +55 -0
  33. data/lib/qless/version.rb +3 -1
  34. data/lib/qless/worker.rb +4 -413
  35. data/lib/qless/worker/base.rb +247 -0
  36. data/lib/qless/worker/forking.rb +245 -0
  37. data/lib/qless/worker/serial.rb +41 -0
  38. metadata +135 -52
  39. data/lib/qless/qless-core/cancel.lua +0 -101
  40. data/lib/qless/qless-core/complete.lua +0 -233
  41. data/lib/qless/qless-core/config.lua +0 -56
  42. data/lib/qless/qless-core/depends.lua +0 -65
  43. data/lib/qless/qless-core/deregister_workers.lua +0 -12
  44. data/lib/qless/qless-core/fail.lua +0 -117
  45. data/lib/qless/qless-core/failed.lua +0 -83
  46. data/lib/qless/qless-core/get.lua +0 -37
  47. data/lib/qless/qless-core/heartbeat.lua +0 -51
  48. data/lib/qless/qless-core/jobs.lua +0 -41
  49. data/lib/qless/qless-core/pause.lua +0 -18
  50. data/lib/qless/qless-core/peek.lua +0 -165
  51. data/lib/qless/qless-core/pop.lua +0 -314
  52. data/lib/qless/qless-core/priority.lua +0 -32
  53. data/lib/qless/qless-core/put.lua +0 -169
  54. data/lib/qless/qless-core/qless-lib.lua +0 -2354
  55. data/lib/qless/qless-core/qless.lua +0 -1862
  56. data/lib/qless/qless-core/queues.lua +0 -58
  57. data/lib/qless/qless-core/recur.lua +0 -190
  58. data/lib/qless/qless-core/retry.lua +0 -73
  59. data/lib/qless/qless-core/stats.lua +0 -92
  60. data/lib/qless/qless-core/tag.lua +0 -100
  61. data/lib/qless/qless-core/track.lua +0 -79
  62. data/lib/qless/qless-core/unfail.lua +0 -54
  63. data/lib/qless/qless-core/unpause.lua +0 -12
  64. data/lib/qless/qless-core/workers.lua +0 -69
  65. data/lib/qless/wait_until.rb +0 -19
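
The headline change in 0.10.0 is the worker rewrite: the monolithic data/lib/qless/worker.rb (+4 -413) is split into data/lib/qless/worker/base.rb, data/lib/qless/worker/serial.rb, and data/lib/qless/worker/forking.rb, shown below. As a minimal sketch of how the new classes fit together — assuming a locally running Redis and a queue named 'example', both of which are illustrative rather than taken from the diff:

    # Hypothetical usage of the new worker classes added in this release.
    require 'logger'
    require 'qless'
    require 'qless/job_reservers/ordered'
    require 'qless/worker/forking'

    client   = Qless::Client.new(host: 'localhost', port: 6379)
    queues   = [client.queues['example']]
    reserver = Qless::JobReservers::Ordered.new(queues)

    # Option names mirror those read in BaseWorker#initialize and
    # ForkingWorker#initialize below; the values are illustrative.
    worker = Qless::Workers::ForkingWorker.new(
      reserver,
      num_workers: 2,            # number of forked children
      max_startup_interval: 5.0, # stagger child startup
      interval: 5.0,             # sleep when no job is available
      log_level: Logger::INFO
    )
    worker.run
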
data/lib/qless/worker/base.rb
@@ -0,0 +1,247 @@
+ # Encoding: utf-8
+
+ # Standard stuff
+ require 'time'
+ require 'logger'
+ require 'thread'
+
+ # Qless requires
+ require 'qless'
+ require 'qless/subscriber'
+
+ module Qless
+   module Workers
+     JobLockLost = Class.new(StandardError)
+
+     class BaseWorker
+       attr_accessor :output, :reserver, :interval, :paused,
+                     :options, :sighup_handler
+
+       def initialize(reserver, options = {})
+         # Our job reserver and options
+         @reserver = reserver
+         @options = options
+
+         # SIGHUP handler
+         @sighup_handler = options.fetch(:sighup_handler) { lambda { } }
+
+         # Our logger
+         @log = options.fetch(:logger) do
+           @output = options.fetch(:output, $stdout)
+           Logger.new(output).tap do |logger|
+             logger.level = options.fetch(:log_level, Logger::WARN)
+             logger.formatter = options.fetch(:log_formatter) do
+               Proc.new { |severity, datetime, progname, msg| "#{datetime}: #{msg}\n" }
+             end
+           end
+         end
+
+         # The interval for checking for new jobs
+         @interval = options.fetch(:interval, 5.0)
+         @current_job_mutex = Mutex.new
+         @current_job = nil
+
+         # Default behavior when a lock is lost: stop after the current job.
+         on_current_job_lock_lost { shutdown(in_signal_handler=false) }
+       end
+
+       def log_level
+         @log.level
+       end
+
+       def safe_trap(signal_name, &cblock)
+         begin
+           trap(signal_name, cblock)
+         rescue ArgumentError
+           warn "Signal #{signal_name} not supported."
+         end
+       end
+
+       # The meaning of these signals is meant to closely mirror resque
+       #
+       # TERM: Shutdown immediately, stop processing jobs.
+       # INT:  Shutdown immediately, stop processing jobs.
+       # QUIT: Shutdown after the current job has finished processing.
+       # USR1: Kill the forked children immediately, continue processing jobs.
+       # USR2: Pause after this job
+       # CONT: Start processing jobs again after a USR2
+       # HUP:  Print current stack to log and continue
+       def register_signal_handlers
+         # Otherwise, we want to take the appropriate action
+         trap('TERM') { exit! }
+         trap('INT') { exit! }
+         safe_trap('HUP') { sighup_handler.call }
+         safe_trap('QUIT') { shutdown(in_signal_handler=true) }
+         begin
+           trap('CONT') { unpause(in_signal_handler=true) }
+           trap('USR2') { pause(in_signal_handler=true) }
+         rescue ArgumentError
+           warn 'Signals USR2 and/or CONT not supported.'
+         end
+       end
+
+       # Return an enumerator to each of the jobs provided by the reserver
+       def jobs
+         return Enumerator.new do |enum|
+           loop do
+             begin
+               job = reserver.reserve
+             rescue Exception => error
+               # We want workers to durably stay up, so we don't want errors
+               # during job reserving (e.g. network timeouts, etc) to kill the
+               # worker.
+               log(:error,
+                   "Error reserving job: #{error.class}: #{error.message}")
+             end
+
+             # If we ended up getting a job, yield it. Otherwise, we wait
+             if job.nil?
+               no_job_available
+             else
+               self.current_job = job
+               enum.yield(job)
+               self.current_job = nil
+             end
+
+             break if @shutdown
+           end
+         end
+       end
+
+       # Actually perform the job
+       def perform(job)
+         around_perform(job)
+       rescue JobLockLost
+         log(:warn, "Lost lock for job #{job.jid}")
+       rescue Exception => error
+         fail_job(job, error, caller)
+       else
+         try_complete(job)
+       end
+
+       # Allow middleware modules to be mixed in and override the
+       # definition of around_perform while providing a default
+       # implementation so our code can assume the method is present.
+       module SupportsMiddlewareModules
+         def around_perform(job)
+           job.perform
+         end
+
+         def after_fork
+         end
+       end
+
+       include SupportsMiddlewareModules
+
+       # Stop processing after this job
+       def shutdown(in_signal_handler=true)
+         @shutdown = true
+       end
+       alias stop! shutdown # so we can call `stop!` regardless of the worker type
+
+       # Pause the worker -- take no more new jobs
+       def pause(in_signal_handler=true)
+         @paused = true
+         procline("Paused -- #{reserver.description}", in_signal_handler=in_signal_handler)
+       end
+
+       # Continue taking new jobs
+       def unpause(in_signal_handler=true)
+         @paused = false
+       end
+
+       # Set the procline. Not supported on all systems
+       def procline(value, in_signal_handler=true)
+         $0 = "Qless-#{Qless::VERSION}: #{value} at #{Time.now.iso8601}"
+         log(:debug, $PROGRAM_NAME) unless in_signal_handler
+       end
+
+       # Complete the job unless the worker has already put it into another state
+       # by completing / failing / etc. the job
+       def try_complete(job)
+         job.complete unless job.state_changed?
+       rescue Job::CantCompleteError => e
+         # There's not much we can do here. Complete fails in a few cases:
+         #   - The job is already failed (i.e. by another worker)
+         #   - The job is being worked on by another worker
+         #   - The job has been cancelled
+         #
+         # We don't want to (or aren't able to) fail the job with this error in
+         # any of these cases, so the best we can do is log the failure.
+         log(:error, "Failed to complete #{job.inspect}: #{e.message}")
+       end
+
+       def fail_job(job, error, worker_backtrace)
+         failure = Qless.failure_formatter.format(job, error, worker_backtrace)
+         log(:error, "Got #{failure.group} failure from #{job.inspect}\n#{failure.message}")
+         job.fail(*failure)
+       rescue Job::CantFailError => e
+         # There's not much we can do here. Another worker may have cancelled it,
+         # or we might not own the job, etc. Logging is the best we can do.
+         log(:error, "Failed to fail #{job.inspect}: #{e.message}")
+       end
+
+       def deregister
+         uniq_clients.each do |client|
+           client.deregister_workers(client.worker_name)
+         end
+       end
+
+       def uniq_clients
+         @uniq_clients ||= reserver.queues.map(&:client).uniq
+       end
+
+       def on_current_job_lock_lost(&block)
+         @on_current_job_lock_lost = block
+       end
+
+       def listen_for_lost_lock
+         subscribers = uniq_clients.map do |client|
+           Subscriber.start(client, "ql:w:#{client.worker_name}", log: @log) do |_, message|
+             if message['event'] == 'lock_lost'
+               with_current_job do |job|
+                 if job && message['jid'] == job.jid
+                   @on_current_job_lock_lost.call(job)
+                 end
+               end
+             end
+           end
+         end
+
+         yield
+       ensure
+         subscribers.each(&:stop)
+       end
+
+       private
+
+       def log(type, msg)
+         @log.public_send(type, "#{Process.pid}: #{msg}")
+       end
+
+       def no_job_available
+         unless interval.zero?
+           procline("Waiting for #{reserver.description}", in_signal_handler=false)
+           log(:debug, "Sleeping for #{interval} seconds")
+           sleep interval
+         end
+       end
+
+       def with_current_job
+         @current_job_mutex.synchronize do
+           yield @current_job
+         end
+       end
+
+       def current_job=(job)
+         @current_job_mutex.synchronize do
+           @current_job = job
+         end
+       end
+
+       def reconnect_each_client
+         uniq_clients.each { |client| client.redis.client.reconnect }
+       end
+     end
+   end
+ end
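
The SupportsMiddlewareModules hook above is what the bundled middleware in this release (retry_exceptions, requeue_exceptions, metriks, timeout, etc.) builds on: a module overrides around_perform and calls super. A sketch of a custom middleware in that style — the module name and timing logic are illustrative, not part of the gem:

    # Hypothetical middleware: times each job, relying only on the
    # around_perform contract defined in SupportsMiddlewareModules.
    module JobTiming
      def around_perform(job)
        started = Time.now
        super # falls through to job.perform (or the next middleware)
      ensure
        puts "#{job.jid} took #{(Time.now - started).round(2)}s"
      end
    end

    # ForkingWorker#extend (below) records the module so it can be
    # re-applied to each spawned SerialWorker child.
    worker.extend(JobTiming)
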
data/lib/qless/worker/forking.rb
@@ -0,0 +1,245 @@
+ # Encoding: utf-8
+
+ # Qless requires
+ require 'qless'
+ require 'qless/worker/base'
+ require 'qless/worker/serial'
+ require 'thread'
+
+ module Qless
+   module Workers
+     class ForkingWorker < BaseWorker
+       # The child startup interval
+       attr_accessor :max_startup_interval
+
+       def initialize(reserver, options = {})
+         super(reserver, options)
+         # The keys are the child PIDs, the values are information about the
+         # worker, including its sandbox directory. This directory currently
+         # isn't used, but this sets up for having that eventually.
+         @sandboxes = {}
+
+         # Save our options for starting children
+         @options = options
+
+         # The max interval between when children start (reduces thundering herd)
+         @max_startup_interval = options[:max_startup_interval] || 10.0
+
+         # TODO: facter to figure out how many cores we have
+         @num_workers = options[:num_workers] || 1
+
+         # All the modules that have been applied to this worker
+         @modules = []
+
+         @sandbox_mutex = Mutex.new
+         # A queue of blocks that are postponed since we cannot acquire
+         # @sandbox_mutex in a trap handler
+         @postponed_actions_queue = ::Queue.new
+       end
+
+       # Because we spawn a new worker, we need to apply all the modules that
+       # extend this one
+       def extend(mod)
+         @modules << mod
+         super(mod)
+       end
+
+       # Spawn a new child worker
+       def spawn
+         worker = SerialWorker.new(reserver, @options)
+         # We use 11 as the exit status so that it is something unique
+         # (rather than the common 1). Plus, 11 looks a little like
+         # ll (i.e. "Lock Lost").
+         worker.on_current_job_lock_lost { |job| exit!(11) }
+         @modules.each { |mod| worker.extend(mod) }
+         worker
+       end
+
+       # If @sandbox_mutex is free, execute the block immediately.
+       # Otherwise, postpone it until handling is possible
+       def contention_aware_handler(&block)
+         if @sandbox_mutex.try_lock
+           block.call
+           @sandbox_mutex.unlock
+         else
+           @postponed_actions_queue << block
+         end
+       end
+
+       # Process any signals (such as TERM) that could not be processed
+       # immediately due to @sandbox_mutex being in use
+       def process_postponed_actions
+         until @postponed_actions_queue.empty?
+           # It's possible a signal interrupted us between the empty?
+           # and shift calls, but it could only have added more things
+           # into @postponed_actions_queue
+           block = @postponed_actions_queue.shift(true)
+           @sandbox_mutex.synchronize do
+             block.call
+           end
+         end
+       end
+
+       # Register our handling of signals
+       def register_signal_handlers
+         # If we're the parent process, we mostly want to forward the signals on
+         # to the child processes. It's just that sometimes we want to wait for
+         # them and then exit
+         trap('TERM') do
+           contention_aware_handler { stop!('TERM', in_signal_handler=true); exit }
+         end
+         trap('INT') do
+           contention_aware_handler { stop!('INT', in_signal_handler=true); exit }
+         end
+         safe_trap('HUP') { sighup_handler.call }
+         safe_trap('QUIT') do
+           contention_aware_handler { stop!('QUIT', in_signal_handler=true); exit }
+         end
+         safe_trap('USR1') do
+           contention_aware_handler { stop!('KILL', in_signal_handler=true) }
+         end
+         begin
+           trap('CONT') { stop('CONT', in_signal_handler=true) }
+           trap('USR2') { stop('USR2', in_signal_handler=true) }
+         rescue ArgumentError
+           warn 'Signals USR2 and/or CONT not supported.'
+         end
+       end
+
+       # Run this worker
+       def run
+         startup_sandboxes
+
+         # Now keep an eye on our child processes, spawn replacements as needed
+         loop do
+           begin
+             # Don't wait on any processes if we're already in shutdown mode.
+             break if @shutdown
+
+             # Wait for any child to kick the bucket
+             pid, status = Process.wait2
+             code, sig = status.exitstatus, status.stopsig
+             log(:warn,
+                 "Worker process #{pid} died with #{code} from signal (#{sig})")
+
+             # Allow our shutdown logic (called from a separate thread) to take effect.
+             break if @shutdown
+
+             spawn_replacement_child(pid)
+             process_postponed_actions
+           rescue SystemCallError => e
+             log(:error, "Failed to wait for child process: #{e.inspect}")
+             # If we're shutting down, the loop above will exit
+             exit! unless @shutdown
+           end
+         end
+       end
+
+       # Returns a list of each of the child pids
+       def children
+         @sandboxes.keys
+       end
+
+       # Signal all the children
+       def stop(signal = 'QUIT', in_signal_handler=true)
+         log(:warn, "Sending #{signal} to children") unless in_signal_handler
+         children.each do |pid|
+           begin
+             Process.kill(signal, pid)
+           rescue Errno::ESRCH
+             # no such process -- means the process has already died.
+           end
+         end
+       end
+
+       # Signal all the children and wait for them to exit.
+       # Should only be called when we have the lock on @sandbox_mutex
+       def stop!(signal = 'QUIT', in_signal_handler=true)
+         shutdown(in_signal_handler=in_signal_handler)
+         shutdown_sandboxes(signal, in_signal_handler=in_signal_handler)
+       end
+
+       private
+
+       def startup_sandboxes
+         # Make sure we respond to signals correctly
+         register_signal_handlers
+
+         log(:debug, "Starting to run with #{@num_workers} workers")
+         @num_workers.times do |i|
+           slot = {
+             worker_id: i,
+             sandbox: nil
+           }
+
+           cpid = fork_child_process do
+             # Wait for a bit to calm the thundering herd
+             sleep(rand(max_startup_interval)) if max_startup_interval > 0
+           end
+
+           # If we're the parent process, save information about the child
+           log(:info, "Spawned worker #{cpid}")
+           @sandboxes[cpid] = slot
+         end
+       end
+
+       # Should only be called when we have a lock on @sandbox_mutex
+       def shutdown_sandboxes(signal, in_signal_handler=true)
+         # First, send the signal
+         stop(signal, in_signal_handler=in_signal_handler)
+
+         # Wait for each of our children
+         log(:warn, 'Waiting for child processes') unless in_signal_handler
+
+         until @sandboxes.empty?
+           begin
+             pid, _ = Process.wait2
+             log(:warn, "Child #{pid} stopped") unless in_signal_handler
+             @sandboxes.delete(pid)
+           rescue SystemCallError
+             break
+           end
+         end
+
+         unless in_signal_handler
+           log(:warn, 'All children have stopped')
+
+           # If there were any child processes we couldn't wait for, log it
+           @sandboxes.keys.each do |cpid|
+             log(:warn, "Could not wait for child #{cpid}")
+           end
+         end
+
+         @sandboxes.clear
+       end
+
+       def spawn_replacement_child(pid)
+         @sandbox_mutex.synchronize do
+           return if @shutdown
+
+           # And give its slot to a new worker process
+           slot = @sandboxes.delete(pid)
+           cpid = fork_child_process
+
+           # If we're the parent process, save information about the child
+           log(:warn, "Spawned worker #{cpid} to replace #{pid}")
+           @sandboxes[cpid] = slot
+         end
+       end
+
+       # Returns the child's pid.
+       def fork_child_process
+         fork do
+           yield if block_given?
+           reconnect_each_client
+           after_fork
+           spawn.run
+         end
+       end
+     end
+   end
+ end
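
Given the handlers registered above, a running ForkingWorker parent can be controlled from outside the process. A sketch, with the pid being illustrative:

    # Hypothetical: signal a running ForkingWorker from another process.
    parent_pid = 12345               # pid of the ForkingWorker parent

    Process.kill('USR2', parent_pid) # forwarded to children: pause after current job
    Process.kill('CONT', parent_pid) # forwarded to children: resume taking jobs
    Process.kill('USR1', parent_pid) # children are KILLed immediately (stop!('KILL'))
    Process.kill('QUIT', parent_pid) # graceful: children finish their current jobs
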