qless 0.9.3 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. data/Gemfile +9 -3
  2. data/README.md +70 -25
  3. data/Rakefile +125 -9
  4. data/exe/install_phantomjs +21 -0
  5. data/lib/qless.rb +115 -76
  6. data/lib/qless/config.rb +11 -9
  7. data/lib/qless/failure_formatter.rb +43 -0
  8. data/lib/qless/job.rb +201 -102
  9. data/lib/qless/job_reservers/ordered.rb +7 -1
  10. data/lib/qless/job_reservers/round_robin.rb +16 -6
  11. data/lib/qless/job_reservers/shuffled_round_robin.rb +9 -2
  12. data/lib/qless/lua/qless-lib.lua +2463 -0
  13. data/lib/qless/lua/qless.lua +2012 -0
  14. data/lib/qless/lua_script.rb +63 -12
  15. data/lib/qless/middleware/memory_usage_monitor.rb +62 -0
  16. data/lib/qless/middleware/metriks.rb +45 -0
  17. data/lib/qless/middleware/redis_reconnect.rb +6 -3
  18. data/lib/qless/middleware/requeue_exceptions.rb +94 -0
  19. data/lib/qless/middleware/retry_exceptions.rb +38 -9
  20. data/lib/qless/middleware/sentry.rb +3 -7
  21. data/lib/qless/middleware/timeout.rb +64 -0
  22. data/lib/qless/queue.rb +90 -55
  23. data/lib/qless/server.rb +177 -130
  24. data/lib/qless/server/views/_job.erb +33 -15
  25. data/lib/qless/server/views/completed.erb +11 -0
  26. data/lib/qless/server/views/layout.erb +70 -11
  27. data/lib/qless/server/views/overview.erb +93 -53
  28. data/lib/qless/server/views/queue.erb +9 -8
  29. data/lib/qless/server/views/queues.erb +18 -1
  30. data/lib/qless/subscriber.rb +37 -22
  31. data/lib/qless/tasks.rb +5 -10
  32. data/lib/qless/test_helpers/worker_helpers.rb +55 -0
  33. data/lib/qless/version.rb +3 -1
  34. data/lib/qless/worker.rb +4 -413
  35. data/lib/qless/worker/base.rb +247 -0
  36. data/lib/qless/worker/forking.rb +245 -0
  37. data/lib/qless/worker/serial.rb +41 -0
  38. metadata +135 -52
  39. data/lib/qless/qless-core/cancel.lua +0 -101
  40. data/lib/qless/qless-core/complete.lua +0 -233
  41. data/lib/qless/qless-core/config.lua +0 -56
  42. data/lib/qless/qless-core/depends.lua +0 -65
  43. data/lib/qless/qless-core/deregister_workers.lua +0 -12
  44. data/lib/qless/qless-core/fail.lua +0 -117
  45. data/lib/qless/qless-core/failed.lua +0 -83
  46. data/lib/qless/qless-core/get.lua +0 -37
  47. data/lib/qless/qless-core/heartbeat.lua +0 -51
  48. data/lib/qless/qless-core/jobs.lua +0 -41
  49. data/lib/qless/qless-core/pause.lua +0 -18
  50. data/lib/qless/qless-core/peek.lua +0 -165
  51. data/lib/qless/qless-core/pop.lua +0 -314
  52. data/lib/qless/qless-core/priority.lua +0 -32
  53. data/lib/qless/qless-core/put.lua +0 -169
  54. data/lib/qless/qless-core/qless-lib.lua +0 -2354
  55. data/lib/qless/qless-core/qless.lua +0 -1862
  56. data/lib/qless/qless-core/queues.lua +0 -58
  57. data/lib/qless/qless-core/recur.lua +0 -190
  58. data/lib/qless/qless-core/retry.lua +0 -73
  59. data/lib/qless/qless-core/stats.lua +0 -92
  60. data/lib/qless/qless-core/tag.lua +0 -100
  61. data/lib/qless/qless-core/track.lua +0 -79
  62. data/lib/qless/qless-core/unfail.lua +0 -54
  63. data/lib/qless/qless-core/unpause.lua +0 -12
  64. data/lib/qless/qless-core/workers.lua +0 -69
  65. data/lib/qless/wait_until.rb +0 -19
@@ -0,0 +1,247 @@
1
+ # Encoding: utf-8
2
+
3
+ # Standard stuff
4
+ require 'time'
5
+ require 'logger'
6
+ require 'thread'
7
+
8
+ # Qless requires
9
+ require 'qless'
10
+ require 'qless/subscriber'
11
+
12
+ module Qless
13
+ module Workers
14
+ JobLockLost = Class.new(StandardError)
15
+
16
+ class BaseWorker
17
+ attr_accessor :output, :reserver, :interval, :paused,
18
+ :options, :sighup_handler
19
+
20
+ def initialize(reserver, options = {})
21
+ # Our job reserver and options
22
+ @reserver = reserver
23
+ @options = options
24
+
25
+ # SIGHUP handler
26
+ @sighup_handler = options.fetch(:sighup_handler) { lambda { } }
27
+
28
+ # Our logger
29
+ @log = options.fetch(:logger) do
30
+ @output = options.fetch(:output, $stdout)
31
+ Logger.new(output).tap do |logger|
32
+ logger.level = options.fetch(:log_level, Logger::WARN)
33
+ logger.formatter = options.fetch(:log_formatter) do
34
+ Proc.new { |severity, datetime, progname, msg| "#{datetime}: #{msg}\n" }
35
+ end
36
+ end
37
+ end
38
+
39
+ # The interval for checking for new jobs
40
+ @interval = options.fetch(:interval, 5.0)
41
+ @current_job_mutex = Mutex.new
42
+ @current_job = nil
43
+
44
+ # Default behavior when a lock is lost: stop after the current job.
45
+ on_current_job_lock_lost { shutdown(in_signal_handler=false) }
46
+ end
47
+
48
+ def log_level
49
+ @log.level
50
+ end
51
+
52
+ def safe_trap(signal_name, &cblock)
53
+ begin
54
+ trap(signal_name, cblock)
55
+ rescue ArgumentError
56
+ warn "Signal #{signal_name} not supported."
57
+ end
58
+ end
59
+
60
+ # The meaning of these signals is meant to closely mirror resque
61
+ #
62
+ # TERM: Shutdown immediately, stop processing jobs.
63
+ # INT: Shutdown immediately, stop processing jobs.
64
+ # QUIT: Shutdown after the current job has finished processing.
65
+ # USR1: Kill the forked children immediately, continue processing jobs.
66
+ # USR2: Pause after this job
67
+ # CONT: Start processing jobs again after a USR2
68
+ # HUP: Print current stack to log and continue
69
+ def register_signal_handlers
70
+ # Otherwise, we want to take the appropriate action
71
+ trap('TERM') { exit! }
72
+ trap('INT') { exit! }
73
+ safe_trap('HUP') { sighup_handler.call }
74
+ safe_trap('QUIT') { shutdown(in_signal_handler=true) }
75
+ begin
76
+ trap('CONT') { unpause(in_signal_handler=true) }
77
+ trap('USR2') { pause(in_signal_handler=true) }
78
+ rescue ArgumentError
79
+ warn 'Signals USR2, and/or CONT not supported.'
80
+ end
81
+ end
82
+
83
+ # Return an enumerator to each of the jobs provided by the reserver
84
+ def jobs
85
+ return Enumerator.new do |enum|
86
+ loop do
87
+ begin
88
+ job = reserver.reserve
89
+ rescue Exception => error
90
+ # We want workers to durably stay up, so we don't want errors
91
+ # during job reserving (e.g. network timeouts, etc) to kill the
92
+ # worker.
93
+ log(:error,
94
+ "Error reserving job: #{error.class}: #{error.message}")
95
+ end
96
+
97
+ # If we ended up getting a job, yield it. Otherwise, we wait
98
+ if job.nil?
99
+ no_job_available
100
+ else
101
+ self.current_job = job
102
+ enum.yield(job)
103
+ self.current_job = nil
104
+ end
105
+
106
+ break if @shutdown
107
+ end
108
+ end
109
+ end
110
+
111
+ # Actually perform the job
112
+ def perform(job)
113
+ around_perform(job)
114
+ rescue JobLockLost
115
+ log(:warn, "Lost lock for job #{job.jid}")
116
+ rescue Exception => error
117
+ fail_job(job, error, caller)
118
+ else
119
+ try_complete(job)
120
+ end
121
+
122
+ # Allow middleware modules to be mixed in and override the
123
+ # definition of around_perform while providing a default
124
+ # implementation so our code can assume the method is present.
125
+ module SupportsMiddlewareModules
126
+ def around_perform(job)
127
+ job.perform
128
+ end
129
+
130
+ def after_fork
131
+ end
132
+ end
133
+
134
+ include SupportsMiddlewareModules
135
+
136
+ # Stop processing after this job
137
+ def shutdown(in_signal_handler=true)
138
+ @shutdown = true
139
+ end
140
+ alias stop! shutdown # so we can call `stop!` regardless of the worker type
141
+
142
+ # Pause the worker -- take no more new jobs
143
+ def pause(in_signal_handler=true)
144
+ @paused = true
145
+ procline("Paused -- #{reserver.description}", in_signal_handler=in_signal_handler)
146
+ end
147
+
148
+ # Continue taking new jobs
149
+ def unpause(in_signal_handler=true)
150
+ @paused = false
151
+ end
152
+
153
+ # Set the proceline. Not supported on all systems
154
+ def procline(value, in_signal_handler=true)
155
+ $0 = "Qless-#{Qless::VERSION}: #{value} at #{Time.now.iso8601}"
156
+ log(:debug, $PROGRAM_NAME) unless in_signal_handler
157
+ end
158
+
159
+ # Complete the job unless the worker has already put it into another state
160
+ # by completing / failing / etc. the job
161
+ def try_complete(job)
162
+ job.complete unless job.state_changed?
163
+ rescue Job::CantCompleteError => e
164
+ # There's not much we can do here. Complete fails in a few cases:
165
+ # - The job is already failed (i.e. by another worker)
166
+ # - The job is being worked on by another worker
167
+ # - The job has been cancelled
168
+ #
169
+ # We don't want to (or are able to) fail the job with this error in
170
+ # any of these cases, so the best we can do is log the failure.
171
+ log(:error, "Failed to complete #{job.inspect}: #{e.message}")
172
+ end
173
+
174
+ def fail_job(job, error, worker_backtrace)
175
+ failure = Qless.failure_formatter.format(job, error, worker_backtrace)
176
+ log(:error, "Got #{failure.group} failure from #{job.inspect}\n#{failure.message}" )
177
+ job.fail(*failure)
178
+ rescue Job::CantFailError => e
179
+ # There's not much we can do here. Another worker may have cancelled it,
180
+ # or we might not own the job, etc. Logging is the best we can do.
181
+ log(:error, "Failed to fail #{job.inspect}: #{e.message}")
182
+ end
183
+
184
+ def deregister
185
+ uniq_clients.each do |client|
186
+ client.deregister_workers(client.worker_name)
187
+ end
188
+ end
189
+
190
+ def uniq_clients
191
+ @uniq_clients ||= reserver.queues.map(&:client).uniq
192
+ end
193
+
194
+ def on_current_job_lock_lost(&block)
195
+ @on_current_job_lock_lost = block
196
+ end
197
+
198
+ def listen_for_lost_lock
199
+ subscribers = uniq_clients.map do |client|
200
+ Subscriber.start(client, "ql:w:#{client.worker_name}", log: @log) do |_, message|
201
+ if message['event'] == 'lock_lost'
202
+ with_current_job do |job|
203
+ if job && message['jid'] == job.jid
204
+ @on_current_job_lock_lost.call(job)
205
+ end
206
+ end
207
+ end
208
+ end
209
+ end
210
+
211
+ yield
212
+ ensure
213
+ subscribers.each(&:stop)
214
+ end
215
+
216
+ private
217
+
218
+ def log(type, msg)
219
+ @log.public_send(type, "#{Process.pid}: #{msg}")
220
+ end
221
+
222
+ def no_job_available
223
+ unless interval.zero?
224
+ procline("Waiting for #{reserver.description}", in_signal_handler=false)
225
+ log(:debug, "Sleeping for #{interval} seconds")
226
+ sleep interval
227
+ end
228
+ end
229
+
230
+ def with_current_job
231
+ @current_job_mutex.synchronize do
232
+ yield @current_job
233
+ end
234
+ end
235
+
236
+ def current_job=(job)
237
+ @current_job_mutex.synchronize do
238
+ @current_job = job
239
+ end
240
+ end
241
+
242
+ def reconnect_each_client
243
+ uniq_clients.each { |client| client.redis.client.reconnect }
244
+ end
245
+ end
246
+ end
247
+ end
@@ -0,0 +1,245 @@
1
+ # Encoding: utf-8
2
+
3
+ # Qless requires
4
+ require 'qless'
5
+ require 'qless/worker/base'
6
+ require 'qless/worker/serial'
7
+ require 'thread'
8
+
9
module Qless
  module Workers
    # A worker that forks a configurable number of child processes, each
    # running a SerialWorker, and supervises them: respawning children
    # that die and forwarding signals to them.
    class ForkingWorker < BaseWorker
      # The child startup interval
      attr_accessor :max_startup_interval

      def initialize(reserver, options = {})
        super(reserver, options)
        # The keys are the child PIDs, the values are information about the
        # worker, including its sandbox directory. This directory currently
        # isn't used, but this sets up for having that eventually.
        @sandboxes = {}

        # Save our options for starting children
        @options = options

        # The max interval between when children start (reduces thundering herd)
        @max_startup_interval = options[:max_startup_interval] || 10.0

        # TODO: facter to figure out how many cores we have
        @num_workers = options[:num_workers] || 1

        # All the modules that have been applied to this worker
        @modules = []

        @sandbox_mutex = Mutex.new
        # A queue of blocks that are postponed since we cannot get
        # @sandbox_mutex in trap handler
        @postponed_actions_queue = ::Queue.new
      end

      # Because we spawn a new worker, we need to apply all the modules that
      # extend this one
      def extend(mod)
        @modules << mod
        super(mod)
      end

      # Spawn a new child worker
      def spawn
        worker = SerialWorker.new(reserver, @options)
        # We use 11 as the exit status so that it is something unique
        # (rather than the common 1). Plus, 11 looks a little like
        # ll (i.e. "Lock Lost").
        worker.on_current_job_lock_lost { |job| exit!(11) }
        @modules.each { |mod| worker.extend(mod) }
        worker
      end

      # If @sandbox_mutex is free, execute block immediately.
      # Otherwise, postpone it until handling is possible
      def contention_aware_handler(&block)
        if @sandbox_mutex.try_lock
          block.call
          @sandbox_mutex.unlock
        else
          @postponed_actions_queue << block
        end
      end

      # Process any signals (such as TERM) that could not be processed
      # immediately due to @sandbox_mutex being in use
      def process_postponed_actions
        until @postponed_actions_queue.empty?
          # It's possible a signal interrupted us between the empty?
          # and shift calls, but it could have only added more things
          # into @postponed_actions_queue
          block = @postponed_actions_queue.shift(true)
          @sandbox_mutex.synchronize do
            block.call
          end
        end
      end

      # Register our handling of signals
      def register_signal_handlers
        # If we're the parent process, we mostly want to forward the signals on
        # to the child processes. It's just that sometimes we want to wait for
        # them and then exit
        trap('TERM') do
          contention_aware_handler { stop!('TERM', in_signal_handler=true); exit }
        end
        trap('INT') do
          contention_aware_handler { stop!('INT', in_signal_handler=true); exit }
        end
        safe_trap('HUP') { sighup_handler.call }
        safe_trap('QUIT') do
          contention_aware_handler { stop!('QUIT', in_signal_handler=true); exit }
        end
        safe_trap('USR1') do
          contention_aware_handler { stop!('KILL', in_signal_handler=true) }
        end
        begin
          trap('CONT') { stop('CONT', in_signal_handler=true) }
          trap('USR2') { stop('USR2', in_signal_handler=true) }
        rescue ArgumentError
          warn 'Signals USR2, and/or CONT not supported.'
        end
      end

      # Run this worker
      def run
        startup_sandboxes

        # Now keep an eye on our child processes, spawn replacements as needed
        loop do
          begin
            # Don't wait on any processes if we're already in shutdown mode.
            break if @shutdown

            # Wait for any child to kick the bucket
            pid, status = Process.wait2
            # NOTE(review): stopsig is only set for *stopped* children;
            # termsig may be the intended accessor for a terminated child --
            # confirm before changing, this only affects the log message.
            code, sig = status.exitstatus, status.stopsig
            log(:warn,
                "Worker process #{pid} died with #{code} from signal (#{sig})")

            # allow our shutdown logic (called from a separate thread) to take effect.
            break if @shutdown

            spawn_replacement_child(pid)
            process_postponed_actions
          rescue SystemCallError => e
            log(:error, "Failed to wait for child process: #{e.inspect}")
            # If we're shutting down, the loop above will exit
            exit! unless @shutdown
          end
        end
      end

      # Returns a list of each of the child pids
      def children
        @sandboxes.keys
      end

      # Signal all the children
      def stop(signal = 'QUIT', in_signal_handler=true)
        log(:warn, "Sending #{signal} to children") unless in_signal_handler
        children.each do |pid|
          begin
            Process.kill(signal, pid)
          rescue Errno::ESRCH
            # no such process -- means the process has already died.
          end
        end
      end

      # Signal all the children and wait for them to exit.
      # Should only be called when we have the lock on @sandbox_mutex
      def stop!(signal = 'QUIT', in_signal_handler=true)
        shutdown(in_signal_handler=in_signal_handler)
        shutdown_sandboxes(signal, in_signal_handler=in_signal_handler)
      end

      private

      # Fork the initial set of child workers, staggering their startup to
      # avoid a thundering herd against redis.
      def startup_sandboxes
        # Make sure we respond to signals correctly
        register_signal_handlers

        log(:debug, "Starting to run with #{@num_workers} workers")
        @num_workers.times do |i|
          slot = {
            worker_id: i,
            sandbox: nil
          }

          cpid = fork_child_process do
            # Wait for a bit to calm the thundering herd
            sleep(rand(max_startup_interval)) if max_startup_interval > 0
          end

          # If we're the parent process, save information about the child
          log(:info, "Spawned worker #{cpid}")
          @sandboxes[cpid] = slot
        end
      end

      # Should only be called when we have a lock on @sandbox_mutex
      def shutdown_sandboxes(signal, in_signal_handler=true)
        # First, send the signal
        stop(signal, in_signal_handler=in_signal_handler)

        # Wait for each of our children
        log(:warn, 'Waiting for child processes') unless in_signal_handler

        until @sandboxes.empty?
          begin
            pid, _ = Process.wait2
            log(:warn, "Child #{pid} stopped") unless in_signal_handler
            @sandboxes.delete(pid)
          rescue SystemCallError
            break
          end
        end

        unless in_signal_handler
          log(:warn, 'All children have stopped')

          # If there were any children processes we couldn't wait for, log it
          @sandboxes.keys.each do |cpid|
            log(:warn, "Could not wait for child #{cpid}")
          end
        end

        @sandboxes.clear
      end

      # Fork a replacement for a dead child, giving it the dead child's slot.
      def spawn_replacement_child(pid)
        @sandbox_mutex.synchronize do
          return if @shutdown

          # And give its slot to a new worker process
          slot = @sandboxes.delete(pid)
          cpid = fork_child_process

          # If we're the parent process, save information about the child
          log(:warn, "Spawned worker #{cpid} to replace #{pid}")
          @sandboxes[cpid] = slot
        end
      end

      # Fork a child that reconnects redis clients and runs a fresh
      # SerialWorker; returns the child's pid.
      def fork_child_process
        fork do
          yield if block_given?
          reconnect_each_client
          after_fork
          spawn.run
        end
      end
    end
  end
end