qless 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/README.md +42 -3
- data/Rakefile +26 -2
- data/{bin → exe}/qless-web +3 -2
- data/lib/qless.rb +55 -28
- data/lib/qless/config.rb +1 -3
- data/lib/qless/job.rb +127 -22
- data/lib/qless/job_reservers/round_robin.rb +3 -1
- data/lib/qless/job_reservers/shuffled_round_robin.rb +14 -0
- data/lib/qless/lua_script.rb +42 -0
- data/lib/qless/middleware/redis_reconnect.rb +24 -0
- data/lib/qless/middleware/retry_exceptions.rb +43 -0
- data/lib/qless/middleware/sentry.rb +70 -0
- data/lib/qless/qless-core/cancel.lua +89 -59
- data/lib/qless/qless-core/complete.lua +16 -1
- data/lib/qless/qless-core/config.lua +12 -0
- data/lib/qless/qless-core/deregister_workers.lua +12 -0
- data/lib/qless/qless-core/fail.lua +24 -14
- data/lib/qless/qless-core/heartbeat.lua +2 -1
- data/lib/qless/qless-core/pause.lua +18 -0
- data/lib/qless/qless-core/pop.lua +24 -3
- data/lib/qless/qless-core/put.lua +14 -1
- data/lib/qless/qless-core/qless-lib.lua +2354 -0
- data/lib/qless/qless-core/qless.lua +1862 -0
- data/lib/qless/qless-core/retry.lua +1 -1
- data/lib/qless/qless-core/unfail.lua +54 -0
- data/lib/qless/qless-core/unpause.lua +12 -0
- data/lib/qless/queue.rb +45 -21
- data/lib/qless/server.rb +38 -39
- data/lib/qless/server/static/css/docs.css +21 -1
- data/lib/qless/server/views/_job.erb +5 -5
- data/lib/qless/server/views/overview.erb +14 -9
- data/lib/qless/subscriber.rb +48 -0
- data/lib/qless/version.rb +1 -1
- data/lib/qless/wait_until.rb +19 -0
- data/lib/qless/worker.rb +243 -33
- metadata +49 -30
- data/bin/install_phantomjs +0 -7
- data/bin/qless-campfire +0 -106
- data/bin/qless-growl +0 -99
- data/lib/qless/lua.rb +0 -25
data/lib/qless/server/static/css/docs.css CHANGED

```diff
@@ -12,7 +12,6 @@ body {
   position: relative;
   padding-top: 90px;
   background-color: #fff;
-  background-image: url(../img/grid-18px-masked.png);
   background-repeat: repeat-x;
   background-position: 0 40px;
 }
@@ -817,3 +816,24 @@ form.well {
 }
 
 }
+
+/* For proper failed job display*/
+.l-sidebyside {
+  overflow:auto;
+}
+
+.l-sidebyside > * {
+  display:inline-block;
+  *display: inline;
+  *zoom: 1;
+}
+
+.failed-job > .min-col-size {
+  min-width: 395px;
+}
+
+.failed-job .row {
+  margin-left:0;
+  margin-right:10px;
+  height: 30px;
+}
```
data/lib/qless/server/views/_job.erb CHANGED

```diff
@@ -112,12 +112,12 @@
       <div class="span6">
         <h3><small>History</small></h3>
         <div style="overflow-y:scroll; height: 200px">
-          <% job.
+          <% job.queue_history.reverse.each do |h| %>
           <pre><strong><%= h['q'] %></strong>
-Put: <%= strftime(
-Pop: <%= strftime(
-Completed: <%= strftime(
-Failed: <%= strftime(
+Put: <%= strftime(h['put']) %><% if not h['popped'].nil? %>
+Pop: <%= strftime(h['popped']) %> by <%= h['worker'] %><% end %><% if not h['completed'].nil? %>
+Completed: <%= strftime(h['completed']) %><% end %><% if not h['failed'].nil? %>
+Failed: <%= strftime(h['failed']) %><% end %></pre>
          <% end %>
        </div>
      </div>
```
data/lib/qless/server/views/overview.erb CHANGED

```diff
@@ -18,7 +18,7 @@
       <%= queue['waiting'] %> /
       <%= queue['scheduled'] %> /
       <%= queue['stalled'] %> /
-      <%= queue['depends'] %> /
+      <%= queue['depends'] %> /
       <%= queue['recurring'] %> <small>(running / waiting / scheduled / stalled / depends / recurring)</small>
     </h3>
   </div>
@@ -34,16 +34,22 @@
   <div class="page-header">
     <h1>Failed Jobs <small>D'oh!</small></h1>
   </div>
-
-
-
-  <
+  <div class="l-sidebyside failed-job">
+    <div class="min-col-size">
+      <% failed.sort_by { |t, count| -count }.each do |t, count| %>
+        <div class="row">
+          <h3><a href="<%= u "/failed/#{t}" %>"><%= t %></a></h3>
+        </div>
+      <% end %>
     </div>
-    <div
-
+    <div>
+      <% failed.sort_by { |t, count| -count }.each do |t, count| %>
+        <div class="row">
+          <h3>| <%= count %> <small>Jobs</small></h3>
+        </div>
+      <% end %>
     </div>
   </div>
-  <% end %>
   <% end %>
 
   <% if tracked['jobs'].empty? %>
@@ -67,7 +73,6 @@
       <% end %>
     <% end %>
 
-
     <% if workers.empty? %>
     <div class="page-header">
       <h1>No Workers <small>Nobody's doin' nothin'!</small></h1>
```
data/lib/qless/subscriber.rb

```diff
@@ -0,0 +1,48 @@
+require 'thread'
+require 'qless/wait_until'
+
+module Qless
+  class Subscriber
+    def self.start(*args, &block)
+      new(*args, &block).start_pub_sub_listener
+    end
+
+    attr_reader :client, :channel
+
+    def initialize(client, channel, &message_received_callback)
+      @client = client
+      @channel = channel
+      @message_received_callback = message_received_callback
+
+      # pub/sub blocks the connection so we must use a different redis connection
+      @client_redis = client.redis
+      @listener_redis = client.new_redis_connection
+
+      @my_channel = Qless.generate_jid
+    end
+
+    def start_pub_sub_listener
+      @thread = ::Thread.start do
+        @listener_redis.subscribe(channel, @my_channel) do |on|
+          on.message do |_channel, message|
+            if _channel == @my_channel
+              @listener_redis.unsubscribe(@my_channel)
+            else
+              @message_received_callback.call(self, JSON.parse(message))
+            end
+          end
+        end
+      end
+
+      wait_until_thread_listening
+    end
+
+    def wait_until_thread_listening
+      Qless::WaitUntil.wait_until(10) do
+        @client_redis.publish(@my_channel, 'disconnect') == 1
+      end
+    end
+  end
+end
+
+
```
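As a point of reference, here is a minimal, hypothetical sketch (not part of the diff) of how the new Subscriber can be used. It assumes a locally reachable Redis and a plain Qless::Client; the channel name and payload are made up.

```ruby
require 'qless'
require 'qless/subscriber'
require 'json'

client = Qless::Client.new # defaults to the local Redis

# The callback runs on a background thread and receives the JSON-parsed payload.
Qless::Subscriber.start(client, 'example:channel') do |subscriber, message|
  puts "received: #{message.inspect}"
end

# Publishing JSON to the channel from any other connection fires the callback.
client.redis.publish('example:channel', JSON.dump('event' => 'ping'))
sleep 0.1 # give the listener thread a moment before the script exits
```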
data/lib/qless/version.rb CHANGED

data/lib/qless/wait_until.rb

```diff
@@ -0,0 +1,19 @@
+module Qless
+  module WaitUntil
+    TimeoutError = Class.new(StandardError)
+
+    def wait_until(timeout)
+      timeout_at = Time.now + timeout
+
+      loop do
+        return if yield
+        sleep 0.002
+        if Time.now > timeout_at
+          raise TimeoutError, "Timed out after #{timeout} seconds"
+        end
+      end
+    end
+
+    module_function :wait_until
+  end
+end
```
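A small usage sketch of the new helper (the readiness condition is hypothetical): wait_until polls the block roughly every 2 ms and raises Qless::WaitUntil::TimeoutError if it never returns truthy within the given number of seconds.

```ruby
require 'qless/wait_until'

# Block until a (hypothetical) flag file shows up, or fail after 2 seconds.
Qless::WaitUntil.wait_until(2) do
  File.exist?('/tmp/ready.flag')
end
```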
data/lib/qless/worker.rb CHANGED

```diff
@@ -2,19 +2,25 @@ require 'qless'
 require 'time'
 require 'qless/job_reservers/ordered'
 require 'qless/job_reservers/round_robin'
+require 'qless/job_reservers/shuffled_round_robin'
+require 'qless/subscriber'
+require 'qless/wait_until'
 
 module Qless
   # This is heavily inspired by Resque's excellent worker:
   # https://github.com/defunkt/resque/blob/v1.20.0/lib/resque/worker.rb
   class Worker
-    def initialize(
-
+    def initialize(job_reserver, options = {})
+      self.job_reserver = job_reserver
       @shutdown = @paused = false
 
       self.very_verbose = options[:very_verbose]
       self.verbose = options[:verbose]
       self.run_as_single_process = options[:run_as_single_process]
       self.output = options.fetch(:output, $stdout)
+      self.term_timeout = options.fetch(:term_timeout, 4.0)
+      @backtrace_replacements = { Dir.pwd => '.' }
+      @backtrace_replacements[ENV['GEM_HOME']] = '<GEM_HOME>' if ENV.has_key?('GEM_HOME')
 
       output.puts "\n\n\n" if verbose || very_verbose
       log "Instantiated Worker"
```
```diff
@@ -35,6 +41,13 @@ module Qless
     # Defaults to $stdout.
     attr_accessor :output
 
+    # The object responsible for reserving jobs from the Qless server,
+    # using some reasonable strategy (e.g. round robin or ordered)
+    attr_accessor :job_reserver
+
+    # How long the child process is given to exit before forcibly killing it.
+    attr_accessor :term_timeout
+
     # Starts a worker based on ENV vars. Supported ENV vars:
     # - REDIS_URL=redis://host:port/db-num (the redis gem uses this automatically)
     # - QUEUES=high,medium,low or QUEUE=blah
```
```diff
@@ -59,18 +72,22 @@ module Qless
       options[:very_verbose] = !!ENV['VVERBOSE']
       options[:run_as_single_process] = !!ENV['RUN_AS_SINGLE_PROCESS']
 
-      new(
+      new(reserver, options).work(interval)
     end
 
     def work(interval = 5.0)
       procline "Starting #{@job_reserver.description}"
-
+      register_parent_signal_handlers
+      uniq_clients.each { |client| start_parent_pub_sub_listener_for(client) }
 
       loop do
         break if shutdown?
-
+        if paused?
+          sleep interval
+          next
+        end
 
-        unless job =
+        unless job = reserve_job
           break if interval.zero?
           procline "Waiting for #{@job_reserver.description}"
           log! "Sleeping for #{interval} seconds"
```
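The ENV-driven starter above builds the reserver and options itself. A minimal sketch of driving it that way, assuming the class-level method is Worker.start as in the released gem (the variable values are examples; REDIS_URL and QUEUES are the ones shown in this hunk):

```ruby
require 'qless/worker'

ENV['REDIS_URL'] = 'redis://localhost:6379/0'
ENV['QUEUES']    = 'high,medium,low'

# Reads its configuration from ENV, builds a reserver, and blocks working jobs.
Qless::Worker.start
```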
```diff
@@ -78,31 +95,48 @@ module Qless
           next
         end
 
-
-
-        if run_as_single_process
-          # We're staying in the same process
-          procline "Single processing #{job.description}"
-          perform(job)
-        elsif @child = fork
-          # We're in the parent process
-          procline "Forked #{@child} for #{job.description}"
-          Process.wait(@child)
-        else
-          # We're in the child process
-          procline "Processing #{job.description}"
-          perform(job)
-          exit!
-        end
+        perform_job_in_child_process(job)
       end
+    ensure
+      # make sure the worker deregisters on shutdown
+      deregister
     end
 
     def perform(job)
       around_perform(job)
     rescue Exception => error
-      fail_job(job, error)
+      fail_job(job, error, caller)
     else
-      job
+      try_complete(job)
+    end
+
+    def reserve_job
+      @job_reserver.reserve
+    rescue Exception => error
+      # We want workers to durably stay up, so we don't want errors
+      # during job reserving (e.g. network timeouts, etc) to kill
+      # the worker.
+      log "Got an error while reserving a job: #{error.class}: #{error.message}"
+    end
+
+    def perform_job_in_child_process(job)
+      with_job(job) do
+        @child = fork do
+          job.reconnect_to_redis
+          register_child_signal_handlers
+          start_child_pub_sub_listener_for(job.client)
+          procline "Processing #{job.description}"
+          perform(job)
+          exit! # don't run at_exit hooks
+        end
+
+        if @child
+          wait_for_child
+        else
+          procline "Single processing #{job.description}"
+          perform(job)
+        end
+      end
     end
 
     def shutdown
```
```diff
@@ -135,6 +169,33 @@ module Qless
 
     private
 
+    def fork
+      super unless run_as_single_process
+    end
+
+    def deregister
+      uniq_clients.each do |client|
+        client.deregister_workers(Qless.worker_name)
+      end
+    end
+
+    def uniq_clients
+      @uniq_clients ||= @job_reserver.queues.map(&:client).uniq
+    end
+
+    def try_complete(job)
+      job.complete unless job.state_changed?
+    rescue Job::CantCompleteError => e
+      # There's not much we can do here. Complete fails in a few cases:
+      #   - The job is already failed (i.e. by another worker)
+      #   - The job is being worked on by another worker
+      #   - The job has been cancelled
+      #
+      # We don't want to (or are able to) fail the job with this error in
+      # any of these cases, so the best we can do is log the failure.
+      log "Failed to complete #{job.inspect}: #{e.message}"
+    end
+
     # Allow middleware modules to be mixed in and override the
     # definition of around_perform while providing a default
     # implementation so our code can assume the method is present.
```
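Since try_complete now completes the job after perform unless its state has already changed (for example it was failed, retried, or completed inside perform), job classes no longer need an explicit job.complete call. A minimal, hypothetical job class sketch under that behavior:

```ruby
class ImageResizeJob
  def self.perform(job)
    width  = job.data['width']  # job data supplied at enqueue time
    height = job.data['height']
    # ... do the actual resizing work here ...
    # No job.complete needed; the worker completes the job afterwards.
  end
end
```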
```diff
@@ -144,47 +205,120 @@ module Qless
       end
     }
 
-    def fail_job(job, error)
-      group = "#{job.
-      message = "#{error
+    def fail_job(job, error, worker_backtrace)
+      group = "#{job.klass_name}:#{error.class}"
+      message = "#{truncated_message(error)}\n\n#{format_failure_backtrace(error.backtrace, worker_backtrace)}"
       log "Got #{group} failure from #{job.inspect}"
       job.fail(group, message)
     end
 
+    # TODO: pull this out into a config option.
+    MAX_ERROR_MESSAGE_SIZE = 10_000
+    def truncated_message(error)
+      return error.message if error.message.length <= MAX_ERROR_MESSAGE_SIZE
+      error.message.slice(0, MAX_ERROR_MESSAGE_SIZE) + "... (truncated due to length)"
+    end
+
+    def format_failure_backtrace(error_backtrace, worker_backtrace)
+      (error_backtrace - worker_backtrace).map do |line|
+        @backtrace_replacements.inject(line) do |line, (original, new)|
+          line.sub(original, new)
+        end
+      end.join("\n")
+    end
+
     def procline(value)
       $0 = "Qless-#{Qless::VERSION}: #{value} at #{Time.now.iso8601}"
       log! $0
     end
 
+    def wait_for_child
+      srand # Reseeding
+      procline "Forked #{@child} at #{Time.now.to_i}"
+      begin
+        Process.waitpid(@child)
+      rescue SystemCallError
+        nil
+      end
+    end
+
+    # Kills the forked child immediately with minimal remorse. The job it
+    # is processing will not be completed. Send the child a TERM signal,
+    # wait 5 seconds, and then a KILL signal if it has not quit
     def kill_child
       return unless @child
-
-      Process.
+
+      if Process.waitpid(@child, Process::WNOHANG)
+        log "Child #{@child} already quit."
+        return
+      end
+
+      signal_child("TERM", @child)
+
+      signal_child("KILL", @child) unless quit_gracefully?(@child)
+    rescue SystemCallError
+      log "Child #{@child} already quit and reaped."
+    end
+
+    # send a signal to a child, have it logged.
+    def signal_child(signal, child)
+      log "Sending #{signal} signal to child #{child}"
+      Process.kill(signal, child)
+    end
+
+    # has our child quit gracefully within the timeout limit?
+    def quit_gracefully?(child)
+      (term_timeout.to_f * 10).round.times do |i|
+        sleep(0.1)
+        return true if Process.waitpid(child, Process::WNOHANG)
+      end
+
+      false
     end
 
-    # This
+    # This was originally stolen directly from resque... (thanks, @defunkt!)
     # Registers the various signal handlers a worker responds to.
     #
     # TERM: Shutdown immediately, stop processing jobs.
     # INT: Shutdown immediately, stop processing jobs.
     # QUIT: Shutdown after the current job has finished processing.
     # USR1: Kill the forked child immediately, continue processing jobs.
-    # USR2: Don't process any new jobs
+    # USR2: Don't process any new jobs; dump the backtrace.
    # CONT: Start processing jobs again after a USR2
-    def
+    def register_parent_signal_handlers
       trap('TERM') { shutdown! }
       trap('INT') { shutdown! }
 
       begin
         trap('QUIT') { shutdown }
         trap('USR1') { kill_child }
-        trap('USR2')
+        trap('USR2') do
+          log "Current backtrace (parent): \n\n#{caller.join("\n")}\n\n"
+          pause_processing
+        end
+
         trap('CONT') { unpause_processing }
       rescue ArgumentError
         warn "Signals QUIT, USR1, USR2, and/or CONT not supported."
       end
     end
 
+    def register_child_signal_handlers
+      trap('TERM') { raise SignalException.new("SIGTERM") }
+      trap('INT', 'DEFAULT')
+
+      begin
+        trap('QUIT', 'DEFAULT')
+        trap('USR1', 'DEFAULT')
+        trap('USR2', 'DEFAULT')
+
+        trap('USR2') do
+          log "Current backtrace (child): \n\n#{caller.join("\n")}\n\n"
+        end
+      rescue ArgumentError
+      end
+    end
+
     # Log a message to STDOUT if we are verbose or very_verbose.
     def log(message)
       if verbose
```
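The parent's signal handling can be exercised from outside the gem. A hypothetical operator snippet (the pid-file path is made up) based on the traps above: USR2 logs the parent backtrace and pauses processing, CONT resumes it.

```ruby
# Pause a running qless worker during maintenance, then resume it.
worker_pid = Integer(File.read('/var/run/qless-worker.pid').strip) # hypothetical pid file

Process.kill('USR2', worker_pid) # dump the parent backtrace to the log and pause
sleep 60                         # ... perform maintenance ...
Process.kill('CONT', worker_pid) # resume processing jobs
```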
```diff
@@ -199,6 +333,82 @@ module Qless
     def log!(message)
       log message if very_verbose
     end
+
+    def start_parent_pub_sub_listener_for(client)
+      Subscriber.start(client, "ql:w:#{Qless.worker_name}") do |subscriber, message|
+        if message["event"] == "lock_lost" && message["jid"] == current_job_jid
+          fail_job_due_to_timeout
+          kill_child
+        end
+      end
+    end
+
+    def start_child_pub_sub_listener_for(client)
+      Subscriber.start(client, "ql:w:#{Qless.worker_name}:#{Process.pid}") do |subscriber, message|
+        if message["event"] == "notify_backtrace"
+          notify_parent_of_job_backtrace(client, message.fetch('notify_list'))
+        end
+      end
+    end
+
+    def with_job(job)
+      @job = job
+      yield
+    ensure
+      @job = nil
+    end
+
+    # To prevent race conditions (with our listener thread),
+    # we cannot use a pattern like `use(@job) if @job` because
+    # the value of `@job` could change between the checking of
+    # it and the use of it. Here we use a pattern that avoids
+    # the issue -- get the job into a local, and yield that if
+    # it is set.
+    def access_current_job
+      if job = @job
+        yield job
+      end
+    end
+
+    def current_job_jid
+      access_current_job &:jid
+    end
+
+    JobLockLost = Class.new(StandardError)
+
+    def fail_job_due_to_timeout
+      access_current_job do |job|
+        error = JobLockLost.new
+        error.set_backtrace(get_backtrace_from_child(job.client.redis))
+        fail_job(job, error, caller)
+      end
+    end
+
+    def notify_parent_of_job_backtrace(client, list)
+      job_backtrace = Thread.main.backtrace
+      client.redis.lpush list, JSON.dump(job_backtrace)
+      client.redis.pexpire list, BACKTRACE_EXPIRATION_TIMEOUT_MS
+    end
+
+    WAIT_FOR_CHILD_BACKTRACE_TIMEOUT = 4
+    BACKTRACE_EXPIRATION_TIMEOUT_MS = 60_000 # timeout after a minute
+
+    def get_backtrace_from_child(child_redis)
+      notification_list = "ql:child_backtraces:#{Qless.generate_jid}"
+      request_backtrace = { "event" => "notify_backtrace",
+                            "notify_list" => notification_list }
+
+      if child_redis.publish("ql:w:#{Qless.worker_name}:#{@child}", JSON.dump(request_backtrace)).zero?
+        return ["Could not obtain child backtrace since it was not listening."]
+      end
+
+      begin
+        _, backtrace_json = child_redis.blpop(notification_list, WAIT_FOR_CHILD_BACKTRACE_TIMEOUT)
+        JSON.parse(backtrace_json)
+      rescue => e
+        ["Could not obtain child backtrace: #{e.class}: #{e.message}"] + e.backtrace
+      end
+    end
   end
 end
 
```
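For illustration of the lock-lost handling, a rough, hypothetical simulation of the message the parent listener reacts to (the worker name and jid are made up; in a real deployment the notification comes from qless-core when a job's lock is handed to another worker). On a matching jid, the parent fails the job with JobLockLost and kills its child.

```ruby
require 'redis'
require 'json'

redis = Redis.new
payload = { 'event' => 'lock_lost', 'jid' => 'deadbeef1234' }

# The parent worker subscribes to "ql:w:<worker name>".
redis.publish('ql:w:example-host-4321', JSON.dump(payload))
```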