qless 0.9.2 → 0.9.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -0
- data/README.md +42 -3
- data/Rakefile +26 -2
- data/{bin → exe}/qless-web +3 -2
- data/lib/qless.rb +55 -28
- data/lib/qless/config.rb +1 -3
- data/lib/qless/job.rb +127 -22
- data/lib/qless/job_reservers/round_robin.rb +3 -1
- data/lib/qless/job_reservers/shuffled_round_robin.rb +14 -0
- data/lib/qless/lua_script.rb +42 -0
- data/lib/qless/middleware/redis_reconnect.rb +24 -0
- data/lib/qless/middleware/retry_exceptions.rb +43 -0
- data/lib/qless/middleware/sentry.rb +70 -0
- data/lib/qless/qless-core/cancel.lua +89 -59
- data/lib/qless/qless-core/complete.lua +16 -1
- data/lib/qless/qless-core/config.lua +12 -0
- data/lib/qless/qless-core/deregister_workers.lua +12 -0
- data/lib/qless/qless-core/fail.lua +24 -14
- data/lib/qless/qless-core/heartbeat.lua +2 -1
- data/lib/qless/qless-core/pause.lua +18 -0
- data/lib/qless/qless-core/pop.lua +24 -3
- data/lib/qless/qless-core/put.lua +14 -1
- data/lib/qless/qless-core/qless-lib.lua +2354 -0
- data/lib/qless/qless-core/qless.lua +1862 -0
- data/lib/qless/qless-core/retry.lua +1 -1
- data/lib/qless/qless-core/unfail.lua +54 -0
- data/lib/qless/qless-core/unpause.lua +12 -0
- data/lib/qless/queue.rb +45 -21
- data/lib/qless/server.rb +38 -39
- data/lib/qless/server/static/css/docs.css +21 -1
- data/lib/qless/server/views/_job.erb +5 -5
- data/lib/qless/server/views/overview.erb +14 -9
- data/lib/qless/subscriber.rb +48 -0
- data/lib/qless/version.rb +1 -1
- data/lib/qless/wait_until.rb +19 -0
- data/lib/qless/worker.rb +243 -33
- metadata +49 -30
- data/bin/install_phantomjs +0 -7
- data/bin/qless-campfire +0 -106
- data/bin/qless-growl +0 -99
- data/lib/qless/lua.rb +0 -25
@@ -12,7 +12,6 @@ body {
|
|
12
12
|
position: relative;
|
13
13
|
padding-top: 90px;
|
14
14
|
background-color: #fff;
|
15
|
-
background-image: url(../img/grid-18px-masked.png);
|
16
15
|
background-repeat: repeat-x;
|
17
16
|
background-position: 0 40px;
|
18
17
|
}
|
@@ -817,3 +816,24 @@ form.well {
|
|
817
816
|
}
|
818
817
|
|
819
818
|
}
|
819
|
+
|
820
|
+
/* For proper failed job display*/
|
821
|
+
.l-sidebyside {
|
822
|
+
overflow:auto;
|
823
|
+
}
|
824
|
+
|
825
|
+
.l-sidebyside > * {
|
826
|
+
display:inline-block;
|
827
|
+
*display: inline;
|
828
|
+
*zoom: 1;
|
829
|
+
}
|
830
|
+
|
831
|
+
.failed-job > .min-col-size {
|
832
|
+
min-width: 395px;
|
833
|
+
}
|
834
|
+
|
835
|
+
.failed-job .row {
|
836
|
+
margin-left:0;
|
837
|
+
margin-right:10px;
|
838
|
+
height: 30px;
|
839
|
+
}
|
@@ -112,12 +112,12 @@
|
|
112
112
|
<div class="span6">
|
113
113
|
<h3><small>History</small></h3>
|
114
114
|
<div style="overflow-y:scroll; height: 200px">
|
115
|
-
<% job.
|
115
|
+
<% job.queue_history.reverse.each do |h| %>
|
116
116
|
<pre><strong><%= h['q'] %></strong>
|
117
|
-
Put: <%= strftime(
|
118
|
-
Pop: <%= strftime(
|
119
|
-
Completed: <%= strftime(
|
120
|
-
Failed: <%= strftime(
|
117
|
+
Put: <%= strftime(h['put']) %><% if not h['popped'].nil? %>
|
118
|
+
Pop: <%= strftime(h['popped']) %> by <%= h['worker'] %><% end %><% if not h['completed'].nil? %>
|
119
|
+
Completed: <%= strftime(h['completed']) %><% end %><% if not h['failed'].nil? %>
|
120
|
+
Failed: <%= strftime(h['failed']) %><% end %></pre>
|
121
121
|
<% end %>
|
122
122
|
</div>
|
123
123
|
</div>
|
@@ -18,7 +18,7 @@
|
|
18
18
|
<%= queue['waiting'] %> /
|
19
19
|
<%= queue['scheduled'] %> /
|
20
20
|
<%= queue['stalled'] %> /
|
21
|
-
<%= queue['depends'] %> /
|
21
|
+
<%= queue['depends'] %> /
|
22
22
|
<%= queue['recurring'] %> <small>(running / waiting / scheduled / stalled / depends / recurring)</small>
|
23
23
|
</h3>
|
24
24
|
</div>
|
@@ -34,16 +34,22 @@
|
|
34
34
|
<div class="page-header">
|
35
35
|
<h1>Failed Jobs <small>D'oh!</small></h1>
|
36
36
|
</div>
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
<
|
37
|
+
<div class="l-sidebyside failed-job">
|
38
|
+
<div class="min-col-size">
|
39
|
+
<% failed.sort_by { |t, count| -count }.each do |t, count| %>
|
40
|
+
<div class="row">
|
41
|
+
<h3><a href="<%= u "/failed/#{t}" %>"><%= t %></a></h3>
|
42
|
+
</div>
|
43
|
+
<% end %>
|
41
44
|
</div>
|
42
|
-
<div
|
43
|
-
|
45
|
+
<div>
|
46
|
+
<% failed.sort_by { |t, count| -count }.each do |t, count| %>
|
47
|
+
<div class="row">
|
48
|
+
<h3>| <%= count %> <small>Jobs</small></h3>
|
49
|
+
</div>
|
50
|
+
<% end %>
|
44
51
|
</div>
|
45
52
|
</div>
|
46
|
-
<% end %>
|
47
53
|
<% end %>
|
48
54
|
|
49
55
|
<% if tracked['jobs'].empty? %>
|
@@ -67,7 +73,6 @@
|
|
67
73
|
<% end %>
|
68
74
|
<% end %>
|
69
75
|
|
70
|
-
|
71
76
|
<% if workers.empty? %>
|
72
77
|
<div class="page-header">
|
73
78
|
<h1>No Workers <small>Nobody's doin' nothin'!</small></h1>
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'thread'
|
2
|
+
require 'qless/wait_until'
|
3
|
+
|
4
|
+
module Qless
|
5
|
+
class Subscriber
|
6
|
+
def self.start(*args, &block)
|
7
|
+
new(*args, &block).start_pub_sub_listener
|
8
|
+
end
|
9
|
+
|
10
|
+
attr_reader :client, :channel
|
11
|
+
|
12
|
+
def initialize(client, channel, &message_received_callback)
|
13
|
+
@client = client
|
14
|
+
@channel = channel
|
15
|
+
@message_received_callback = message_received_callback
|
16
|
+
|
17
|
+
# pub/sub blocks the connection so we must use a different redis connection
|
18
|
+
@client_redis = client.redis
|
19
|
+
@listener_redis = client.new_redis_connection
|
20
|
+
|
21
|
+
@my_channel = Qless.generate_jid
|
22
|
+
end
|
23
|
+
|
24
|
+
def start_pub_sub_listener
|
25
|
+
@thread = ::Thread.start do
|
26
|
+
@listener_redis.subscribe(channel, @my_channel) do |on|
|
27
|
+
on.message do |_channel, message|
|
28
|
+
if _channel == @my_channel
|
29
|
+
@listener_redis.unsubscribe(@my_channel)
|
30
|
+
else
|
31
|
+
@message_received_callback.call(self, JSON.parse(message))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
wait_until_thread_listening
|
38
|
+
end
|
39
|
+
|
40
|
+
def wait_until_thread_listening
|
41
|
+
Qless::WaitUntil.wait_until(10) do
|
42
|
+
@client_redis.publish(@my_channel, 'disconnect') == 1
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
|
data/lib/qless/version.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
module Qless
|
2
|
+
module WaitUntil
|
3
|
+
TimeoutError = Class.new(StandardError)
|
4
|
+
|
5
|
+
def wait_until(timeout)
|
6
|
+
timeout_at = Time.now + timeout
|
7
|
+
|
8
|
+
loop do
|
9
|
+
return if yield
|
10
|
+
sleep 0.002
|
11
|
+
if Time.now > timeout_at
|
12
|
+
raise TimeoutError, "Timed out after #{timeout} seconds"
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
module_function :wait_until
|
18
|
+
end
|
19
|
+
end
|
data/lib/qless/worker.rb
CHANGED
@@ -2,19 +2,25 @@ require 'qless'
|
|
2
2
|
require 'time'
|
3
3
|
require 'qless/job_reservers/ordered'
|
4
4
|
require 'qless/job_reservers/round_robin'
|
5
|
+
require 'qless/job_reservers/shuffled_round_robin'
|
6
|
+
require 'qless/subscriber'
|
7
|
+
require 'qless/wait_until'
|
5
8
|
|
6
9
|
module Qless
|
7
10
|
# This is heavily inspired by Resque's excellent worker:
|
8
11
|
# https://github.com/defunkt/resque/blob/v1.20.0/lib/resque/worker.rb
|
9
12
|
class Worker
|
10
|
-
def initialize(
|
11
|
-
|
13
|
+
def initialize(job_reserver, options = {})
|
14
|
+
self.job_reserver = job_reserver
|
12
15
|
@shutdown = @paused = false
|
13
16
|
|
14
17
|
self.very_verbose = options[:very_verbose]
|
15
18
|
self.verbose = options[:verbose]
|
16
19
|
self.run_as_single_process = options[:run_as_single_process]
|
17
20
|
self.output = options.fetch(:output, $stdout)
|
21
|
+
self.term_timeout = options.fetch(:term_timeout, 4.0)
|
22
|
+
@backtrace_replacements = { Dir.pwd => '.' }
|
23
|
+
@backtrace_replacements[ENV['GEM_HOME']] = '<GEM_HOME>' if ENV.has_key?('GEM_HOME')
|
18
24
|
|
19
25
|
output.puts "\n\n\n" if verbose || very_verbose
|
20
26
|
log "Instantiated Worker"
|
@@ -35,6 +41,13 @@ module Qless
|
|
35
41
|
# Defaults to $stdout.
|
36
42
|
attr_accessor :output
|
37
43
|
|
44
|
+
# The object responsible for reserving jobs from the Qless server,
|
45
|
+
# using some reasonable strategy (e.g. round robin or ordered)
|
46
|
+
attr_accessor :job_reserver
|
47
|
+
|
48
|
+
# How long the child process is given to exit before forcibly killing it.
|
49
|
+
attr_accessor :term_timeout
|
50
|
+
|
38
51
|
# Starts a worker based on ENV vars. Supported ENV vars:
|
39
52
|
# - REDIS_URL=redis://host:port/db-num (the redis gem uses this automatically)
|
40
53
|
# - QUEUES=high,medium,low or QUEUE=blah
|
@@ -59,18 +72,22 @@ module Qless
|
|
59
72
|
options[:very_verbose] = !!ENV['VVERBOSE']
|
60
73
|
options[:run_as_single_process] = !!ENV['RUN_AS_SINGLE_PROCESS']
|
61
74
|
|
62
|
-
new(
|
75
|
+
new(reserver, options).work(interval)
|
63
76
|
end
|
64
77
|
|
65
78
|
def work(interval = 5.0)
|
66
79
|
procline "Starting #{@job_reserver.description}"
|
67
|
-
|
80
|
+
register_parent_signal_handlers
|
81
|
+
uniq_clients.each { |client| start_parent_pub_sub_listener_for(client) }
|
68
82
|
|
69
83
|
loop do
|
70
84
|
break if shutdown?
|
71
|
-
|
85
|
+
if paused?
|
86
|
+
sleep interval
|
87
|
+
next
|
88
|
+
end
|
72
89
|
|
73
|
-
unless job =
|
90
|
+
unless job = reserve_job
|
74
91
|
break if interval.zero?
|
75
92
|
procline "Waiting for #{@job_reserver.description}"
|
76
93
|
log! "Sleeping for #{interval} seconds"
|
@@ -78,31 +95,48 @@ module Qless
|
|
78
95
|
next
|
79
96
|
end
|
80
97
|
|
81
|
-
|
82
|
-
|
83
|
-
if run_as_single_process
|
84
|
-
# We're staying in the same process
|
85
|
-
procline "Single processing #{job.description}"
|
86
|
-
perform(job)
|
87
|
-
elsif @child = fork
|
88
|
-
# We're in the parent process
|
89
|
-
procline "Forked #{@child} for #{job.description}"
|
90
|
-
Process.wait(@child)
|
91
|
-
else
|
92
|
-
# We're in the child process
|
93
|
-
procline "Processing #{job.description}"
|
94
|
-
perform(job)
|
95
|
-
exit!
|
96
|
-
end
|
98
|
+
perform_job_in_child_process(job)
|
97
99
|
end
|
100
|
+
ensure
|
101
|
+
# make sure the worker deregisters on shutdown
|
102
|
+
deregister
|
98
103
|
end
|
99
104
|
|
100
105
|
def perform(job)
|
101
106
|
around_perform(job)
|
102
107
|
rescue Exception => error
|
103
|
-
fail_job(job, error)
|
108
|
+
fail_job(job, error, caller)
|
104
109
|
else
|
105
|
-
job
|
110
|
+
try_complete(job)
|
111
|
+
end
|
112
|
+
|
113
|
+
def reserve_job
|
114
|
+
@job_reserver.reserve
|
115
|
+
rescue Exception => error
|
116
|
+
# We want workers to durably stay up, so we don't want errors
|
117
|
+
# during job reserving (e.g. network timeouts, etc) to kill
|
118
|
+
# the worker.
|
119
|
+
log "Got an error while reserving a job: #{error.class}: #{error.message}"
|
120
|
+
end
|
121
|
+
|
122
|
+
def perform_job_in_child_process(job)
|
123
|
+
with_job(job) do
|
124
|
+
@child = fork do
|
125
|
+
job.reconnect_to_redis
|
126
|
+
register_child_signal_handlers
|
127
|
+
start_child_pub_sub_listener_for(job.client)
|
128
|
+
procline "Processing #{job.description}"
|
129
|
+
perform(job)
|
130
|
+
exit! # don't run at_exit hooks
|
131
|
+
end
|
132
|
+
|
133
|
+
if @child
|
134
|
+
wait_for_child
|
135
|
+
else
|
136
|
+
procline "Single processing #{job.description}"
|
137
|
+
perform(job)
|
138
|
+
end
|
139
|
+
end
|
106
140
|
end
|
107
141
|
|
108
142
|
def shutdown
|
@@ -135,6 +169,33 @@ module Qless
|
|
135
169
|
|
136
170
|
private
|
137
171
|
|
172
|
+
def fork
|
173
|
+
super unless run_as_single_process
|
174
|
+
end
|
175
|
+
|
176
|
+
def deregister
|
177
|
+
uniq_clients.each do |client|
|
178
|
+
client.deregister_workers(Qless.worker_name)
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
def uniq_clients
|
183
|
+
@uniq_clients ||= @job_reserver.queues.map(&:client).uniq
|
184
|
+
end
|
185
|
+
|
186
|
+
def try_complete(job)
|
187
|
+
job.complete unless job.state_changed?
|
188
|
+
rescue Job::CantCompleteError => e
|
189
|
+
# There's not much we can do here. Complete fails in a few cases:
|
190
|
+
# - The job is already failed (i.e. by another worker)
|
191
|
+
# - The job is being worked on by another worker
|
192
|
+
# - The job has been cancelled
|
193
|
+
#
|
194
|
+
# We don't want to (and aren't able to) fail the job with this error in
|
195
|
+
# any of these cases, so the best we can do is log the failure.
|
196
|
+
log "Failed to complete #{job.inspect}: #{e.message}"
|
197
|
+
end
|
198
|
+
|
138
199
|
# Allow middleware modules to be mixed in and override the
|
139
200
|
# definition of around_perform while providing a default
|
140
201
|
# implementation so our code can assume the method is present.
|
@@ -144,47 +205,120 @@ module Qless
|
|
144
205
|
end
|
145
206
|
}
|
146
207
|
|
147
|
-
def fail_job(job, error)
|
148
|
-
group = "#{job.
|
149
|
-
message = "#{error
|
208
|
+
def fail_job(job, error, worker_backtrace)
|
209
|
+
group = "#{job.klass_name}:#{error.class}"
|
210
|
+
message = "#{truncated_message(error)}\n\n#{format_failure_backtrace(error.backtrace, worker_backtrace)}"
|
150
211
|
log "Got #{group} failure from #{job.inspect}"
|
151
212
|
job.fail(group, message)
|
152
213
|
end
|
153
214
|
|
215
|
+
# TODO: pull this out into a config option.
|
216
|
+
MAX_ERROR_MESSAGE_SIZE = 10_000
|
217
|
+
def truncated_message(error)
|
218
|
+
return error.message if error.message.length <= MAX_ERROR_MESSAGE_SIZE
|
219
|
+
error.message.slice(0, MAX_ERROR_MESSAGE_SIZE) + "... (truncated due to length)"
|
220
|
+
end
|
221
|
+
|
222
|
+
def format_failure_backtrace(error_backtrace, worker_backtrace)
|
223
|
+
(error_backtrace - worker_backtrace).map do |line|
|
224
|
+
@backtrace_replacements.inject(line) do |line, (original, new)|
|
225
|
+
line.sub(original, new)
|
226
|
+
end
|
227
|
+
end.join("\n")
|
228
|
+
end
|
229
|
+
|
154
230
|
def procline(value)
|
155
231
|
$0 = "Qless-#{Qless::VERSION}: #{value} at #{Time.now.iso8601}"
|
156
232
|
log! $0
|
157
233
|
end
|
158
234
|
|
235
|
+
def wait_for_child
|
236
|
+
srand # Reseeding
|
237
|
+
procline "Forked #{@child} at #{Time.now.to_i}"
|
238
|
+
begin
|
239
|
+
Process.waitpid(@child)
|
240
|
+
rescue SystemCallError
|
241
|
+
nil
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
# Kills the forked child immediately with minimal remorse. The job it
|
246
|
+
# is processing will not be completed. Send the child a TERM signal,
|
247
|
+
# wait up to term_timeout seconds, and then a KILL signal if it has not quit
|
159
248
|
def kill_child
|
160
249
|
return unless @child
|
161
|
-
|
162
|
-
Process.
|
250
|
+
|
251
|
+
if Process.waitpid(@child, Process::WNOHANG)
|
252
|
+
log "Child #{@child} already quit."
|
253
|
+
return
|
254
|
+
end
|
255
|
+
|
256
|
+
signal_child("TERM", @child)
|
257
|
+
|
258
|
+
signal_child("KILL", @child) unless quit_gracefully?(@child)
|
259
|
+
rescue SystemCallError
|
260
|
+
log "Child #{@child} already quit and reaped."
|
261
|
+
end
|
262
|
+
|
263
|
+
# send a signal to a child, have it logged.
|
264
|
+
def signal_child(signal, child)
|
265
|
+
log "Sending #{signal} signal to child #{child}"
|
266
|
+
Process.kill(signal, child)
|
267
|
+
end
|
268
|
+
|
269
|
+
# has our child quit gracefully within the timeout limit?
|
270
|
+
def quit_gracefully?(child)
|
271
|
+
(term_timeout.to_f * 10).round.times do |i|
|
272
|
+
sleep(0.1)
|
273
|
+
return true if Process.waitpid(child, Process::WNOHANG)
|
274
|
+
end
|
275
|
+
|
276
|
+
false
|
163
277
|
end
|
164
278
|
|
165
|
-
# This
|
279
|
+
# This was originally stolen directly from resque... (thanks, @defunkt!)
|
166
280
|
# Registers the various signal handlers a worker responds to.
|
167
281
|
#
|
168
282
|
# TERM: Shutdown immediately, stop processing jobs.
|
169
283
|
# INT: Shutdown immediately, stop processing jobs.
|
170
284
|
# QUIT: Shutdown after the current job has finished processing.
|
171
285
|
# USR1: Kill the forked child immediately, continue processing jobs.
|
172
|
-
# USR2: Don't process any new jobs
|
286
|
+
# USR2: Don't process any new jobs; dump the backtrace.
|
173
287
|
# CONT: Start processing jobs again after a USR2
|
174
|
-
def
|
288
|
+
def register_parent_signal_handlers
|
175
289
|
trap('TERM') { shutdown! }
|
176
290
|
trap('INT') { shutdown! }
|
177
291
|
|
178
292
|
begin
|
179
293
|
trap('QUIT') { shutdown }
|
180
294
|
trap('USR1') { kill_child }
|
181
|
-
trap('USR2')
|
295
|
+
trap('USR2') do
|
296
|
+
log "Current backtrace (parent): \n\n#{caller.join("\n")}\n\n"
|
297
|
+
pause_processing
|
298
|
+
end
|
299
|
+
|
182
300
|
trap('CONT') { unpause_processing }
|
183
301
|
rescue ArgumentError
|
184
302
|
warn "Signals QUIT, USR1, USR2, and/or CONT not supported."
|
185
303
|
end
|
186
304
|
end
|
187
305
|
|
306
|
+
def register_child_signal_handlers
|
307
|
+
trap('TERM') { raise SignalException.new("SIGTERM") }
|
308
|
+
trap('INT', 'DEFAULT')
|
309
|
+
|
310
|
+
begin
|
311
|
+
trap('QUIT', 'DEFAULT')
|
312
|
+
trap('USR1', 'DEFAULT')
|
313
|
+
trap('USR2', 'DEFAULT')
|
314
|
+
|
315
|
+
trap('USR2') do
|
316
|
+
log "Current backtrace (child): \n\n#{caller.join("\n")}\n\n"
|
317
|
+
end
|
318
|
+
rescue ArgumentError
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
188
322
|
# Log a message to STDOUT if we are verbose or very_verbose.
|
189
323
|
def log(message)
|
190
324
|
if verbose
|
@@ -199,6 +333,82 @@ module Qless
|
|
199
333
|
def log!(message)
|
200
334
|
log message if very_verbose
|
201
335
|
end
|
336
|
+
|
337
|
+
def start_parent_pub_sub_listener_for(client)
|
338
|
+
Subscriber.start(client, "ql:w:#{Qless.worker_name}") do |subscriber, message|
|
339
|
+
if message["event"] == "lock_lost" && message["jid"] == current_job_jid
|
340
|
+
fail_job_due_to_timeout
|
341
|
+
kill_child
|
342
|
+
end
|
343
|
+
end
|
344
|
+
end
|
345
|
+
|
346
|
+
def start_child_pub_sub_listener_for(client)
|
347
|
+
Subscriber.start(client, "ql:w:#{Qless.worker_name}:#{Process.pid}") do |subscriber, message|
|
348
|
+
if message["event"] == "notify_backtrace"
|
349
|
+
notify_parent_of_job_backtrace(client, message.fetch('notify_list'))
|
350
|
+
end
|
351
|
+
end
|
352
|
+
end
|
353
|
+
|
354
|
+
def with_job(job)
|
355
|
+
@job = job
|
356
|
+
yield
|
357
|
+
ensure
|
358
|
+
@job = nil
|
359
|
+
end
|
360
|
+
|
361
|
+
# To prevent race conditions (with our listener thread),
|
362
|
+
# we cannot use a pattern like `use(@job) if @job` because
|
363
|
+
# the value of `@job` could change between the checking of
|
364
|
+
# it and the use of it. Here we use a pattern that avoids
|
365
|
+
# the issue -- get the job into a local, and yield that if
|
366
|
+
# it is set.
|
367
|
+
def access_current_job
|
368
|
+
if job = @job
|
369
|
+
yield job
|
370
|
+
end
|
371
|
+
end
|
372
|
+
|
373
|
+
def current_job_jid
|
374
|
+
access_current_job &:jid
|
375
|
+
end
|
376
|
+
|
377
|
+
JobLockLost = Class.new(StandardError)
|
378
|
+
|
379
|
+
def fail_job_due_to_timeout
|
380
|
+
access_current_job do |job|
|
381
|
+
error = JobLockLost.new
|
382
|
+
error.set_backtrace(get_backtrace_from_child(job.client.redis))
|
383
|
+
fail_job(job, error, caller)
|
384
|
+
end
|
385
|
+
end
|
386
|
+
|
387
|
+
def notify_parent_of_job_backtrace(client, list)
|
388
|
+
job_backtrace = Thread.main.backtrace
|
389
|
+
client.redis.lpush list, JSON.dump(job_backtrace)
|
390
|
+
client.redis.pexpire list, BACKTRACE_EXPIRATION_TIMEOUT_MS
|
391
|
+
end
|
392
|
+
|
393
|
+
WAIT_FOR_CHILD_BACKTRACE_TIMEOUT = 4
|
394
|
+
BACKTRACE_EXPIRATION_TIMEOUT_MS = 60_000 # timeout after a minute
|
395
|
+
|
396
|
+
def get_backtrace_from_child(child_redis)
|
397
|
+
notification_list = "ql:child_backtraces:#{Qless.generate_jid}"
|
398
|
+
request_backtrace = { "event" => "notify_backtrace",
|
399
|
+
"notify_list" => notification_list }
|
400
|
+
|
401
|
+
if child_redis.publish("ql:w:#{Qless.worker_name}:#{@child}", JSON.dump(request_backtrace)).zero?
|
402
|
+
return ["Could not obtain child backtrace since it was not listening."]
|
403
|
+
end
|
404
|
+
|
405
|
+
begin
|
406
|
+
_, backtrace_json = child_redis.blpop(notification_list, WAIT_FOR_CHILD_BACKTRACE_TIMEOUT)
|
407
|
+
JSON.parse(backtrace_json)
|
408
|
+
rescue => e
|
409
|
+
["Could not obtain child backtrace: #{e.class}: #{e.message}"] + e.backtrace
|
410
|
+
end
|
411
|
+
end
|
202
412
|
end
|
203
413
|
end
|
204
414
|
|