qless 0.9.3 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +9 -3
- data/README.md +70 -25
- data/Rakefile +125 -9
- data/exe/install_phantomjs +21 -0
- data/lib/qless.rb +115 -76
- data/lib/qless/config.rb +11 -9
- data/lib/qless/failure_formatter.rb +43 -0
- data/lib/qless/job.rb +201 -102
- data/lib/qless/job_reservers/ordered.rb +7 -1
- data/lib/qless/job_reservers/round_robin.rb +16 -6
- data/lib/qless/job_reservers/shuffled_round_robin.rb +9 -2
- data/lib/qless/lua/qless-lib.lua +2463 -0
- data/lib/qless/lua/qless.lua +2012 -0
- data/lib/qless/lua_script.rb +63 -12
- data/lib/qless/middleware/memory_usage_monitor.rb +62 -0
- data/lib/qless/middleware/metriks.rb +45 -0
- data/lib/qless/middleware/redis_reconnect.rb +6 -3
- data/lib/qless/middleware/requeue_exceptions.rb +94 -0
- data/lib/qless/middleware/retry_exceptions.rb +38 -9
- data/lib/qless/middleware/sentry.rb +3 -7
- data/lib/qless/middleware/timeout.rb +64 -0
- data/lib/qless/queue.rb +90 -55
- data/lib/qless/server.rb +177 -130
- data/lib/qless/server/views/_job.erb +33 -15
- data/lib/qless/server/views/completed.erb +11 -0
- data/lib/qless/server/views/layout.erb +70 -11
- data/lib/qless/server/views/overview.erb +93 -53
- data/lib/qless/server/views/queue.erb +9 -8
- data/lib/qless/server/views/queues.erb +18 -1
- data/lib/qless/subscriber.rb +37 -22
- data/lib/qless/tasks.rb +5 -10
- data/lib/qless/test_helpers/worker_helpers.rb +55 -0
- data/lib/qless/version.rb +3 -1
- data/lib/qless/worker.rb +4 -413
- data/lib/qless/worker/base.rb +247 -0
- data/lib/qless/worker/forking.rb +245 -0
- data/lib/qless/worker/serial.rb +41 -0
- metadata +135 -52
- data/lib/qless/qless-core/cancel.lua +0 -101
- data/lib/qless/qless-core/complete.lua +0 -233
- data/lib/qless/qless-core/config.lua +0 -56
- data/lib/qless/qless-core/depends.lua +0 -65
- data/lib/qless/qless-core/deregister_workers.lua +0 -12
- data/lib/qless/qless-core/fail.lua +0 -117
- data/lib/qless/qless-core/failed.lua +0 -83
- data/lib/qless/qless-core/get.lua +0 -37
- data/lib/qless/qless-core/heartbeat.lua +0 -51
- data/lib/qless/qless-core/jobs.lua +0 -41
- data/lib/qless/qless-core/pause.lua +0 -18
- data/lib/qless/qless-core/peek.lua +0 -165
- data/lib/qless/qless-core/pop.lua +0 -314
- data/lib/qless/qless-core/priority.lua +0 -32
- data/lib/qless/qless-core/put.lua +0 -169
- data/lib/qless/qless-core/qless-lib.lua +0 -2354
- data/lib/qless/qless-core/qless.lua +0 -1862
- data/lib/qless/qless-core/queues.lua +0 -58
- data/lib/qless/qless-core/recur.lua +0 -190
- data/lib/qless/qless-core/retry.lua +0 -73
- data/lib/qless/qless-core/stats.lua +0 -92
- data/lib/qless/qless-core/tag.lua +0 -100
- data/lib/qless/qless-core/track.lua +0 -79
- data/lib/qless/qless-core/unfail.lua +0 -54
- data/lib/qless/qless-core/unpause.lua +0 -12
- data/lib/qless/qless-core/workers.lua +0 -69
- data/lib/qless/wait_until.rb +0 -19
@@ -40,12 +40,13 @@
|
|
40
40
|
|
41
41
|
<div class="subnav subnav-fixed">
|
42
42
|
<ul class="nav nav-pills">
|
43
|
-
<li class="<%= tab == 'stats' ? 'active' : '' %>"><a href="<%= u "/queues/#{queue['name']}/stats" %>">Stats</a></li>
|
44
|
-
<li class="<%= tab == 'running' ? 'active' : '' %>"><a href="<%= u "/queues/#{queue['name']}/running" %>">Running</a></li>
|
45
|
-
<li class="<%= tab == 'waiting' ? 'active' : '' %>"><a href="<%= u "/queues/#{queue['name']}/waiting" %>">Waiting</a></li>
|
46
|
-
<li class="<%= tab == 'scheduled' ? 'active' : '' %>"><a href="<%= u "/queues/#{queue['name']}/scheduled" %>">Scheduled</a></li>
|
47
|
-
<li class="<%= tab == 'stalled' ? 'active' : '' %>"><a href="<%= u "/queues/#{queue['name']}/stalled" %>">Stalled</a></li>
|
48
|
-
<li class="<%= tab == 'depends' ? 'active' : '' %>"><a href="<%= u "/queues/#{queue['name']}/depends" %>">Depends</a></li>
|
43
|
+
<li class="<%= tab == 'stats' ? 'active' : '' %>"><a href="<%= u "/queues/#{CGI::escape(queue['name'])}/stats" %>">Stats</a></li>
|
44
|
+
<li class="<%= tab == 'running' ? 'active' : '' %>"><a href="<%= u "/queues/#{CGI::escape(queue['name'])}/running" %>">Running</a></li>
|
45
|
+
<li class="<%= tab == 'waiting' ? 'active' : '' %>"><a href="<%= u "/queues/#{CGI::escape(queue['name'])}/waiting" %>">Waiting</a></li>
|
46
|
+
<li class="<%= tab == 'scheduled' ? 'active' : '' %>"><a href="<%= u "/queues/#{CGI::escape(queue['name'])}/scheduled" %>">Scheduled</a></li>
|
47
|
+
<li class="<%= tab == 'stalled' ? 'active' : '' %>"><a href="<%= u "/queues/#{CGI::escape(queue['name'])}/stalled" %>">Stalled</a></li>
|
48
|
+
<li class="<%= tab == 'depends' ? 'active' : '' %>"><a href="<%= u "/queues/#{CGI::escape(queue['name'])}/depends" %>">Depends</a></li>
|
49
|
+
<li class="<%= tab == 'recurring' ? 'active' : '' %>"><a href="<%= u "/queues/#{CGI::escape(queue['name'])}/recurring" %>">Recurring</a></li>
|
49
50
|
</ul>
|
50
51
|
</div>
|
51
52
|
|
@@ -53,7 +54,7 @@
|
|
53
54
|
|
54
55
|
<div class="row">
|
55
56
|
<div class="span8">
|
56
|
-
<h2><a href="<%= u "/queues/#{queue['name']}" %>"><%= queue['name'] %></a> |
|
57
|
+
<h2><a href="<%= u "/queues/#{CGI::escape(queue['name'])}" %>"><%= queue['name'] %></a> |
|
57
58
|
<%= queue['running'] %> /
|
58
59
|
<%= queue['waiting'] %> /
|
59
60
|
<%= queue['scheduled'] %> /
|
@@ -73,7 +74,7 @@
|
|
73
74
|
</div>
|
74
75
|
</div>
|
75
76
|
|
76
|
-
<% if ['running', 'waiting', 'scheduled', 'stalled', 'depends'].include?(tab) %>
|
77
|
+
<% if ['running', 'waiting', 'scheduled', 'stalled', 'depends', 'recurring'].include?(tab) %>
|
77
78
|
<hr/>
|
78
79
|
<%= erb :_job_list, :locals => { :jobs => jobs, :queues => queues } %>
|
79
80
|
<% else %>
|
@@ -10,7 +10,24 @@
|
|
10
10
|
<% queues.each do |queue| %>
|
11
11
|
<div class="row">
|
12
12
|
<div class="span4">
|
13
|
-
<h3
|
13
|
+
<h3>
|
14
|
+
<% if queue['paused'] %>
|
15
|
+
<button
|
16
|
+
id="<%= queue['name'] %>-pause"
|
17
|
+
title="Unpause"
|
18
|
+
class="btn btn-success"
|
19
|
+
onclick="unpause('<%= queue['name'] %>')"><i class="icon-play"></i>
|
20
|
+
</button>
|
21
|
+
<% else %>
|
22
|
+
<button
|
23
|
+
id="<%= queue['name'] %>-pause"
|
24
|
+
title="Pause"
|
25
|
+
class="btn btn-warning"
|
26
|
+
onclick="pause('<%= queue['name'] %>')"><i class="icon-pause"></i>
|
27
|
+
</button>
|
28
|
+
<% end %>
|
29
|
+
<a href="<%= u "/queues/#{CGI::escape(queue['name'])}" %>"><%= queue['name'] %></a>
|
30
|
+
</h3>
|
14
31
|
</div>
|
15
32
|
<div class="span8">
|
16
33
|
<h3> |
|
data/lib/qless/subscriber.rb
CHANGED
@@ -1,48 +1,63 @@
|
|
1
|
+
# Encoding: utf-8
|
2
|
+
|
1
3
|
require 'thread'
|
2
|
-
require 'qless/wait_until'
|
3
4
|
|
4
5
|
module Qless
|
6
|
+
# A class used for subscribing to messages in a thread
|
5
7
|
class Subscriber
|
6
8
|
def self.start(*args, &block)
|
7
|
-
new(*args, &block).
|
9
|
+
new(*args, &block).tap(&:start)
|
8
10
|
end
|
9
11
|
|
10
|
-
attr_reader :
|
12
|
+
attr_reader :channel, :redis
|
11
13
|
|
12
|
-
def initialize(client, channel, &message_received_callback)
|
13
|
-
@client = client
|
14
|
+
def initialize(client, channel, options = {}, &message_received_callback)
|
14
15
|
@channel = channel
|
15
16
|
@message_received_callback = message_received_callback
|
17
|
+
@log = options.fetch(:log) { ::Logger.new($stderr) }
|
16
18
|
|
17
|
-
# pub/sub blocks the connection so we must use a different redis
|
18
|
-
|
19
|
+
# pub/sub blocks the connection so we must use a different redis
|
20
|
+
# connection
|
21
|
+
@client_redis = client.redis
|
19
22
|
@listener_redis = client.new_redis_connection
|
20
23
|
|
21
24
|
@my_channel = Qless.generate_jid
|
22
25
|
end
|
23
26
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
27
|
+
# Start a thread listening
|
28
|
+
def start
|
29
|
+
queue = ::Queue.new
|
30
|
+
|
31
|
+
@thread = Thread.start do
|
32
|
+
@listener_redis.subscribe(@channel, @my_channel) do |on|
|
33
|
+
on.subscribe do |channel|
|
34
|
+
queue.push(:subscribed) if channel == @channel
|
35
|
+
end
|
36
|
+
|
37
|
+
on.message do |channel, message|
|
38
|
+
handle_message(channel, message)
|
33
39
|
end
|
34
40
|
end
|
35
41
|
end
|
36
42
|
|
37
|
-
|
43
|
+
queue.pop
|
38
44
|
end
|
39
45
|
|
40
|
-
def
|
41
|
-
|
42
|
-
|
46
|
+
def stop
|
47
|
+
@client_redis.publish(@my_channel, 'disconnect')
|
48
|
+
@thread.join
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def handle_message(channel, message)
|
54
|
+
if channel == @my_channel
|
55
|
+
@listener_redis.unsubscribe(@channel, @my_channel) if message == "disconnect"
|
56
|
+
else
|
57
|
+
@message_received_callback.call(self, JSON.parse(message))
|
43
58
|
end
|
59
|
+
rescue Exception => error
|
60
|
+
@log.error("Qless::Subscriber") { error }
|
44
61
|
end
|
45
62
|
end
|
46
63
|
end
|
47
|
-
|
48
|
-
|
data/lib/qless/tasks.rb
CHANGED
@@ -1,10 +1,5 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
require 'qless/worker'
|
7
|
-
Qless::Worker.start
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
1
|
+
# Encoding: utf-8
|
2
|
+
warn "Qless tasks are deprecated (they haven't worked for " \
|
3
|
+
"quite some time) and you should start a worker by " \
|
4
|
+
"writing a bit of ruby code that instantiates and " \
|
5
|
+
"runs a worker instead."
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Qless
|
2
|
+
module WorkerHelpers
|
3
|
+
# Yield with a worker running, and then clean the worker up afterwards
|
4
|
+
def run_worker_concurrently_with(worker, &block)
|
5
|
+
thread = Thread.start { stop_worker_after(worker, &block) }
|
6
|
+
thread.abort_on_exception = true
|
7
|
+
worker.run
|
8
|
+
ensure
|
9
|
+
thread.join(0.1)
|
10
|
+
end
|
11
|
+
|
12
|
+
def stop_worker_after(worker, &block)
|
13
|
+
yield
|
14
|
+
ensure
|
15
|
+
worker.stop!
|
16
|
+
end
|
17
|
+
|
18
|
+
# Run only the given number of jobs, then stop
|
19
|
+
def run_jobs(worker, count)
|
20
|
+
worker.extend Module.new {
|
21
|
+
define_method(:jobs) do
|
22
|
+
base_enum = super()
|
23
|
+
Enumerator.new do |enum|
|
24
|
+
count.times { enum << base_enum.next }
|
25
|
+
end
|
26
|
+
end
|
27
|
+
}
|
28
|
+
|
29
|
+
thread = Thread.start { yield } if block_given?
|
30
|
+
thread.abort_on_exception if thread
|
31
|
+
worker.run
|
32
|
+
ensure
|
33
|
+
thread.join(0.1) if thread
|
34
|
+
end
|
35
|
+
|
36
|
+
# Runs the worker until it has no more jobs to process,
|
37
|
+
# effectively draining its queues.
|
38
|
+
def drain_worker_queues(worker)
|
39
|
+
worker.extend Module.new {
|
40
|
+
# For the child: stop as soon as it can't pop more jobs.
|
41
|
+
def no_job_available
|
42
|
+
shutdown
|
43
|
+
end
|
44
|
+
|
45
|
+
# For the parent: when the child stops,
|
46
|
+
# don't try to restart it; shutdown instead.
|
47
|
+
def spawn_replacement_child(*)
|
48
|
+
shutdown
|
49
|
+
end
|
50
|
+
}
|
51
|
+
|
52
|
+
worker.run
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/qless/version.rb
CHANGED
data/lib/qless/worker.rb
CHANGED
@@ -1,414 +1,5 @@
|
|
1
|
-
|
2
|
-
require 'time'
|
3
|
-
require 'qless/job_reservers/ordered'
|
4
|
-
require 'qless/job_reservers/round_robin'
|
5
|
-
require 'qless/job_reservers/shuffled_round_robin'
|
6
|
-
require 'qless/subscriber'
|
7
|
-
require 'qless/wait_until'
|
8
|
-
|
9
|
-
module Qless
|
10
|
-
# This is heavily inspired by Resque's excellent worker:
|
11
|
-
# https://github.com/defunkt/resque/blob/v1.20.0/lib/resque/worker.rb
|
12
|
-
class Worker
|
13
|
-
def initialize(job_reserver, options = {})
|
14
|
-
self.job_reserver = job_reserver
|
15
|
-
@shutdown = @paused = false
|
16
|
-
|
17
|
-
self.very_verbose = options[:very_verbose]
|
18
|
-
self.verbose = options[:verbose]
|
19
|
-
self.run_as_single_process = options[:run_as_single_process]
|
20
|
-
self.output = options.fetch(:output, $stdout)
|
21
|
-
self.term_timeout = options.fetch(:term_timeout, 4.0)
|
22
|
-
@backtrace_replacements = { Dir.pwd => '.' }
|
23
|
-
@backtrace_replacements[ENV['GEM_HOME']] = '<GEM_HOME>' if ENV.has_key?('GEM_HOME')
|
24
|
-
|
25
|
-
output.puts "\n\n\n" if verbose || very_verbose
|
26
|
-
log "Instantiated Worker"
|
27
|
-
end
|
28
|
-
|
29
|
-
# Whether the worker should log basic info to STDOUT
|
30
|
-
attr_accessor :verbose
|
31
|
-
|
32
|
-
# Whether the worker should log lots of info to STDOUT
|
33
|
-
attr_accessor :very_verbose
|
34
|
-
|
35
|
-
# Whether the worker should run in a single process
|
36
|
-
# i.e. not fork a child process to do the work
|
37
|
-
# This should only be true in a dev/test environment
|
38
|
-
attr_accessor :run_as_single_process
|
39
|
-
|
40
|
-
# An IO-like object that logging output is sent to.
|
41
|
-
# Defaults to $stdout.
|
42
|
-
attr_accessor :output
|
43
|
-
|
44
|
-
# The object responsible for reserving jobs from the Qless server,
|
45
|
-
# using some reasonable strategy (e.g. round robin or ordered)
|
46
|
-
attr_accessor :job_reserver
|
47
|
-
|
48
|
-
# How long the child process is given to exit before forcibly killing it.
|
49
|
-
attr_accessor :term_timeout
|
50
|
-
|
51
|
-
# Starts a worker based on ENV vars. Supported ENV vars:
|
52
|
-
# - REDIS_URL=redis://host:port/db-num (the redis gem uses this automatically)
|
53
|
-
# - QUEUES=high,medium,low or QUEUE=blah
|
54
|
-
# - JOB_RESERVER=Ordered or JOB_RESERVER=RoundRobin
|
55
|
-
# - INTERVAL=3.2
|
56
|
-
# - VERBOSE=true (to enable logging)
|
57
|
-
# - VVERBOSE=true (to enable very verbose logging)
|
58
|
-
# - RUN_AS_SINGLE_PROCESS=true (false will fork children to do work, true will keep it single process)
|
59
|
-
# This is designed to be called from a rake task
|
60
|
-
def self.start
|
61
|
-
client = Qless::Client.new
|
62
|
-
queues = (ENV['QUEUES'] || ENV['QUEUE']).to_s.split(',').map { |q| client.queues[q.strip] }
|
63
|
-
if queues.none?
|
64
|
-
raise "No queues provided. You must pass QUEUE or QUEUES when starting a worker."
|
65
|
-
end
|
66
|
-
|
67
|
-
reserver = JobReservers.const_get(ENV.fetch('JOB_RESERVER', 'Ordered')).new(queues)
|
68
|
-
interval = Float(ENV.fetch('INTERVAL', 5.0))
|
69
|
-
|
70
|
-
options = {}
|
71
|
-
options[:verbose] = !!ENV['VERBOSE']
|
72
|
-
options[:very_verbose] = !!ENV['VVERBOSE']
|
73
|
-
options[:run_as_single_process] = !!ENV['RUN_AS_SINGLE_PROCESS']
|
74
|
-
|
75
|
-
new(reserver, options).work(interval)
|
76
|
-
end
|
77
|
-
|
78
|
-
def work(interval = 5.0)
|
79
|
-
procline "Starting #{@job_reserver.description}"
|
80
|
-
register_parent_signal_handlers
|
81
|
-
uniq_clients.each { |client| start_parent_pub_sub_listener_for(client) }
|
82
|
-
|
83
|
-
loop do
|
84
|
-
break if shutdown?
|
85
|
-
if paused?
|
86
|
-
sleep interval
|
87
|
-
next
|
88
|
-
end
|
89
|
-
|
90
|
-
unless job = reserve_job
|
91
|
-
break if interval.zero?
|
92
|
-
procline "Waiting for #{@job_reserver.description}"
|
93
|
-
log! "Sleeping for #{interval} seconds"
|
94
|
-
sleep interval
|
95
|
-
next
|
96
|
-
end
|
97
|
-
|
98
|
-
perform_job_in_child_process(job)
|
99
|
-
end
|
100
|
-
ensure
|
101
|
-
# make sure the worker deregisters on shutdown
|
102
|
-
deregister
|
103
|
-
end
|
104
|
-
|
105
|
-
def perform(job)
|
106
|
-
around_perform(job)
|
107
|
-
rescue Exception => error
|
108
|
-
fail_job(job, error, caller)
|
109
|
-
else
|
110
|
-
try_complete(job)
|
111
|
-
end
|
112
|
-
|
113
|
-
def reserve_job
|
114
|
-
@job_reserver.reserve
|
115
|
-
rescue Exception => error
|
116
|
-
# We want workers to durably stay up, so we don't want errors
|
117
|
-
# during job reserving (e.g. network timeouts, etc) to kill
|
118
|
-
# the worker.
|
119
|
-
log "Got an error while reserving a job: #{error.class}: #{error.message}"
|
120
|
-
end
|
121
|
-
|
122
|
-
def perform_job_in_child_process(job)
|
123
|
-
with_job(job) do
|
124
|
-
@child = fork do
|
125
|
-
job.reconnect_to_redis
|
126
|
-
register_child_signal_handlers
|
127
|
-
start_child_pub_sub_listener_for(job.client)
|
128
|
-
procline "Processing #{job.description}"
|
129
|
-
perform(job)
|
130
|
-
exit! # don't run at_exit hooks
|
131
|
-
end
|
132
|
-
|
133
|
-
if @child
|
134
|
-
wait_for_child
|
135
|
-
else
|
136
|
-
procline "Single processing #{job.description}"
|
137
|
-
perform(job)
|
138
|
-
end
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
def shutdown
|
143
|
-
@shutdown = true
|
144
|
-
end
|
145
|
-
|
146
|
-
def shutdown!
|
147
|
-
shutdown
|
148
|
-
kill_child unless run_as_single_process
|
149
|
-
end
|
150
|
-
|
151
|
-
def shutdown?
|
152
|
-
@shutdown
|
153
|
-
end
|
154
|
-
|
155
|
-
def paused?
|
156
|
-
@paused
|
157
|
-
end
|
158
|
-
|
159
|
-
def pause_processing
|
160
|
-
log "USR2 received; pausing job processing"
|
161
|
-
@paused = true
|
162
|
-
procline "Paused -- #{@job_reserver.description}"
|
163
|
-
end
|
164
|
-
|
165
|
-
def unpause_processing
|
166
|
-
log "CONT received; resuming job processing"
|
167
|
-
@paused = false
|
168
|
-
end
|
169
|
-
|
170
|
-
private
|
171
|
-
|
172
|
-
def fork
|
173
|
-
super unless run_as_single_process
|
174
|
-
end
|
175
|
-
|
176
|
-
def deregister
|
177
|
-
uniq_clients.each do |client|
|
178
|
-
client.deregister_workers(Qless.worker_name)
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
def uniq_clients
|
183
|
-
@uniq_clients ||= @job_reserver.queues.map(&:client).uniq
|
184
|
-
end
|
185
|
-
|
186
|
-
def try_complete(job)
|
187
|
-
job.complete unless job.state_changed?
|
188
|
-
rescue Job::CantCompleteError => e
|
189
|
-
# There's not much we can do here. Complete fails in a few cases:
|
190
|
-
# - The job is already failed (i.e. by another worker)
|
191
|
-
# - The job is being worked on by another worker
|
192
|
-
# - The job has been cancelled
|
193
|
-
#
|
194
|
-
# We don't want to (or are able to) fail the job with this error in
|
195
|
-
# any of these cases, so the best we can do is log the failure.
|
196
|
-
log "Failed to complete #{job.inspect}: #{e.message}"
|
197
|
-
end
|
198
|
-
|
199
|
-
# Allow middleware modules to be mixed in and override the
|
200
|
-
# definition of around_perform while providing a default
|
201
|
-
# implementation so our code can assume the method is present.
|
202
|
-
include Module.new {
|
203
|
-
def around_perform(job)
|
204
|
-
job.perform
|
205
|
-
end
|
206
|
-
}
|
207
|
-
|
208
|
-
def fail_job(job, error, worker_backtrace)
|
209
|
-
group = "#{job.klass_name}:#{error.class}"
|
210
|
-
message = "#{truncated_message(error)}\n\n#{format_failure_backtrace(error.backtrace, worker_backtrace)}"
|
211
|
-
log "Got #{group} failure from #{job.inspect}"
|
212
|
-
job.fail(group, message)
|
213
|
-
end
|
214
|
-
|
215
|
-
# TODO: pull this out into a config option.
|
216
|
-
MAX_ERROR_MESSAGE_SIZE = 10_000
|
217
|
-
def truncated_message(error)
|
218
|
-
return error.message if error.message.length <= MAX_ERROR_MESSAGE_SIZE
|
219
|
-
error.message.slice(0, MAX_ERROR_MESSAGE_SIZE) + "... (truncated due to length)"
|
220
|
-
end
|
221
|
-
|
222
|
-
def format_failure_backtrace(error_backtrace, worker_backtrace)
|
223
|
-
(error_backtrace - worker_backtrace).map do |line|
|
224
|
-
@backtrace_replacements.inject(line) do |line, (original, new)|
|
225
|
-
line.sub(original, new)
|
226
|
-
end
|
227
|
-
end.join("\n")
|
228
|
-
end
|
229
|
-
|
230
|
-
def procline(value)
|
231
|
-
$0 = "Qless-#{Qless::VERSION}: #{value} at #{Time.now.iso8601}"
|
232
|
-
log! $0
|
233
|
-
end
|
234
|
-
|
235
|
-
def wait_for_child
|
236
|
-
srand # Reseeding
|
237
|
-
procline "Forked #{@child} at #{Time.now.to_i}"
|
238
|
-
begin
|
239
|
-
Process.waitpid(@child)
|
240
|
-
rescue SystemCallError
|
241
|
-
nil
|
242
|
-
end
|
243
|
-
end
|
244
|
-
|
245
|
-
# Kills the forked child immediately with minimal remorse. The job it
|
246
|
-
# is processing will not be completed. Send the child a TERM signal,
|
247
|
-
# wait 5 seconds, and then a KILL signal if it has not quit
|
248
|
-
def kill_child
|
249
|
-
return unless @child
|
250
|
-
|
251
|
-
if Process.waitpid(@child, Process::WNOHANG)
|
252
|
-
log "Child #{@child} already quit."
|
253
|
-
return
|
254
|
-
end
|
255
|
-
|
256
|
-
signal_child("TERM", @child)
|
257
|
-
|
258
|
-
signal_child("KILL", @child) unless quit_gracefully?(@child)
|
259
|
-
rescue SystemCallError
|
260
|
-
log "Child #{@child} already quit and reaped."
|
261
|
-
end
|
262
|
-
|
263
|
-
# send a signal to a child, have it logged.
|
264
|
-
def signal_child(signal, child)
|
265
|
-
log "Sending #{signal} signal to child #{child}"
|
266
|
-
Process.kill(signal, child)
|
267
|
-
end
|
268
|
-
|
269
|
-
# has our child quit gracefully within the timeout limit?
|
270
|
-
def quit_gracefully?(child)
|
271
|
-
(term_timeout.to_f * 10).round.times do |i|
|
272
|
-
sleep(0.1)
|
273
|
-
return true if Process.waitpid(child, Process::WNOHANG)
|
274
|
-
end
|
275
|
-
|
276
|
-
false
|
277
|
-
end
|
278
|
-
|
279
|
-
# This was originally stolen directly from resque... (thanks, @defunkt!)
|
280
|
-
# Registers the various signal handlers a worker responds to.
|
281
|
-
#
|
282
|
-
# TERM: Shutdown immediately, stop processing jobs.
|
283
|
-
# INT: Shutdown immediately, stop processing jobs.
|
284
|
-
# QUIT: Shutdown after the current job has finished processing.
|
285
|
-
# USR1: Kill the forked child immediately, continue processing jobs.
|
286
|
-
# USR2: Don't process any new jobs; dump the backtrace.
|
287
|
-
# CONT: Start processing jobs again after a USR2
|
288
|
-
def register_parent_signal_handlers
|
289
|
-
trap('TERM') { shutdown! }
|
290
|
-
trap('INT') { shutdown! }
|
291
|
-
|
292
|
-
begin
|
293
|
-
trap('QUIT') { shutdown }
|
294
|
-
trap('USR1') { kill_child }
|
295
|
-
trap('USR2') do
|
296
|
-
log "Current backtrace (parent): \n\n#{caller.join("\n")}\n\n"
|
297
|
-
pause_processing
|
298
|
-
end
|
299
|
-
|
300
|
-
trap('CONT') { unpause_processing }
|
301
|
-
rescue ArgumentError
|
302
|
-
warn "Signals QUIT, USR1, USR2, and/or CONT not supported."
|
303
|
-
end
|
304
|
-
end
|
305
|
-
|
306
|
-
def register_child_signal_handlers
|
307
|
-
trap('TERM') { raise SignalException.new("SIGTERM") }
|
308
|
-
trap('INT', 'DEFAULT')
|
309
|
-
|
310
|
-
begin
|
311
|
-
trap('QUIT', 'DEFAULT')
|
312
|
-
trap('USR1', 'DEFAULT')
|
313
|
-
trap('USR2', 'DEFAULT')
|
314
|
-
|
315
|
-
trap('USR2') do
|
316
|
-
log "Current backtrace (child): \n\n#{caller.join("\n")}\n\n"
|
317
|
-
end
|
318
|
-
rescue ArgumentError
|
319
|
-
end
|
320
|
-
end
|
321
|
-
|
322
|
-
# Log a message to STDOUT if we are verbose or very_verbose.
|
323
|
-
def log(message)
|
324
|
-
if verbose
|
325
|
-
output.puts "*** #{message}"
|
326
|
-
elsif very_verbose
|
327
|
-
time = Time.now.strftime('%H:%M:%S %Y-%m-%d')
|
328
|
-
output.puts "** [#{time}] #$$: #{message}"
|
329
|
-
end
|
330
|
-
end
|
331
|
-
|
332
|
-
# Logs a very verbose message to STDOUT.
|
333
|
-
def log!(message)
|
334
|
-
log message if very_verbose
|
335
|
-
end
|
336
|
-
|
337
|
-
def start_parent_pub_sub_listener_for(client)
|
338
|
-
Subscriber.start(client, "ql:w:#{Qless.worker_name}") do |subscriber, message|
|
339
|
-
if message["event"] == "lock_lost" && message["jid"] == current_job_jid
|
340
|
-
fail_job_due_to_timeout
|
341
|
-
kill_child
|
342
|
-
end
|
343
|
-
end
|
344
|
-
end
|
345
|
-
|
346
|
-
def start_child_pub_sub_listener_for(client)
|
347
|
-
Subscriber.start(client, "ql:w:#{Qless.worker_name}:#{Process.pid}") do |subscriber, message|
|
348
|
-
if message["event"] == "notify_backtrace"
|
349
|
-
notify_parent_of_job_backtrace(client, message.fetch('notify_list'))
|
350
|
-
end
|
351
|
-
end
|
352
|
-
end
|
353
|
-
|
354
|
-
def with_job(job)
|
355
|
-
@job = job
|
356
|
-
yield
|
357
|
-
ensure
|
358
|
-
@job = nil
|
359
|
-
end
|
360
|
-
|
361
|
-
# To prevent race conditions (with our listener thread),
|
362
|
-
# we cannot use a pattern like `use(@job) if @job` because
|
363
|
-
# the value of `@job` could change between the checking of
|
364
|
-
# it and the use of it. Here we use a pattern that avoids
|
365
|
-
# the issue -- get the job into a local, and yield that if
|
366
|
-
# it is set.
|
367
|
-
def access_current_job
|
368
|
-
if job = @job
|
369
|
-
yield job
|
370
|
-
end
|
371
|
-
end
|
372
|
-
|
373
|
-
def current_job_jid
|
374
|
-
access_current_job &:jid
|
375
|
-
end
|
376
|
-
|
377
|
-
JobLockLost = Class.new(StandardError)
|
378
|
-
|
379
|
-
def fail_job_due_to_timeout
|
380
|
-
access_current_job do |job|
|
381
|
-
error = JobLockLost.new
|
382
|
-
error.set_backtrace(get_backtrace_from_child(job.client.redis))
|
383
|
-
fail_job(job, error, caller)
|
384
|
-
end
|
385
|
-
end
|
386
|
-
|
387
|
-
def notify_parent_of_job_backtrace(client, list)
|
388
|
-
job_backtrace = Thread.main.backtrace
|
389
|
-
client.redis.lpush list, JSON.dump(job_backtrace)
|
390
|
-
client.redis.pexpire list, BACKTRACE_EXPIRATION_TIMEOUT_MS
|
391
|
-
end
|
392
|
-
|
393
|
-
WAIT_FOR_CHILD_BACKTRACE_TIMEOUT = 4
|
394
|
-
BACKTRACE_EXPIRATION_TIMEOUT_MS = 60_000 # timeout after a minute
|
395
|
-
|
396
|
-
def get_backtrace_from_child(child_redis)
|
397
|
-
notification_list = "ql:child_backtraces:#{Qless.generate_jid}"
|
398
|
-
request_backtrace = { "event" => "notify_backtrace",
|
399
|
-
"notify_list" => notification_list }
|
400
|
-
|
401
|
-
if child_redis.publish("ql:w:#{Qless.worker_name}:#{@child}", JSON.dump(request_backtrace)).zero?
|
402
|
-
return ["Could not obtain child backtrace since it was not listening."]
|
403
|
-
end
|
404
|
-
|
405
|
-
begin
|
406
|
-
_, backtrace_json = child_redis.blpop(notification_list, WAIT_FOR_CHILD_BACKTRACE_TIMEOUT)
|
407
|
-
JSON.parse(backtrace_json)
|
408
|
-
rescue => e
|
409
|
-
["Could not obtain child backtrace: #{e.class}: #{e.message}"] + e.backtrace
|
410
|
-
end
|
411
|
-
end
|
412
|
-
end
|
413
|
-
end
|
1
|
+
# Encoding: utf-8
|
414
2
|
|
3
|
+
require 'qless/worker/base'
|
4
|
+
require 'qless/worker/serial'
|
5
|
+
require 'qless/worker/forking'
|