qless 0.9.2 → 0.9.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/Gemfile +2 -0
  2. data/README.md +42 -3
  3. data/Rakefile +26 -2
  4. data/{bin → exe}/qless-web +3 -2
  5. data/lib/qless.rb +55 -28
  6. data/lib/qless/config.rb +1 -3
  7. data/lib/qless/job.rb +127 -22
  8. data/lib/qless/job_reservers/round_robin.rb +3 -1
  9. data/lib/qless/job_reservers/shuffled_round_robin.rb +14 -0
  10. data/lib/qless/lua_script.rb +42 -0
  11. data/lib/qless/middleware/redis_reconnect.rb +24 -0
  12. data/lib/qless/middleware/retry_exceptions.rb +43 -0
  13. data/lib/qless/middleware/sentry.rb +70 -0
  14. data/lib/qless/qless-core/cancel.lua +89 -59
  15. data/lib/qless/qless-core/complete.lua +16 -1
  16. data/lib/qless/qless-core/config.lua +12 -0
  17. data/lib/qless/qless-core/deregister_workers.lua +12 -0
  18. data/lib/qless/qless-core/fail.lua +24 -14
  19. data/lib/qless/qless-core/heartbeat.lua +2 -1
  20. data/lib/qless/qless-core/pause.lua +18 -0
  21. data/lib/qless/qless-core/pop.lua +24 -3
  22. data/lib/qless/qless-core/put.lua +14 -1
  23. data/lib/qless/qless-core/qless-lib.lua +2354 -0
  24. data/lib/qless/qless-core/qless.lua +1862 -0
  25. data/lib/qless/qless-core/retry.lua +1 -1
  26. data/lib/qless/qless-core/unfail.lua +54 -0
  27. data/lib/qless/qless-core/unpause.lua +12 -0
  28. data/lib/qless/queue.rb +45 -21
  29. data/lib/qless/server.rb +38 -39
  30. data/lib/qless/server/static/css/docs.css +21 -1
  31. data/lib/qless/server/views/_job.erb +5 -5
  32. data/lib/qless/server/views/overview.erb +14 -9
  33. data/lib/qless/subscriber.rb +48 -0
  34. data/lib/qless/version.rb +1 -1
  35. data/lib/qless/wait_until.rb +19 -0
  36. data/lib/qless/worker.rb +243 -33
  37. metadata +49 -30
  38. data/bin/install_phantomjs +0 -7
  39. data/bin/qless-campfire +0 -106
  40. data/bin/qless-growl +0 -99
  41. data/lib/qless/lua.rb +0 -25
@@ -12,7 +12,6 @@ body {
12
12
  position: relative;
13
13
  padding-top: 90px;
14
14
  background-color: #fff;
15
- background-image: url(../img/grid-18px-masked.png);
16
15
  background-repeat: repeat-x;
17
16
  background-position: 0 40px;
18
17
  }
@@ -817,3 +816,24 @@ form.well {
817
816
  }
818
817
 
819
818
  }
819
+
820
+ /* For proper failed job display*/
821
+ .l-sidebyside {
822
+ overflow:auto;
823
+ }
824
+
825
+ .l-sidebyside > * {
826
+ display:inline-block;
827
+ *display: inline;
828
+ *zoom: 1;
829
+ }
830
+
831
+ .failed-job > .min-col-size {
832
+ min-width: 395px;
833
+ }
834
+
835
+ .failed-job .row {
836
+ margin-left:0;
837
+ margin-right:10px;
838
+ height: 30px;
839
+ }
@@ -112,12 +112,12 @@
112
112
  <div class="span6">
113
113
  <h3><small>History</small></h3>
114
114
  <div style="overflow-y:scroll; height: 200px">
115
- <% job.history.reverse.each do |h| %>
115
+ <% job.queue_history.reverse.each do |h| %>
116
116
  <pre><strong><%= h['q'] %></strong>
117
- Put: <%= strftime(Time.at(h['put'])) %><% if not h['popped'].nil? %>
118
- Pop: <%= strftime(Time.at(h['popped'])) %> by <%= h['worker'] %><% end %><% if not h['completed'].nil? %>
119
- Completed: <%= strftime(Time.at(h['completed'])) %><% end %><% if not h['failed'].nil? %>
120
- Failed: <%= strftime(Time.at(h['failed'])) %><% end %></pre>
117
+ Put: <%= strftime(h['put']) %><% if not h['popped'].nil? %>
118
+ Pop: <%= strftime(h['popped']) %> by <%= h['worker'] %><% end %><% if not h['completed'].nil? %>
119
+ Completed: <%= strftime(h['completed']) %><% end %><% if not h['failed'].nil? %>
120
+ Failed: <%= strftime(h['failed']) %><% end %></pre>
121
121
  <% end %>
122
122
  </div>
123
123
  </div>
@@ -18,7 +18,7 @@
18
18
  <%= queue['waiting'] %> /
19
19
  <%= queue['scheduled'] %> /
20
20
  <%= queue['stalled'] %> /
21
- <%= queue['depends'] %> /
21
+ <%= queue['depends'] %> /
22
22
  <%= queue['recurring'] %> <small>(running / waiting / scheduled / stalled / depends / recurring)</small>
23
23
  </h3>
24
24
  </div>
@@ -34,16 +34,22 @@
34
34
  <div class="page-header">
35
35
  <h1>Failed Jobs <small>D'oh!</small></h1>
36
36
  </div>
37
- <% failed.sort_by { |t, count| -count }.each do |t, count| %>
38
- <div class="row">
39
- <div class="span4">
40
- <h3 style="text-overflow: ellipsis; white-space: nowrap; overflow: hidden"><a href="<%= u "/failed/#{t}" %>"><%= t %></a></h3>
37
+ <div class="l-sidebyside failed-job">
38
+ <div class="min-col-size">
39
+ <% failed.sort_by { |t, count| -count }.each do |t, count| %>
40
+ <div class="row">
41
+ <h3><a href="<%= u "/failed/#{t}" %>"><%= t %></a></h3>
42
+ </div>
43
+ <% end %>
41
44
  </div>
42
- <div class="span8">
43
- <h3>| <%= count %> <small>Jobs</small></h3>
45
+ <div>
46
+ <% failed.sort_by { |t, count| -count }.each do |t, count| %>
47
+ <div class="row">
48
+ <h3>| <%= count %> <small>Jobs</small></h3>
49
+ </div>
50
+ <% end %>
44
51
  </div>
45
52
  </div>
46
- <% end %>
47
53
  <% end %>
48
54
 
49
55
  <% if tracked['jobs'].empty? %>
@@ -67,7 +73,6 @@
67
73
  <% end %>
68
74
  <% end %>
69
75
 
70
-
71
76
  <% if workers.empty? %>
72
77
  <div class="page-header">
73
78
  <h1>No Workers <small>Nobody's doin' nothin'!</small></h1>
@@ -0,0 +1,48 @@
1
+ require 'thread'
2
+ require 'qless/wait_until'
3
+
4
+ module Qless
5
+ class Subscriber
6
+ def self.start(*args, &block)
7
+ new(*args, &block).start_pub_sub_listener
8
+ end
9
+
10
+ attr_reader :client, :channel
11
+
12
+ def initialize(client, channel, &message_received_callback)
13
+ @client = client
14
+ @channel = channel
15
+ @message_received_callback = message_received_callback
16
+
17
+ # pub/sub blocks the connection so we must use a different redis connection
18
+ @client_redis = client.redis
19
+ @listener_redis = client.new_redis_connection
20
+
21
+ @my_channel = Qless.generate_jid
22
+ end
23
+
24
+ def start_pub_sub_listener
25
+ @thread = ::Thread.start do
26
+ @listener_redis.subscribe(channel, @my_channel) do |on|
27
+ on.message do |_channel, message|
28
+ if _channel == @my_channel
29
+ @listener_redis.unsubscribe(@my_channel)
30
+ else
31
+ @message_received_callback.call(self, JSON.parse(message))
32
+ end
33
+ end
34
+ end
35
+ end
36
+
37
+ wait_until_thread_listening
38
+ end
39
+
40
+ def wait_until_thread_listening
41
+ Qless::WaitUntil.wait_until(10) do
42
+ @client_redis.publish(@my_channel, 'disconnect') == 1
43
+ end
44
+ end
45
+ end
46
+ end
47
+
48
+
@@ -1,3 +1,3 @@
1
1
  module Qless
2
- VERSION = "0.9.2"
2
+ VERSION = "0.9.3"
3
3
  end
@@ -0,0 +1,19 @@
1
+ module Qless
2
+ module WaitUntil
3
+ TimeoutError = Class.new(StandardError)
4
+
5
+ def wait_until(timeout)
6
+ timeout_at = Time.now + timeout
7
+
8
+ loop do
9
+ return if yield
10
+ sleep 0.002
11
+ if Time.now > timeout_at
12
+ raise TimeoutError, "Timed out after #{timeout} seconds"
13
+ end
14
+ end
15
+ end
16
+
17
+ module_function :wait_until
18
+ end
19
+ end
@@ -2,19 +2,25 @@ require 'qless'
2
2
  require 'time'
3
3
  require 'qless/job_reservers/ordered'
4
4
  require 'qless/job_reservers/round_robin'
5
+ require 'qless/job_reservers/shuffled_round_robin'
6
+ require 'qless/subscriber'
7
+ require 'qless/wait_until'
5
8
 
6
9
  module Qless
7
10
  # This is heavily inspired by Resque's excellent worker:
8
11
  # https://github.com/defunkt/resque/blob/v1.20.0/lib/resque/worker.rb
9
12
  class Worker
10
- def initialize(client, job_reserver, options = {})
11
- @client, @job_reserver = client, job_reserver
13
+ def initialize(job_reserver, options = {})
14
+ self.job_reserver = job_reserver
12
15
  @shutdown = @paused = false
13
16
 
14
17
  self.very_verbose = options[:very_verbose]
15
18
  self.verbose = options[:verbose]
16
19
  self.run_as_single_process = options[:run_as_single_process]
17
20
  self.output = options.fetch(:output, $stdout)
21
+ self.term_timeout = options.fetch(:term_timeout, 4.0)
22
+ @backtrace_replacements = { Dir.pwd => '.' }
23
+ @backtrace_replacements[ENV['GEM_HOME']] = '<GEM_HOME>' if ENV.has_key?('GEM_HOME')
18
24
 
19
25
  output.puts "\n\n\n" if verbose || very_verbose
20
26
  log "Instantiated Worker"
@@ -35,6 +41,13 @@ module Qless
35
41
  # Defaults to $stdout.
36
42
  attr_accessor :output
37
43
 
44
+ # The object responsible for reserving jobs from the Qless server,
45
+ # using some reasonable strategy (e.g. round robin or ordered)
46
+ attr_accessor :job_reserver
47
+
48
+ # How long the child process is given to exit before forcibly killing it.
49
+ attr_accessor :term_timeout
50
+
38
51
  # Starts a worker based on ENV vars. Supported ENV vars:
39
52
  # - REDIS_URL=redis://host:port/db-num (the redis gem uses this automatically)
40
53
  # - QUEUES=high,medium,low or QUEUE=blah
@@ -59,18 +72,22 @@ module Qless
59
72
  options[:very_verbose] = !!ENV['VVERBOSE']
60
73
  options[:run_as_single_process] = !!ENV['RUN_AS_SINGLE_PROCESS']
61
74
 
62
- new(client, reserver, options).work(interval)
75
+ new(reserver, options).work(interval)
63
76
  end
64
77
 
65
78
  def work(interval = 5.0)
66
79
  procline "Starting #{@job_reserver.description}"
67
- register_signal_handlers
80
+ register_parent_signal_handlers
81
+ uniq_clients.each { |client| start_parent_pub_sub_listener_for(client) }
68
82
 
69
83
  loop do
70
84
  break if shutdown?
71
- next if paused?
85
+ if paused?
86
+ sleep interval
87
+ next
88
+ end
72
89
 
73
- unless job = @job_reserver.reserve
90
+ unless job = reserve_job
74
91
  break if interval.zero?
75
92
  procline "Waiting for #{@job_reserver.description}"
76
93
  log! "Sleeping for #{interval} seconds"
@@ -78,31 +95,48 @@ module Qless
78
95
  next
79
96
  end
80
97
 
81
- log "got: #{job.inspect}"
82
-
83
- if run_as_single_process
84
- # We're staying in the same process
85
- procline "Single processing #{job.description}"
86
- perform(job)
87
- elsif @child = fork
88
- # We're in the parent process
89
- procline "Forked #{@child} for #{job.description}"
90
- Process.wait(@child)
91
- else
92
- # We're in the child process
93
- procline "Processing #{job.description}"
94
- perform(job)
95
- exit!
96
- end
98
+ perform_job_in_child_process(job)
97
99
  end
100
+ ensure
101
+ # make sure the worker deregisters on shutdown
102
+ deregister
98
103
  end
99
104
 
100
105
  def perform(job)
101
106
  around_perform(job)
102
107
  rescue Exception => error
103
- fail_job(job, error)
108
+ fail_job(job, error, caller)
104
109
  else
105
- job.complete unless job.state_changed?
110
+ try_complete(job)
111
+ end
112
+
113
+ def reserve_job
114
+ @job_reserver.reserve
115
+ rescue Exception => error
116
+ # We want workers to durably stay up, so we don't want errors
117
+ # during job reserving (e.g. network timeouts, etc) to kill
118
+ # the worker.
119
+ log "Got an error while reserving a job: #{error.class}: #{error.message}"
120
+ end
121
+
122
+ def perform_job_in_child_process(job)
123
+ with_job(job) do
124
+ @child = fork do
125
+ job.reconnect_to_redis
126
+ register_child_signal_handlers
127
+ start_child_pub_sub_listener_for(job.client)
128
+ procline "Processing #{job.description}"
129
+ perform(job)
130
+ exit! # don't run at_exit hooks
131
+ end
132
+
133
+ if @child
134
+ wait_for_child
135
+ else
136
+ procline "Single processing #{job.description}"
137
+ perform(job)
138
+ end
139
+ end
106
140
  end
107
141
 
108
142
  def shutdown
@@ -135,6 +169,33 @@ module Qless
135
169
 
136
170
  private
137
171
 
172
+ def fork
173
+ super unless run_as_single_process
174
+ end
175
+
176
+ def deregister
177
+ uniq_clients.each do |client|
178
+ client.deregister_workers(Qless.worker_name)
179
+ end
180
+ end
181
+
182
+ def uniq_clients
183
+ @uniq_clients ||= @job_reserver.queues.map(&:client).uniq
184
+ end
185
+
186
+ def try_complete(job)
187
+ job.complete unless job.state_changed?
188
+ rescue Job::CantCompleteError => e
189
+ # There's not much we can do here. Complete fails in a few cases:
190
+ # - The job is already failed (i.e. by another worker)
191
+ # - The job is being worked on by another worker
192
+ # - The job has been cancelled
193
+ #
194
+ # We don't want to (or are able to) fail the job with this error in
195
+ # any of these cases, so the best we can do is log the failure.
196
+ log "Failed to complete #{job.inspect}: #{e.message}"
197
+ end
198
+
138
199
  # Allow middleware modules to be mixed in and override the
139
200
  # definition of around_perform while providing a default
140
201
  # implementation so our code can assume the method is present.
@@ -144,47 +205,120 @@ module Qless
144
205
  end
145
206
  }
146
207
 
147
- def fail_job(job, error)
148
- group = "#{job.klass}:#{error.class}"
149
- message = "#{error.message}\n\n#{error.backtrace.join("\n")}"
208
+ def fail_job(job, error, worker_backtrace)
209
+ group = "#{job.klass_name}:#{error.class}"
210
+ message = "#{truncated_message(error)}\n\n#{format_failure_backtrace(error.backtrace, worker_backtrace)}"
150
211
  log "Got #{group} failure from #{job.inspect}"
151
212
  job.fail(group, message)
152
213
  end
153
214
 
215
+ # TODO: pull this out into a config option.
216
+ MAX_ERROR_MESSAGE_SIZE = 10_000
217
+ def truncated_message(error)
218
+ return error.message if error.message.length <= MAX_ERROR_MESSAGE_SIZE
219
+ error.message.slice(0, MAX_ERROR_MESSAGE_SIZE) + "... (truncated due to length)"
220
+ end
221
+
222
+ def format_failure_backtrace(error_backtrace, worker_backtrace)
223
+ (error_backtrace - worker_backtrace).map do |line|
224
+ @backtrace_replacements.inject(line) do |line, (original, new)|
225
+ line.sub(original, new)
226
+ end
227
+ end.join("\n")
228
+ end
229
+
154
230
  def procline(value)
155
231
  $0 = "Qless-#{Qless::VERSION}: #{value} at #{Time.now.iso8601}"
156
232
  log! $0
157
233
  end
158
234
 
235
+ def wait_for_child
236
+ srand # Reseeding
237
+ procline "Forked #{@child} at #{Time.now.to_i}"
238
+ begin
239
+ Process.waitpid(@child)
240
+ rescue SystemCallError
241
+ nil
242
+ end
243
+ end
244
+
245
+ # Kills the forked child immediately with minimal remorse. The job it
246
+ # is processing will not be completed. Send the child a TERM signal,
247
+ # wait up to term_timeout seconds (4.0 by default), and then a KILL signal if it has not quit
159
248
  def kill_child
160
249
  return unless @child
161
- return unless system("ps -o pid,state -p #{@child}")
162
- Process.kill("KILL", @child) rescue nil
250
+
251
+ if Process.waitpid(@child, Process::WNOHANG)
252
+ log "Child #{@child} already quit."
253
+ return
254
+ end
255
+
256
+ signal_child("TERM", @child)
257
+
258
+ signal_child("KILL", @child) unless quit_gracefully?(@child)
259
+ rescue SystemCallError
260
+ log "Child #{@child} already quit and reaped."
261
+ end
262
+
263
+ # send a signal to a child, have it logged.
264
+ def signal_child(signal, child)
265
+ log "Sending #{signal} signal to child #{child}"
266
+ Process.kill(signal, child)
267
+ end
268
+
269
+ # has our child quit gracefully within the timeout limit?
270
+ def quit_gracefully?(child)
271
+ (term_timeout.to_f * 10).round.times do |i|
272
+ sleep(0.1)
273
+ return true if Process.waitpid(child, Process::WNOHANG)
274
+ end
275
+
276
+ false
163
277
  end
164
278
 
165
- # This is stolen directly from resque... (thanks, @defunkt!)
279
+ # This was originally stolen directly from resque... (thanks, @defunkt!)
166
280
  # Registers the various signal handlers a worker responds to.
167
281
  #
168
282
  # TERM: Shutdown immediately, stop processing jobs.
169
283
  # INT: Shutdown immediately, stop processing jobs.
170
284
  # QUIT: Shutdown after the current job has finished processing.
171
285
  # USR1: Kill the forked child immediately, continue processing jobs.
172
- # USR2: Don't process any new jobs
286
+ # USR2: Don't process any new jobs; dump the backtrace.
173
287
  # CONT: Start processing jobs again after a USR2
174
- def register_signal_handlers
288
+ def register_parent_signal_handlers
175
289
  trap('TERM') { shutdown! }
176
290
  trap('INT') { shutdown! }
177
291
 
178
292
  begin
179
293
  trap('QUIT') { shutdown }
180
294
  trap('USR1') { kill_child }
181
- trap('USR2') { pause_processing }
295
+ trap('USR2') do
296
+ log "Current backtrace (parent): \n\n#{caller.join("\n")}\n\n"
297
+ pause_processing
298
+ end
299
+
182
300
  trap('CONT') { unpause_processing }
183
301
  rescue ArgumentError
184
302
  warn "Signals QUIT, USR1, USR2, and/or CONT not supported."
185
303
  end
186
304
  end
187
305
 
306
+ def register_child_signal_handlers
307
+ trap('TERM') { raise SignalException.new("SIGTERM") }
308
+ trap('INT', 'DEFAULT')
309
+
310
+ begin
311
+ trap('QUIT', 'DEFAULT')
312
+ trap('USR1', 'DEFAULT')
313
+ trap('USR2', 'DEFAULT')
314
+
315
+ trap('USR2') do
316
+ log "Current backtrace (child): \n\n#{caller.join("\n")}\n\n"
317
+ end
318
+ rescue ArgumentError
319
+ end
320
+ end
321
+
188
322
  # Log a message to STDOUT if we are verbose or very_verbose.
189
323
  def log(message)
190
324
  if verbose
@@ -199,6 +333,82 @@ module Qless
199
333
  def log!(message)
200
334
  log message if very_verbose
201
335
  end
336
+
337
+ def start_parent_pub_sub_listener_for(client)
338
+ Subscriber.start(client, "ql:w:#{Qless.worker_name}") do |subscriber, message|
339
+ if message["event"] == "lock_lost" && message["jid"] == current_job_jid
340
+ fail_job_due_to_timeout
341
+ kill_child
342
+ end
343
+ end
344
+ end
345
+
346
+ def start_child_pub_sub_listener_for(client)
347
+ Subscriber.start(client, "ql:w:#{Qless.worker_name}:#{Process.pid}") do |subscriber, message|
348
+ if message["event"] == "notify_backtrace"
349
+ notify_parent_of_job_backtrace(client, message.fetch('notify_list'))
350
+ end
351
+ end
352
+ end
353
+
354
+ def with_job(job)
355
+ @job = job
356
+ yield
357
+ ensure
358
+ @job = nil
359
+ end
360
+
361
+ # To prevent race conditions (with our listener thread),
362
+ # we cannot use a pattern like `use(@job) if @job` because
363
+ # the value of `@job` could change between the checking of
364
+ # it and the use of it. Here we use a pattern that avoids
365
+ # the issue -- get the job into a local, and yield that if
366
+ # it is set.
367
+ def access_current_job
368
+ if job = @job
369
+ yield job
370
+ end
371
+ end
372
+
373
+ def current_job_jid
374
+ access_current_job &:jid
375
+ end
376
+
377
+ JobLockLost = Class.new(StandardError)
378
+
379
+ def fail_job_due_to_timeout
380
+ access_current_job do |job|
381
+ error = JobLockLost.new
382
+ error.set_backtrace(get_backtrace_from_child(job.client.redis))
383
+ fail_job(job, error, caller)
384
+ end
385
+ end
386
+
387
+ def notify_parent_of_job_backtrace(client, list)
388
+ job_backtrace = Thread.main.backtrace
389
+ client.redis.lpush list, JSON.dump(job_backtrace)
390
+ client.redis.pexpire list, BACKTRACE_EXPIRATION_TIMEOUT_MS
391
+ end
392
+
393
+ WAIT_FOR_CHILD_BACKTRACE_TIMEOUT = 4
394
+ BACKTRACE_EXPIRATION_TIMEOUT_MS = 60_000 # timeout after a minute
395
+
396
+ def get_backtrace_from_child(child_redis)
397
+ notification_list = "ql:child_backtraces:#{Qless.generate_jid}"
398
+ request_backtrace = { "event" => "notify_backtrace",
399
+ "notify_list" => notification_list }
400
+
401
+ if child_redis.publish("ql:w:#{Qless.worker_name}:#{@child}", JSON.dump(request_backtrace)).zero?
402
+ return ["Could not obtain child backtrace since it was not listening."]
403
+ end
404
+
405
+ begin
406
+ _, backtrace_json = child_redis.blpop(notification_list, WAIT_FOR_CHILD_BACKTRACE_TIMEOUT)
407
+ JSON.parse(backtrace_json)
408
+ rescue => e
409
+ ["Could not obtain child backtrace: #{e.class}: #{e.message}"] + e.backtrace
410
+ end
411
+ end
202
412
  end
203
413
  end
204
414