qless 0.9.2 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/Gemfile +2 -0
  2. data/README.md +42 -3
  3. data/Rakefile +26 -2
  4. data/{bin → exe}/qless-web +3 -2
  5. data/lib/qless.rb +55 -28
  6. data/lib/qless/config.rb +1 -3
  7. data/lib/qless/job.rb +127 -22
  8. data/lib/qless/job_reservers/round_robin.rb +3 -1
  9. data/lib/qless/job_reservers/shuffled_round_robin.rb +14 -0
  10. data/lib/qless/lua_script.rb +42 -0
  11. data/lib/qless/middleware/redis_reconnect.rb +24 -0
  12. data/lib/qless/middleware/retry_exceptions.rb +43 -0
  13. data/lib/qless/middleware/sentry.rb +70 -0
  14. data/lib/qless/qless-core/cancel.lua +89 -59
  15. data/lib/qless/qless-core/complete.lua +16 -1
  16. data/lib/qless/qless-core/config.lua +12 -0
  17. data/lib/qless/qless-core/deregister_workers.lua +12 -0
  18. data/lib/qless/qless-core/fail.lua +24 -14
  19. data/lib/qless/qless-core/heartbeat.lua +2 -1
  20. data/lib/qless/qless-core/pause.lua +18 -0
  21. data/lib/qless/qless-core/pop.lua +24 -3
  22. data/lib/qless/qless-core/put.lua +14 -1
  23. data/lib/qless/qless-core/qless-lib.lua +2354 -0
  24. data/lib/qless/qless-core/qless.lua +1862 -0
  25. data/lib/qless/qless-core/retry.lua +1 -1
  26. data/lib/qless/qless-core/unfail.lua +54 -0
  27. data/lib/qless/qless-core/unpause.lua +12 -0
  28. data/lib/qless/queue.rb +45 -21
  29. data/lib/qless/server.rb +38 -39
  30. data/lib/qless/server/static/css/docs.css +21 -1
  31. data/lib/qless/server/views/_job.erb +5 -5
  32. data/lib/qless/server/views/overview.erb +14 -9
  33. data/lib/qless/subscriber.rb +48 -0
  34. data/lib/qless/version.rb +1 -1
  35. data/lib/qless/wait_until.rb +19 -0
  36. data/lib/qless/worker.rb +243 -33
  37. metadata +49 -30
  38. data/bin/install_phantomjs +0 -7
  39. data/bin/qless-campfire +0 -106
  40. data/bin/qless-growl +0 -99
  41. data/lib/qless/lua.rb +0 -25
@@ -12,7 +12,6 @@ body {
12
12
  position: relative;
13
13
  padding-top: 90px;
14
14
  background-color: #fff;
15
- background-image: url(../img/grid-18px-masked.png);
16
15
  background-repeat: repeat-x;
17
16
  background-position: 0 40px;
18
17
  }
@@ -817,3 +816,24 @@ form.well {
817
816
  }
818
817
 
819
818
  }
819
+
820
+ /* For proper failed job display*/
821
+ .l-sidebyside {
822
+ overflow:auto;
823
+ }
824
+
825
+ .l-sidebyside > * {
826
+ display:inline-block;
827
+ *display: inline;
828
+ *zoom: 1;
829
+ }
830
+
831
+ .failed-job > .min-col-size {
832
+ min-width: 395px;
833
+ }
834
+
835
+ .failed-job .row {
836
+ margin-left:0;
837
+ margin-right:10px;
838
+ height: 30px;
839
+ }
@@ -112,12 +112,12 @@
112
112
  <div class="span6">
113
113
  <h3><small>History</small></h3>
114
114
  <div style="overflow-y:scroll; height: 200px">
115
- <% job.history.reverse.each do |h| %>
115
+ <% job.queue_history.reverse.each do |h| %>
116
116
  <pre><strong><%= h['q'] %></strong>
117
- Put: <%= strftime(Time.at(h['put'])) %><% if not h['popped'].nil? %>
118
- Pop: <%= strftime(Time.at(h['popped'])) %> by <%= h['worker'] %><% end %><% if not h['completed'].nil? %>
119
- Completed: <%= strftime(Time.at(h['completed'])) %><% end %><% if not h['failed'].nil? %>
120
- Failed: <%= strftime(Time.at(h['failed'])) %><% end %></pre>
117
+ Put: <%= strftime(h['put']) %><% if not h['popped'].nil? %>
118
+ Pop: <%= strftime(h['popped']) %> by <%= h['worker'] %><% end %><% if not h['completed'].nil? %>
119
+ Completed: <%= strftime(h['completed']) %><% end %><% if not h['failed'].nil? %>
120
+ Failed: <%= strftime(h['failed']) %><% end %></pre>
121
121
  <% end %>
122
122
  </div>
123
123
  </div>
@@ -18,7 +18,7 @@
18
18
  <%= queue['waiting'] %> /
19
19
  <%= queue['scheduled'] %> /
20
20
  <%= queue['stalled'] %> /
21
- <%= queue['depends'] %> /
21
+ <%= queue['depends'] %> /
22
22
  <%= queue['recurring'] %> <small>(running / waiting / scheduled / stalled / depends / recurring)</small>
23
23
  </h3>
24
24
  </div>
@@ -34,16 +34,22 @@
34
34
  <div class="page-header">
35
35
  <h1>Failed Jobs <small>D'oh!</small></h1>
36
36
  </div>
37
- <% failed.sort_by { |t, count| -count }.each do |t, count| %>
38
- <div class="row">
39
- <div class="span4">
40
- <h3 style="text-overflow: ellipsis; white-space: nowrap; overflow: hidden"><a href="<%= u "/failed/#{t}" %>"><%= t %></a></h3>
37
+ <div class="l-sidebyside failed-job">
38
+ <div class="min-col-size">
39
+ <% failed.sort_by { |t, count| -count }.each do |t, count| %>
40
+ <div class="row">
41
+ <h3><a href="<%= u "/failed/#{t}" %>"><%= t %></a></h3>
42
+ </div>
43
+ <% end %>
41
44
  </div>
42
- <div class="span8">
43
- <h3>| <%= count %> <small>Jobs</small></h3>
45
+ <div>
46
+ <% failed.sort_by { |t, count| -count }.each do |t, count| %>
47
+ <div class="row">
48
+ <h3>| <%= count %> <small>Jobs</small></h3>
49
+ </div>
50
+ <% end %>
44
51
  </div>
45
52
  </div>
46
- <% end %>
47
53
  <% end %>
48
54
 
49
55
  <% if tracked['jobs'].empty? %>
@@ -67,7 +73,6 @@
67
73
  <% end %>
68
74
  <% end %>
69
75
 
70
-
71
76
  <% if workers.empty? %>
72
77
  <div class="page-header">
73
78
  <h1>No Workers <small>Nobody's doin' nothin'!</small></h1>
@@ -0,0 +1,48 @@
1
+ require 'thread'
2
+ require 'qless/wait_until'
3
+
4
module Qless
  # Listens on a redis pub/sub channel from a background thread and passes
  # every JSON-decoded message to the callback supplied at construction.
  class Subscriber
    attr_reader :client, :channel

    # Builds a subscriber from the given arguments/block and starts its
    # listener thread.
    def self.start(*args, &block)
      subscriber = new(*args, &block)
      subscriber.start_pub_sub_listener
    end

    # client  - object providing `redis` and `new_redis_connection`
    # channel - name of the channel to listen on
    # block   - called as (subscriber, parsed_message) for each message
    def initialize(client, channel, &message_received_callback)
      @client = client
      @channel = channel
      @message_received_callback = message_received_callback

      # pub/sub blocks the connection so we must use a different redis connection
      @client_redis = client.redis
      @listener_redis = client.new_redis_connection

      # Private control channel, used only to detect that the listener
      # thread has actually subscribed.
      @my_channel = Qless.generate_jid
    end

    # Spawns the listener thread, then blocks until it is subscribed.
    def start_pub_sub_listener
      @thread = ::Thread.start { listen }
      wait_until_thread_listening
    end

    # Publishes on the private channel until exactly one subscriber has
    # received the message (i.e. our listener), for up to 10 seconds.
    def wait_until_thread_listening
      Qless::WaitUntil.wait_until(10) do
        @client_redis.publish(@my_channel, 'disconnect') == 1
      end
    end

    private

    # Subscribes to both the public and the private channel. A message on
    # the private channel only unsubscribes from that channel; any other
    # message is parsed and forwarded to the callback.
    def listen
      @listener_redis.subscribe(channel, @my_channel) do |on|
        on.message do |source_channel, message|
          if source_channel == @my_channel
            @listener_redis.unsubscribe(@my_channel)
          else
            @message_received_callback.call(self, JSON.parse(message))
          end
        end
      end
    end
  end
end
47
+
48
+
@@ -1,3 +1,3 @@
1
1
  module Qless
2
- VERSION = "0.9.2"
2
+ VERSION = "0.9.3"
3
3
  end
@@ -0,0 +1,19 @@
1
module Qless
  # Small polling helper: repeatedly evaluates a block until it returns a
  # truthy value or a timeout elapses.
  module WaitUntil
    # Raised when the block never became truthy within the timeout.
    TimeoutError = Class.new(StandardError)

    # Polls the given block roughly every 2ms.
    #
    # timeout - maximum number of seconds to keep polling
    #
    # Returns nil as soon as the block yields a truthy value.
    # Raises TimeoutError once the timeout has elapsed.
    def wait_until(timeout)
      # Measure elapsed time with the monotonic clock when the runtime
      # provides one, so wall-clock adjustments cannot trigger a premature
      # timeout or an overlong wait. Fall back to Time.now otherwise.
      clock =
        if Process.respond_to?(:clock_gettime) && defined?(Process::CLOCK_MONOTONIC)
          lambda { Process.clock_gettime(Process::CLOCK_MONOTONIC) }
        else
          lambda { Time.now.to_f }
        end

      timeout_at = clock.call + timeout

      loop do
        return if yield
        sleep 0.002
        if clock.call > timeout_at
          raise TimeoutError, "Timed out after #{timeout} seconds"
        end
      end
    end

    module_function :wait_until
  end
end
@@ -2,19 +2,25 @@ require 'qless'
2
2
  require 'time'
3
3
  require 'qless/job_reservers/ordered'
4
4
  require 'qless/job_reservers/round_robin'
5
+ require 'qless/job_reservers/shuffled_round_robin'
6
+ require 'qless/subscriber'
7
+ require 'qless/wait_until'
5
8
 
6
9
  module Qless
7
10
  # This is heavily inspired by Resque's excellent worker:
8
11
  # https://github.com/defunkt/resque/blob/v1.20.0/lib/resque/worker.rb
9
12
  class Worker
10
- def initialize(client, job_reserver, options = {})
11
- @client, @job_reserver = client, job_reserver
13
+ def initialize(job_reserver, options = {})
14
+ self.job_reserver = job_reserver
12
15
  @shutdown = @paused = false
13
16
 
14
17
  self.very_verbose = options[:very_verbose]
15
18
  self.verbose = options[:verbose]
16
19
  self.run_as_single_process = options[:run_as_single_process]
17
20
  self.output = options.fetch(:output, $stdout)
21
+ self.term_timeout = options.fetch(:term_timeout, 4.0)
22
+ @backtrace_replacements = { Dir.pwd => '.' }
23
+ @backtrace_replacements[ENV['GEM_HOME']] = '<GEM_HOME>' if ENV.has_key?('GEM_HOME')
18
24
 
19
25
  output.puts "\n\n\n" if verbose || very_verbose
20
26
  log "Instantiated Worker"
@@ -35,6 +41,13 @@ module Qless
35
41
  # Defaults to $stdout.
36
42
  attr_accessor :output
37
43
 
44
+ # The object responsible for reserving jobs from the Qless server,
45
+ # using some reasonable strategy (e.g. round robin or ordered)
46
+ attr_accessor :job_reserver
47
+
48
+ # How long the child process is given to exit before forcibly killing it.
49
+ attr_accessor :term_timeout
50
+
38
51
  # Starts a worker based on ENV vars. Supported ENV vars:
39
52
  # - REDIS_URL=redis://host:port/db-num (the redis gem uses this automatically)
40
53
  # - QUEUES=high,medium,low or QUEUE=blah
@@ -59,18 +72,22 @@ module Qless
59
72
  options[:very_verbose] = !!ENV['VVERBOSE']
60
73
  options[:run_as_single_process] = !!ENV['RUN_AS_SINGLE_PROCESS']
61
74
 
62
- new(client, reserver, options).work(interval)
75
+ new(reserver, options).work(interval)
63
76
  end
64
77
 
65
78
  def work(interval = 5.0)
66
79
  procline "Starting #{@job_reserver.description}"
67
- register_signal_handlers
80
+ register_parent_signal_handlers
81
+ uniq_clients.each { |client| start_parent_pub_sub_listener_for(client) }
68
82
 
69
83
  loop do
70
84
  break if shutdown?
71
- next if paused?
85
+ if paused?
86
+ sleep interval
87
+ next
88
+ end
72
89
 
73
- unless job = @job_reserver.reserve
90
+ unless job = reserve_job
74
91
  break if interval.zero?
75
92
  procline "Waiting for #{@job_reserver.description}"
76
93
  log! "Sleeping for #{interval} seconds"
@@ -78,31 +95,48 @@ module Qless
78
95
  next
79
96
  end
80
97
 
81
- log "got: #{job.inspect}"
82
-
83
- if run_as_single_process
84
- # We're staying in the same process
85
- procline "Single processing #{job.description}"
86
- perform(job)
87
- elsif @child = fork
88
- # We're in the parent process
89
- procline "Forked #{@child} for #{job.description}"
90
- Process.wait(@child)
91
- else
92
- # We're in the child process
93
- procline "Processing #{job.description}"
94
- perform(job)
95
- exit!
96
- end
98
+ perform_job_in_child_process(job)
97
99
  end
100
+ ensure
101
+ # make sure the worker deregisters on shutdown
102
+ deregister
98
103
  end
99
104
 
100
105
  def perform(job)
101
106
  around_perform(job)
102
107
  rescue Exception => error
103
- fail_job(job, error)
108
+ fail_job(job, error, caller)
104
109
  else
105
- job.complete unless job.state_changed?
110
+ try_complete(job)
111
+ end
112
+
113
# Asks the job reserver for the next job.
#
# Returns the reserved job, or nil when nothing was reserved or when
# reserving raised.
def reserve_job
  @job_reserver.reserve
rescue Exception => error
  # We want workers to durably stay up, so we don't want errors
  # during job reserving (e.g. network timeouts, etc) to kill
  # the worker.
  log "Got an error while reserving a job: #{error.class}: #{error.message}"

  # Return nil explicitly: the caller treats any truthy value as a job,
  # so the result must not depend on what `log` happens to return.
  nil
end
121
+
122
# Performs the job with it recorded as the current job (via with_job).
# When `fork` yields a child pid the parent just waits for that child;
# when it yields nothing the job is performed in this process.
def perform_job_in_child_process(job)
  with_job(job) do
    @child = fork do
      # Everything in this block runs in the forked child.
      job.reconnect_to_redis
      register_child_signal_handlers
      start_child_pub_sub_listener_for(job.client)
      procline "Processing #{job.description}"
      perform(job)
      exit! # don't run at_exit hooks
    end

    next wait_for_child if @child

    procline "Single processing #{job.description}"
    perform(job)
  end
end
107
141
 
108
142
  def shutdown
@@ -135,6 +169,33 @@ module Qless
135
169
 
136
170
  private
137
171
 
172
# Forks via the inherited implementation, unless the worker is configured
# to run as a single process, in which case nothing is forked and nil is
# returned.
def fork
  return if run_as_single_process

  super
end
175
+
176
# Deregisters this worker (by Qless.worker_name) on every distinct client.
def deregister
  uniq_clients.each { |client| client.deregister_workers(Qless.worker_name) }
end
181
+
182
# Returns the distinct clients behind the job reserver's queues.
#
# The list is cached per reserver object: because job_reserver is
# assignable, a cache that is never invalidated would keep returning the
# clients of a reserver that has since been replaced.
def uniq_clients
  reserver = @job_reserver
  if @uniq_clients.nil? || !@uniq_clients_source.equal?(reserver)
    @uniq_clients_source = reserver
    @uniq_clients = reserver.queues.map(&:client).uniq
  end
  @uniq_clients
end
185
+
186
# Completes the job unless its state already changed while it ran.
#
# Completion can be refused in a few cases:
#   - the job is already failed (i.e. by another worker)
#   - the job is being worked on by another worker
#   - the job has been cancelled
# None of these can (or should) be turned into a failure of the job, so
# the refusal is only logged.
def try_complete(job)
  return if job.state_changed?

  job.complete
rescue Job::CantCompleteError => e
  log "Failed to complete #{job.inspect}: #{e.message}"
end
198
+
138
199
  # Allow middleware modules to be mixed in and override the
139
200
  # definition of around_perform while providing a default
140
201
  # implementation so our code can assume the method is present.
@@ -144,47 +205,120 @@ module Qless
144
205
  end
145
206
  }
146
207
 
147
- def fail_job(job, error)
148
- group = "#{job.klass}:#{error.class}"
149
- message = "#{error.message}\n\n#{error.backtrace.join("\n")}"
208
# Fails the job under the group "<job class name>:<error class>", with a
# message made of the (possibly truncated) error message, a blank line,
# and the formatted backtrace.
#
# worker_backtrace - stack frames belonging to the worker itself; they are
#                    handed to format_failure_backtrace for filtering.
def fail_job(job, error, worker_backtrace)
  group = "#{job.klass_name}:#{error.class}"

  summary = truncated_message(error)
  trace = format_failure_backtrace(error.backtrace, worker_backtrace)
  message = [summary, trace].join("\n\n")

  log "Got #{group} failure from #{job.inspect}"
  job.fail(group, message)
end
153
214
 
215
# TODO: pull this out into a config option.
MAX_ERROR_MESSAGE_SIZE = 10_000

# Returns the error's message, cut down to max_size characters (with a
# marker appended) when it is longer than that.
#
# error    - any object responding to `message`
# max_size - maximum number of characters kept (defaults to
#            MAX_ERROR_MESSAGE_SIZE)
def truncated_message(error, max_size = MAX_ERROR_MESSAGE_SIZE)
  # Read the message once: `message` may be overridden and need not be
  # cheap or stable across calls.
  message = error.message
  return message if message.length <= max_size

  "#{message.slice(0, max_size)}... (truncated due to length)"
end
221
+
222
# Builds the backtrace text stored with a failed job.
#
# Frames that also appear in worker_backtrace are dropped, then each
# remaining frame has the first occurrence of every key of
# @backtrace_replacements substituted by its value. Frames are joined
# with newlines.
#
# A nil backtrace (an exception that was never raised has none) is
# treated as empty instead of raising NoMethodError.
def format_failure_backtrace(error_backtrace, worker_backtrace)
  job_frames = Array(error_backtrace) - Array(worker_backtrace)

  job_frames.map do |frame|
    @backtrace_replacements.inject(frame) do |text, (original, replacement)|
      text.sub(original, replacement)
    end
  end.join("\n")
end
229
+
154
230
  def procline(value)
155
231
  $0 = "Qless-#{Qless::VERSION}: #{value} at #{Time.now.iso8601}"
156
232
  log! $0
157
233
  end
158
234
 
235
# Runs in the parent after forking: reseeds the random number generator,
# updates the process title, and blocks until the child has exited.
# A system-call error raised while waiting is swallowed and yields nil.
def wait_for_child
  srand # Reseeding
  procline "Forked #{@child} at #{Time.now.to_i}"

  begin
    Process.wait(@child)
  rescue SystemCallError
    nil
  end
end
244
+
245
# Kills the forked child with minimal remorse; the job it is processing
# will not be completed. If the child has not exited already, it is sent a
# TERM signal, and then a KILL signal unless quit_gracefully? reports that
# it exited in time.
def kill_child
  return unless @child

  if Process.waitpid(@child, Process::WNOHANG)
    log "Child #{@child} already quit."
    return
  end

  signal_child("TERM", @child)
  return if quit_gracefully?(@child)

  signal_child("KILL", @child)
rescue SystemCallError
  # Raised by waitpid/kill when the child no longer exists.
  log "Child #{@child} already quit and reaped."
end
262
+
263
# Logs which signal is about to be sent to which child, then sends it.
def signal_child(signal, child)
  log("Sending #{signal} signal to child #{child}")
  Process.kill(signal, child)
end
268
+
269
# Has the child exited within the timeout limit?
#
# Checks (without blocking) once every 0.1 seconds, for
# term_timeout * 10 rounds. Returns true as soon as the child has been
# reaped, false if every round passed without that happening.
def quit_gracefully?(child)
  rounds = (term_timeout.to_f * 10).round

  rounds.times.any? do
    sleep(0.1)
    Process.waitpid(child, Process::WNOHANG)
  end
end
164
278
 
165
- # This is stolen directly from resque... (thanks, @defunkt!)
279
# Installs the signal handlers of the parent worker process.
# (This was originally stolen directly from resque... thanks, @defunkt!)
#
# TERM: Shutdown immediately, stop processing jobs.
#  INT: Shutdown immediately, stop processing jobs.
# QUIT: Shutdown after the current job has finished processing.
# USR1: Kill the forked child immediately, continue processing jobs.
# USR2: Don't process any new jobs; dump the backtrace.
# CONT: Start processing jobs again after a USR2
def register_parent_signal_handlers
  %w[TERM INT].each do |signal|
    trap(signal) { shutdown! }
  end

  # The remaining signals are registered together; if trap rejects one
  # with ArgumentError, the rest of the group is skipped and a warning
  # is printed.
  begin
    trap('QUIT') { shutdown }
    trap('USR1') { kill_child }
    trap('USR2') do
      log "Current backtrace (parent): \n\n#{caller.join("\n")}\n\n"
      pause_processing
    end
    trap('CONT') { unpause_processing }
  rescue ArgumentError
    warn "Signals QUIT, USR1, USR2, and/or CONT not supported."
  end
end
187
305
 
306
# Installs the signal handlers used inside the forked child.
#
# TERM: raises SignalException("SIGTERM") in the child.
#  INT, QUIT, USR1: reset to the default handler.
# USR2: logs the child's current backtrace.
def register_child_signal_handlers
  trap('TERM') { raise SignalException.new("SIGTERM") }
  trap('INT', 'DEFAULT')

  begin
    trap('QUIT', 'DEFAULT')
    trap('USR1', 'DEFAULT')

    # USR2 gets a real handler, so it is not reset to DEFAULT first --
    # that reset would be overwritten immediately.
    trap('USR2') do
      log "Current backtrace (child): \n\n#{caller.join("\n")}\n\n"
    end
  rescue ArgumentError
    # Deliberately silent: trap rejected one of these signals, and the
    # child simply runs without the remaining handlers.
  end
end
321
+
188
322
  # Log a message to STDOUT if we are verbose or very_verbose.
189
323
  def log(message)
190
324
  if verbose
@@ -199,6 +333,82 @@ module Qless
199
333
  def log!(message)
200
334
  log message if very_verbose
201
335
  end
336
+
337
# Starts a subscriber on this worker's channel for the given client.
# When a "lock_lost" event arrives for the job currently being worked on,
# that job is failed and the child is killed.
def start_parent_pub_sub_listener_for(client)
  worker_channel = "ql:w:#{Qless.worker_name}"

  Subscriber.start(client, worker_channel) do |_subscriber, message|
    next unless message["event"] == "lock_lost" && message["jid"] == current_job_jid

    fail_job_due_to_timeout
    kill_child
  end
end
345
+
346
# Starts a subscriber on the channel specific to this process (worker
# name plus pid). A "notify_backtrace" event makes the process push its
# backtrace onto the list named in the message's 'notify_list' entry.
def start_child_pub_sub_listener_for(client)
  process_channel = "ql:w:#{Qless.worker_name}:#{Process.pid}"

  Subscriber.start(client, process_channel) do |_subscriber, message|
    next unless message["event"] == "notify_backtrace"

    notify_parent_of_job_backtrace(client, message.fetch('notify_list'))
  end
end
353
+
354
# Records the job as the current job for the duration of the block and
# always clears it afterwards, even if the block raises. Returns the
# block's value.
def with_job(job)
  @job = job
  begin
    yield
  ensure
    @job = nil
  end
end
360
+
361
# Yields the current job, if there is one.
#
# @job can be changed by another thread (our listener thread) at any
# moment, so a pattern like `use(@job) if @job` is racy: the value could
# change between the check and the use. Reading @job exactly once into a
# local and working only with that local avoids the problem.
def access_current_job
  job = @job
  yield job if job
end
372
+
373
# Returns the jid of the current job, or nil when no job is current.
def current_job_jid
  access_current_job { |job| job.jid }
end
376
+
377
# Error recorded for a job that is failed by fail_job_due_to_timeout.
class JobLockLost < StandardError; end

# Fails the current job (if any) with a JobLockLost error whose backtrace
# is whatever get_backtrace_from_child returns for the job's redis
# connection.
def fail_job_due_to_timeout
  access_current_job do |job|
    child_backtrace = get_backtrace_from_child(job.client.redis)

    lock_lost = JobLockLost.new
    lock_lost.set_backtrace(child_backtrace)

    fail_job(job, lock_lost, caller)
  end
end
386
+
387
# Pushes the main thread's current backtrace, JSON-encoded, onto the
# given redis list, then puts an expiry of
# BACKTRACE_EXPIRATION_TIMEOUT_MS milliseconds on that list.
def notify_parent_of_job_backtrace(client, list)
  payload = JSON.dump(Thread.main.backtrace)

  client.redis.lpush(list, payload)
  client.redis.pexpire(list, BACKTRACE_EXPIRATION_TIMEOUT_MS)
end
392
+
393
WAIT_FOR_CHILD_BACKTRACE_TIMEOUT = 4
BACKTRACE_EXPIRATION_TIMEOUT_MS = 60_000 # timeout after a minute

# Asks the child process for its current backtrace over redis.
#
# A "notify_backtrace" request naming a fresh reply list is published on
# the child's channel; the reply is then awaited on that list for up to
# WAIT_FOR_CHILD_BACKTRACE_TIMEOUT seconds.
#
# Returns the child's backtrace as an array of strings. On any problem
# (nobody listening, no reply in time, malformed reply, redis error) it
# returns an array describing the problem instead, so callers can always
# use the result as a backtrace.
def get_backtrace_from_child(child_redis)
  notification_list = "ql:child_backtraces:#{Qless.generate_jid}"
  request_backtrace = { "event" => "notify_backtrace",
                        "notify_list" => notification_list }
  child_channel = "ql:w:#{Qless.worker_name}:#{@child}"

  # publish returns the number of subscribers that received the message.
  if child_redis.publish(child_channel, JSON.dump(request_backtrace)).zero?
    return ["Could not obtain child backtrace since it was not listening."]
  end

  begin
    reply = child_redis.blpop(notification_list, WAIT_FOR_CHILD_BACKTRACE_TIMEOUT)

    # blpop yields nil when the timeout expires; report that directly
    # rather than letting JSON.parse(nil) fail with a confusing TypeError.
    if reply.nil?
      return ["Could not obtain child backtrace: timed out after " \
              "#{WAIT_FOR_CHILD_BACKTRACE_TIMEOUT} seconds waiting for the child to respond."]
    end

    _, backtrace_json = reply
    JSON.parse(backtrace_json)
  rescue => e
    ["Could not obtain child backtrace: #{e.class}: #{e.message}"] + Array(e.backtrace)
  end
end
202
412
  end
203
413
  end
204
414