inst-jobs 2.0.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d674b7da21caf04eb87ff9823ed549c93a901219669316090d088f0699564e59
4
- data.tar.gz: 021456d34f12eff8cc988db866018d701fc77ffdbb57e9fb308fc1bd25a91ecb
3
+ metadata.gz: c062c222e731bd490efe572108a508bf78faabee4479f7fe6927a89688d9ef0b
4
+ data.tar.gz: 3b1678fc017230e990bc7e8d4e652c23ab59413953ce72a312b13adfa7626193
5
5
  SHA512:
6
- metadata.gz: ad78cfdd9026db24b714c532c8ee837a875e443afc375909f0c130e3cfbf87d1f872344f982d931838bfa6649a2f1edc59430f6444a2baee08f8afb568015cfc
7
- data.tar.gz: e2b127477f0687958178505628b9544aa5c49e7aa1d0ceef32892250aa26aeb1c77f12bcacd6682e17c2bc379f987b154a0f982e029852432c39f7b3a5335df8
6
+ metadata.gz: 8c1a722c17c9abc8f5c8a44cb28f6584dc9fb16c1edcccc8df566ad21a5f81af7a54fb70282e2689aee11947dcd96f44ca01dfe542d71c8d3d6b7f145a572ce7
7
+ data.tar.gz: 9a7a65c71820d4b04f1e1ac2bf498cf030490a597d075d87d4399a392a7da1bbf50cfb3d5eeb1dea9c357d11e00aabf5e469f062c1fe9cc4b02cc8ed08e1a192
@@ -87,6 +87,10 @@ module Delayed
87
87
  batches[batch_enqueue_args] << kwargs
88
88
  return true
89
89
  else
90
+ if kwargs[:on_conflict].present?
91
+ Delayed::Logging.logger.warn("[DELAYED_JOB] WARNING: providing 'on_conflict' as an option to a non-singleton job will have no effect. Discarding.")
92
+ kwargs.delete(:on_conflict)
93
+ end
90
94
  job = self.create(**kwargs)
91
95
  end
92
96
 
@@ -178,6 +182,10 @@ module Delayed
178
182
  expires_at && (self.class.db_time_now >= expires_at)
179
183
  end
180
184
 
185
+ def inferred_max_attempts
186
+ self.max_attempts || Delayed::Settings.max_attempts
187
+ end
188
+
181
189
  # Reschedule the job in the future (when a job fails).
182
190
  # Uses an exponential scale depending on the number of failed attempts.
183
191
  def reschedule(error = nil, time = nil)
@@ -190,7 +198,7 @@ module Delayed
190
198
 
191
199
  self.attempts += 1 unless return_code == :unlock
192
200
 
193
- if self.attempts >= (self.max_attempts || Delayed::Settings.max_attempts)
201
+ if self.attempts >= self.inferred_max_attempts
194
202
  permanent_failure error || "max attempts reached"
195
203
  elsif expired?
196
204
  permanent_failure error || "job has expired"
@@ -12,6 +12,7 @@ module Delayed
12
12
  :loop => [:worker],
13
13
  :perform => [:worker, :job],
14
14
  :pop => [:worker],
15
+ :retry => [:worker, :job, :exception],
15
16
  :work_queue_pop => [:work_queue, :worker_config],
16
17
  :check_for_work => [:work_queue],
17
18
  }
@@ -56,10 +56,7 @@ class Periodic
56
56
  inferred_args = {
57
57
  max_attempts: 1,
58
58
  run_at: @cron.next_time(Delayed::Periodic.now).utc.to_time,
59
- singleton: (@job_args[:singleton] == false ? nil : tag),
60
- # yes, checking for whether it is actually the boolean literal false,
61
- # which means the consuming code really does not want this job to be
62
- # a singleton at all.
59
+ singleton: tag,
63
60
  on_conflict: :patient
64
61
  }
65
62
  @job_args.merge(inferred_args)
@@ -39,6 +39,7 @@ class Pool
39
39
  Process.wait unlock_pid
40
40
 
41
41
  spawn_periodic_auditor
42
+ spawn_abandoned_job_cleanup
42
43
  spawn_all_workers
43
44
  say "Workers spawned"
44
45
  join
@@ -111,6 +112,34 @@ class Pool
111
112
  end
112
113
  end
113
114
 
115
+ def spawn_abandoned_job_cleanup
116
+ return if Settings.disable_abandoned_job_cleanup
117
+ cleanup_interval_in_minutes = 60
118
+ @abandoned_cleanup_thread = Thread.new do
119
+ # every hour (staggered by process)
120
+ # check for dead jobs and cull them.
121
+ # Will actually be more often based on the
122
+ # number of worker nodes in the pool. This will actually
123
+ # be a max of N times per hour where N is the number of workers,
124
+ # but they won't overrun each other because the health check
125
+ # takes an advisory lock internally
126
+ sleep(rand(cleanup_interval_in_minutes * 60))
127
+ loop do
128
+ schedule_abandoned_job_cleanup
129
+ sleep(cleanup_interval_in_minutes * 60)
130
+ end
131
+ end
132
+ end
133
+
134
+ def schedule_abandoned_job_cleanup
135
+ pid = fork_with_reconnects do
136
+ # we want to avoid db connections in the main pool process
137
+ $0 = "delayed_abandoned_job_cleanup"
138
+ Delayed::Worker::HealthCheck.reschedule_abandoned_jobs
139
+ end
140
+ workers[pid] = :abandoned_job_cleanup
141
+ end
142
+
114
143
  def spawn_periodic_auditor
115
144
  return if Settings.disable_periodic_jobs
116
145
 
@@ -217,6 +246,8 @@ class Pool
217
246
  case worker
218
247
  when :periodic_audit
219
248
  say "ran auditor: #{worker}"
249
+ when :abandoned_job_cleanup
250
+ say "ran cleanup: #{worker}"
220
251
  when :work_queue
221
252
  say "work queue exited, restarting", :info
222
253
  spawn_work_queue
@@ -8,6 +8,7 @@ module Delayed
8
8
  module Settings
9
9
  SETTINGS = [
10
10
  :default_job_options,
11
+ :disable_abandoned_job_cleanup,
11
12
  :disable_periodic_jobs,
12
13
  :disable_automatic_orphan_unlocking,
13
14
  :fetch_batch_size,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Delayed
4
- VERSION = "2.0.0"
4
+ VERSION = "2.2.0"
5
5
  end
@@ -3,6 +3,17 @@
3
3
  module Delayed
4
4
 
5
5
  class TimeoutError < RuntimeError; end
6
+ class RetriableError < RuntimeError
7
+ # this error is a special case. You _should_ raise
8
+ # it from inside the rescue block for another error,
9
+ # because it indicates: "something made this job fail
10
+ # but we're pretty sure it's transient and it's safe to try again".
11
+ # the workflow is still the same (retry will happen unless
12
+ # retries are exhausted), but it won't call the :error
13
+ # callback unless it can't retry anymore. It WILL call the
14
+ # separate ":retry" callback, which is ONLY activated
15
+ # for this kind of error.
16
+ end
6
17
 
7
18
  require 'tmpdir'
8
19
  require 'set'
@@ -94,7 +105,11 @@ class Worker
94
105
  end
95
106
 
96
107
  def exit?
97
- @exit
108
+ !!@exit || parent_exited?
109
+ end
110
+
111
+ def parent_exited?
112
+ @parent_pid && @parent_pid != Process.ppid
98
113
  end
99
114
 
100
115
  def wake_up
@@ -198,32 +213,38 @@ class Worker
198
213
  end
199
214
 
200
215
  def perform(job)
201
- count = 1
202
- raise Delayed::Backend::JobExpired, "job expired at #{job.expires_at}" if job.expired?
203
- self.class.lifecycle.run_callbacks(:perform, self, job) do
204
- set_process_name("run:#{Settings.worker_procname_prefix}#{job.id}:#{job.name}")
205
- logger.info("Processing #{log_job(job, :long)}")
206
- runtime = Benchmark.realtime do
207
- if job.batch?
208
- # each job in the batch will have perform called on it, so we don't
209
- # need a timeout around this
210
- count = perform_batch(job)
211
- else
212
- job.invoke_job
216
+ begin
217
+ count = 1
218
+ raise Delayed::Backend::JobExpired, "job expired at #{job.expires_at}" if job.expired?
219
+ self.class.lifecycle.run_callbacks(:perform, self, job) do
220
+ set_process_name("run:#{Settings.worker_procname_prefix}#{job.id}:#{job.name}")
221
+ logger.info("Processing #{log_job(job, :long)}")
222
+ runtime = Benchmark.realtime do
223
+ if job.batch?
224
+ # each job in the batch will have perform called on it, so we don't
225
+ # need a timeout around this
226
+ count = perform_batch(job)
227
+ else
228
+ job.invoke_job
229
+ end
230
+ job.destroy
213
231
  end
214
- job.destroy
232
+ logger.info("Completed #{log_job(job)} #{"%.0fms" % (runtime * 1000)}")
233
+ end
234
+ rescue ::Delayed::RetriableError => re
235
+ can_retry = job.attempts + 1 < job.inferred_max_attempts
236
+ callback_type = can_retry ? :retry : :error
237
+ self.class.lifecycle.run_callbacks(callback_type, self, job, re) do
238
+ handle_failed_job(job, re)
239
+ end
240
+ rescue SystemExit => se
241
+ # There wasn't really a failure here so no callbacks and whatnot needed,
242
+ # still reschedule the job though.
243
+ job.reschedule(se)
244
+ rescue Exception => e
245
+ self.class.lifecycle.run_callbacks(:error, self, job, e) do
246
+ handle_failed_job(job, e)
215
247
  end
216
- logger.info("Completed #{log_job(job)} #{"%.0fms" % (runtime * 1000)}")
217
- end
218
- count
219
- rescue SystemExit => se
220
- # There wasn't really a failure here so no callbacks and whatnot needed,
221
- # still reschedule the job though.
222
- job.reschedule(se)
223
- count
224
- rescue Exception => e
225
- self.class.lifecycle.run_callbacks(:error, self, job, e) do
226
- handle_failed_job(job, e)
227
248
  end
228
249
  count
229
250
  end
@@ -23,12 +23,13 @@ module Delayed
23
23
  def reschedule_abandoned_jobs
24
24
  return if Settings.worker_health_check_type == :none
25
25
  Delayed::Job.transaction do
26
- # this job is a special case, and is not a singleton
26
+ # this action is a special case, and SHOULD NOT be a periodic job
27
27
  # because if it gets wiped out suddenly during execution
28
28
  # it can't go clean up its abandoned self. Therefore,
29
- # we try to get an advisory lock when it runs. If we succeed,
30
- # no other job is trying to do this right now (and if we abandon the
31
- # job, the transaction will end, releasing the advisory lock).
29
+ # we expect it to get run from its own process forked from the job pool
30
+ # and we try to get an advisory lock when it runs. If we succeed,
31
+ # no other worker is trying to do this right now (and if we abandon the
32
+ # operation, the transaction will end, releasing the advisory lock).
32
33
  result = attempt_advisory_lock
33
34
  return unless result
34
35
  checker = Worker::HealthCheck.build(
@@ -59,8 +60,8 @@ module Delayed
59
60
 
60
61
  def attempt_advisory_lock
61
62
  lock_name = "Delayed::Worker::HealthCheck#reschedule_abandoned_jobs"
62
- output = ActiveRecord::Base.connection.execute("SELECT pg_try_advisory_xact_lock(half_md5_as_bigint('#{lock_name}'));")
63
- output.getvalue(0, 0)
63
+ conn = ActiveRecord::Base.connection
64
+ conn.select_value("SELECT pg_try_advisory_xact_lock(#{conn.quote_table_name('half_md5_as_bigint')}('#{lock_name}'));")
64
65
  end
65
66
  end
66
67
 
@@ -14,6 +14,7 @@ RSpec.describe Delayed::Periodic do
14
14
  ensure
15
15
  Delayed::Periodic.scheduled = prev_sched
16
16
  Delayed::Periodic.overrides = prev_ovr
17
+ Delayed::Job.delete_all
17
18
  end
18
19
 
19
20
  describe ".cron" do
@@ -26,14 +27,5 @@ RSpec.describe Delayed::Periodic do
26
27
  expect(instance).to_not be_nil
27
28
  expect(instance.enqueue_args[:singleton]).to eq("periodic: just a test")
28
29
  end
29
-
30
- it "uses no singleton if told to skip" do
31
- Delayed::Periodic.cron job_name, '*/10 * * * *', {singleton: false} do
32
- # no-op
33
- end
34
- instance = Delayed::Periodic.scheduled[job_name]
35
- expect(instance).to_not be_nil
36
- expect(instance.enqueue_args[:singleton]).to be_nil
37
- end
38
30
  end
39
31
  end
@@ -6,6 +6,11 @@ describe Delayed::Worker do
6
6
  let(:worker_config) { {
7
7
  queue: "test", min_priority: 1, max_priority: 2, stuff: "stuff",
8
8
  }.freeze }
9
+ let(:job_attrs) { {
10
+ id: 42, name: "testjob", full_name: "testfullname", :last_error= => nil,
11
+ attempts: 1, reschedule: nil, :expired? => false,
12
+ payload_object: {}, priority: 25
13
+ }.freeze }
9
14
  subject { described_class.new(worker_config.dup) }
10
15
 
11
16
  after { Delayed::Worker.lifecycle.reset! }
@@ -14,9 +19,24 @@ describe Delayed::Worker do
14
19
  it "fires off an error callback when a job raises an exception" do
15
20
  fired = false
16
21
  Delayed::Worker.lifecycle.before(:error) {|worker, exception| fired = true}
17
- job = double(:last_error= => nil, attempts: 1, reschedule: nil)
18
- subject.perform(job)
22
+ job = double(job_attrs)
23
+ output_count = subject.perform(job)
19
24
  expect(fired).to be_truthy
25
+ expect(output_count).to eq(1)
26
+ end
27
+
28
+ it "uses the retry callback for a retriable exception" do
29
+ error_fired = retry_fired = false
30
+ Delayed::Worker.lifecycle.before(:error) {|worker, exception| error_fired = true }
31
+ Delayed::Worker.lifecycle.before(:retry) {|worker, exception| retry_fired = true}
32
+ job = Delayed::Job.new(payload_object: {}, priority: 25, strand: "test_jobs", max_attempts: 3)
33
+ expect(job).to receive(:invoke_job) do
34
+ raise Delayed::RetriableError, "that's all this job does"
35
+ end
36
+ output_count = subject.perform(job)
37
+ expect(error_fired).to be_falsey
38
+ expect(retry_fired).to be_truthy
39
+ expect(output_count).to eq(1)
20
40
  end
21
41
 
22
42
  it "reloads" do
@@ -35,7 +55,7 @@ describe Delayed::Worker do
35
55
  expect(ActionDispatch::Reloader).to receive(:prepare!).once
36
56
  expect(ActionDispatch::Reloader).to receive(:cleanup!).once
37
57
  end
38
- job = double(:last_error= => nil, attempts: 0, reschedule: nil, expired?: false)
58
+ job = double(job_attrs)
39
59
  subject.perform(job)
40
60
  end
41
61
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: inst-jobs
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tobias Luetke
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2020-12-07 00:00:00.000000000 Z
12
+ date: 2021-01-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activerecord