inst-jobs 3.0.5 → 3.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 11059ce779a0ff644edcae25335e23b7ad2a4448dfb48e8cbc95295f3e6c468c
4
- data.tar.gz: 84dfa8a1185219823013e1363190b0469e397dd423105eb29a205fea1e7207fa
3
+ metadata.gz: 245e01e85640f50351b2bca5ae7b7391c5373c0f8bc5128f1294d9e7566b5346
4
+ data.tar.gz: 38622bfe41a62682e198119a6d86e10210e06341d1a045f30cdabd6281600566
5
5
  SHA512:
6
- metadata.gz: 9e4c5291673edccd5760cd11d087102e0d8829d25793a6ca4f9f3255336b3abb1ce2bcf7426a79a76d0f373b44ec2b0152c395974f78a4552942f7a858d9d499
7
- data.tar.gz: 77aea9b18c3492e2b98f26a23c0591c8da9a31a34c0736868f00d54b3eadff77f7b2bbeb00b91b0819f93bffcefb7a26d1f2c829ebd82bc6b58a5c5a7bb01b16
6
+ metadata.gz: ced7dd5a9cfe21b545d1ade7d05efabd53de6eff97589626dd38ce650b1da641d93811b48848339bed3057f3e873c77f6c6b4d5d3d29640a68694d40f20402b2
7
+ data.tar.gz: 82f0bfad222bef95dec0c154e6a649c184ba4f09c122db0a52e6f5d17f30ce836987def104d4b49fab4b293838e0ba37034d1cad3fdb4e2df2ebe6c7a0556ec3
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ class FixSingletonRaceConditionInsert < ActiveRecord::Migration[5.2]
4
+ def change
5
+ reversible do |direction|
6
+ direction.up do
7
+ execute(<<~SQL)
8
+ CREATE OR REPLACE FUNCTION delayed_jobs_before_insert_row_tr_fn () RETURNS trigger AS $$
9
+ BEGIN
10
+ IF NEW.strand IS NOT NULL THEN
11
+ PERFORM pg_advisory_xact_lock(half_md5_as_bigint(NEW.strand));
12
+ IF (SELECT COUNT(*) FROM (
13
+ SELECT 1 FROM delayed_jobs WHERE strand = NEW.strand AND next_in_strand=true LIMIT NEW.max_concurrent
14
+ ) s) = NEW.max_concurrent THEN
15
+ NEW.next_in_strand := false;
16
+ END IF;
17
+ END IF;
18
+ IF NEW.singleton IS NOT NULL THEN
19
+ PERFORM pg_advisory_xact_lock(half_md5_as_bigint(CONCAT('singleton:', NEW.singleton)));
20
+ -- this condition seems silly, but it forces postgres to use the two partial indexes on singleton,
21
+ -- rather than doing a seq scan
22
+ PERFORM 1 FROM delayed_jobs WHERE singleton = NEW.singleton AND (locked_by IS NULL OR locked_by IS NOT NULL);
23
+ IF FOUND THEN
24
+ NEW.next_in_strand := false;
25
+ END IF;
26
+ END IF;
27
+ RETURN NEW;
28
+ END;
29
+ $$ LANGUAGE plpgsql;
30
+ SQL
31
+ end
32
+ direction.down do
33
+ execute(<<~SQL)
34
+ CREATE OR REPLACE FUNCTION delayed_jobs_before_insert_row_tr_fn () RETURNS trigger AS $$
35
+ BEGIN
36
+ IF NEW.strand IS NOT NULL THEN
37
+ PERFORM pg_advisory_xact_lock(half_md5_as_bigint(NEW.strand));
38
+ IF (SELECT COUNT(*) FROM (
39
+ SELECT 1 FROM delayed_jobs WHERE strand = NEW.strand AND next_in_strand=true LIMIT NEW.max_concurrent
40
+ ) s) = NEW.max_concurrent THEN
41
+ NEW.next_in_strand := false;
42
+ END IF;
43
+ END IF;
44
+ IF NEW.singleton IS NOT NULL THEN
45
+ -- this condition seems silly, but it forces postgres to use the two partial indexes on singleton,
46
+ -- rather than doing a seq scan
47
+ PERFORM 1 FROM delayed_jobs WHERE singleton = NEW.singleton AND (locked_by IS NULL OR locked_by IS NOT NULL);
48
+ IF FOUND THEN
49
+ NEW.next_in_strand := false;
50
+ END IF;
51
+ END IF;
52
+ RETURN NEW;
53
+ END;
54
+ $$ LANGUAGE plpgsql;
55
+ SQL
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,207 @@
1
+ # frozen_string_literal: true
2
+
3
+ class FixSingletonRaceConditionDelete < ActiveRecord::Migration[6.0]
4
+ def up
5
+ execute(<<~SQL)
6
+ CREATE OR REPLACE FUNCTION delayed_jobs_after_delete_row_tr_fn () RETURNS trigger AS $$
7
+ DECLARE
8
+ next_strand varchar;
9
+ running_count integer;
10
+ should_lock boolean;
11
+ should_be_precise boolean;
12
+ update_query varchar;
13
+ skip_locked varchar;
14
+ transition boolean;
15
+ BEGIN
16
+ IF OLD.strand IS NOT NULL THEN
17
+ should_lock := true;
18
+ should_be_precise := OLD.id % (OLD.max_concurrent * 4) = 0;
19
+
20
+ IF NOT should_be_precise AND OLD.max_concurrent > 16 THEN
21
+ running_count := (SELECT COUNT(*) FROM (
22
+ SELECT 1 as one FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
23
+ ) subquery_for_count);
24
+ should_lock := running_count < OLD.max_concurrent;
25
+ END IF;
26
+
27
+ IF should_lock THEN
28
+ PERFORM pg_advisory_xact_lock(half_md5_as_bigint(OLD.strand));
29
+ END IF;
30
+
31
+ -- note that we don't really care if the row we're deleting has a singleton, or if it even
32
+ -- matches the row(s) we're going to update. we just need to make sure that whatever
33
+ -- singleton we grab isn't already running (which is a simple existence check, since
34
+ -- the unique indexes ensure there is at most one singleton running, and one queued)
35
+ update_query := 'UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
36
+ SELECT id FROM delayed_jobs j2
37
+ WHERE next_in_strand=false AND
38
+ j2.strand=$1.strand AND
39
+ (j2.singleton IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.singleton=j2.singleton AND j3.id<>j2.id AND (j3.locked_by IS NULL OR j3.locked_by IS NOT NULL)))
40
+ ORDER BY j2.strand_order_override ASC, j2.id ASC
41
+ LIMIT ';
42
+
43
+ IF should_be_precise THEN
44
+ running_count := (SELECT COUNT(*) FROM (
45
+ SELECT 1 FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
46
+ ) s);
47
+ IF running_count < OLD.max_concurrent THEN
48
+ update_query := update_query || '($1.max_concurrent - $2)';
49
+ ELSE
50
+ -- we have too many running already; just bail
51
+ RETURN OLD;
52
+ END IF;
53
+ ELSE
54
+ update_query := update_query || '1';
55
+
56
+ -- n-strands don't require precise ordering; we can make this query more performant
57
+ IF OLD.max_concurrent > 1 THEN
58
+ skip_locked := ' SKIP LOCKED';
59
+ END IF;
60
+ END IF;
61
+
62
+ update_query := update_query || ' FOR UPDATE' || COALESCE(skip_locked, '') || ')';
63
+ EXECUTE update_query USING OLD, running_count;
64
+ END IF;
65
+
66
+ IF OLD.singleton IS NOT NULL THEN
67
+ PERFORM pg_advisory_xact_lock(half_md5_as_bigint(CONCAT('singleton:', OLD.singleton)));
68
+
69
+ transition := EXISTS (SELECT 1 FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL);
70
+
71
+ IF transition THEN
72
+ next_strand := (SELECT j1.strand FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL AND j1.strand IS NOT NULL LIMIT 1);
73
+
74
+ IF next_strand IS NOT NULL THEN
75
+ -- if the singleton has a new strand defined, we need to lock it to ensure we obey n_strand constraints --
76
+ IF NOT pg_try_advisory_xact_lock(half_md5_as_bigint(next_strand)) THEN
77
+ -- a failure to acquire the lock means that another process already has it and will thus handle this singleton --
78
+ RETURN OLD;
79
+ END IF;
80
+ END IF;
81
+ ELSIF OLD.strand IS NOT NULL THEN
82
+ -- if there is no transition and there is a strand then we have already handled this singleton in the case above --
83
+ RETURN OLD;
84
+ END IF;
85
+
86
+ -- handles transitioning a singleton from stranded to not stranded --
87
+ -- handles transitioning a singleton from unstranded to stranded --
88
+ -- handles transitioning a singleton from strand A to strand B --
89
+ -- these transitions are a relatively rare case, so we take a shortcut and --
90
+ -- only start the next singleton if its strand does not currently have any running jobs --
91
+ -- if it does, the next stranded job that finishes will start this singleton if it can --
92
+ UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
93
+ SELECT id FROM delayed_jobs j2
94
+ WHERE next_in_strand=false AND
95
+ j2.singleton=OLD.singleton AND
96
+ j2.locked_by IS NULL AND
97
+ (j2.strand IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.strand=j2.strand AND j3.id<>j2.id))
98
+ FOR UPDATE
99
+ );
100
+ END IF;
101
+ RETURN OLD;
102
+ END;
103
+ $$ LANGUAGE plpgsql;
104
+ SQL
105
+ end
106
+
107
+ def down
108
+ execute(<<~SQL)
109
+ CREATE OR REPLACE FUNCTION delayed_jobs_after_delete_row_tr_fn () RETURNS trigger AS $$
110
+ DECLARE
111
+ next_strand varchar;
112
+ running_count integer;
113
+ should_lock boolean;
114
+ should_be_precise boolean;
115
+ update_query varchar;
116
+ skip_locked varchar;
117
+ transition boolean;
118
+ BEGIN
119
+ IF OLD.strand IS NOT NULL THEN
120
+ should_lock := true;
121
+ should_be_precise := OLD.id % (OLD.max_concurrent * 4) = 0;
122
+
123
+ IF NOT should_be_precise AND OLD.max_concurrent > 16 THEN
124
+ running_count := (SELECT COUNT(*) FROM (
125
+ SELECT 1 as one FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
126
+ ) subquery_for_count);
127
+ should_lock := running_count < OLD.max_concurrent;
128
+ END IF;
129
+
130
+ IF should_lock THEN
131
+ PERFORM pg_advisory_xact_lock(half_md5_as_bigint(OLD.strand));
132
+ END IF;
133
+
134
+ -- note that we don't really care if the row we're deleting has a singleton, or if it even
135
+ -- matches the row(s) we're going to update. we just need to make sure that whatever
136
+ -- singleton we grab isn't already running (which is a simple existence check, since
137
+ -- the unique indexes ensure there is at most one singleton running, and one queued)
138
+ update_query := 'UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
139
+ SELECT id FROM delayed_jobs j2
140
+ WHERE next_in_strand=false AND
141
+ j2.strand=$1.strand AND
142
+ (j2.singleton IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.singleton=j2.singleton AND j3.id<>j2.id AND (j3.locked_by IS NULL OR j3.locked_by IS NOT NULL)))
143
+ ORDER BY j2.strand_order_override ASC, j2.id ASC
144
+ LIMIT ';
145
+
146
+ IF should_be_precise THEN
147
+ running_count := (SELECT COUNT(*) FROM (
148
+ SELECT 1 FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
149
+ ) s);
150
+ IF running_count < OLD.max_concurrent THEN
151
+ update_query := update_query || '($1.max_concurrent - $2)';
152
+ ELSE
153
+ -- we have too many running already; just bail
154
+ RETURN OLD;
155
+ END IF;
156
+ ELSE
157
+ update_query := update_query || '1';
158
+
159
+ -- n-strands don't require precise ordering; we can make this query more performant
160
+ IF OLD.max_concurrent > 1 THEN
161
+ skip_locked := ' SKIP LOCKED';
162
+ END IF;
163
+ END IF;
164
+
165
+ update_query := update_query || ' FOR UPDATE' || COALESCE(skip_locked, '') || ')';
166
+ EXECUTE update_query USING OLD, running_count;
167
+ END IF;
168
+
169
+ IF OLD.singleton IS NOT NULL THEN
170
+ transition := EXISTS (SELECT 1 FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL);
171
+
172
+ IF transition THEN
173
+ next_strand := (SELECT j1.strand FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL AND j1.strand IS NOT NULL LIMIT 1);
174
+
175
+ IF next_strand IS NOT NULL THEN
176
+ -- if the singleton has a new strand defined, we need to lock it to ensure we obey n_strand constraints --
177
+ IF NOT pg_try_advisory_xact_lock(half_md5_as_bigint(next_strand)) THEN
178
+ -- a failure to acquire the lock means that another process already has it and will thus handle this singleton --
179
+ RETURN OLD;
180
+ END IF;
181
+ END IF;
182
+ ELSIF OLD.strand IS NOT NULL THEN
183
+ -- if there is no transition and there is a strand then we have already handled this singleton in the case above --
184
+ RETURN OLD;
185
+ END IF;
186
+
187
+ -- handles transitioning a singleton from stranded to not stranded --
188
+ -- handles transitioning a singleton from unstranded to stranded --
189
+ -- handles transitioning a singleton from strand A to strand B --
190
+ -- these transitions are a relatively rare case, so we take a shortcut and --
191
+ -- only start the next singleton if its strand does not currently have any running jobs --
192
+ -- if it does, the next stranded job that finishes will start this singleton if it can --
193
+ UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
194
+ SELECT id FROM delayed_jobs j2
195
+ WHERE next_in_strand=false AND
196
+ j2.singleton=OLD.singleton AND
197
+ j2.locked_by IS NULL AND
198
+ (j2.strand IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.strand=j2.strand AND j3.id<>j2.id))
199
+ FOR UPDATE
200
+ );
201
+ END IF;
202
+ RETURN OLD;
203
+ END;
204
+ $$ LANGUAGE plpgsql;
205
+ SQL
206
+ end
207
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Delayed
4
- VERSION = "3.0.5"
4
+ VERSION = "3.0.6"
5
5
  end
@@ -389,14 +389,10 @@ shared_examples_for "a backend" do
389
389
  expect(job1.reload.handler).to include("ErrorJob")
390
390
  end
391
391
 
392
- context "next_in_strand management - deadlocks", non_transactional: true do
392
+ context "next_in_strand management - deadlocks and race conditions", non_transactional: true do
393
393
  # The following unit tests are fairly slow and non-deterministic. It may be
394
394
  # easier to make them fail quicker and more consistently by adding a random
395
395
  # sleep into the appropriate trigger(s).
396
- #
397
- # Example:
398
- # PERFORM pg_advisory_xact_lock(half_md5_as_bigint(OLD.strand));
399
- # PERFORM pg_sleep(random() * 2);
400
396
 
401
397
  def loop_secs(val)
402
398
  loop_start = Time.now.utc
@@ -408,46 +404,119 @@ shared_examples_for "a backend" do
408
404
  end
409
405
  end
410
406
 
411
- it "doesn't deadlock when transitioning from strand_a to strand_b" do
407
+ def loop_until_found(params)
408
+ found = false
409
+
410
+ loop_secs(10.seconds) do
411
+ if Delayed::Job.exists?(**params)
412
+ found = true
413
+ break
414
+ end
415
+ end
416
+
417
+ raise "timed out waiting for condition" unless found
418
+ end
419
+
420
+ def thread_body
421
+ yield
422
+ rescue
423
+ Thread.current.thread_variable_set(:fail, true)
424
+ raise
425
+ end
426
+
427
+ it "doesn't orphan the singleton when two are queued consecutively" do
428
+ # To reproduce this race condition efficiently, you'll probably want to add
429
+ # a sleep within delayed_jobs_before_insert_row_tr_fn.
430
+ # IF NEW.singleton IS NOT NULL THEN
431
+ # ...
432
+ # PERFORM pg_sleep(random() * 2);
433
+ # END IF;
434
+
412
435
  threads = []
413
436
 
414
- def thread_body(j1_params, j2_params)
415
- loop do
416
- j1 = create_job(**j1_params)
417
- j2 = create_job(**j2_params)
437
+ threads << Thread.new do
438
+ thread_body do
439
+ loop do
440
+ create_job(singleton: "singleton_job")
441
+ create_job(singleton: "singleton_job")
442
+ end
443
+ end
444
+ end
418
445
 
419
- expect(j1.reload.next_in_strand).to eq(true)
420
- expect(j2.reload.next_in_strand).to eq(false)
446
+ threads << Thread.new do
447
+ thread_body do
448
+ loop do
449
+ Delayed::Job.get_and_lock_next_available("w1")&.destroy
450
+ end
451
+ end
452
+ end
421
453
 
422
- j1.delete
454
+ threads << Thread.new do
455
+ thread_body do
456
+ loop do
457
+ loop_until_found(singleton: "singleton_job", next_in_strand: true)
458
+ end
459
+ end
460
+ end
423
461
 
424
- # In case we couldn't acquire a lock, we actually need to wait for
425
- # the other thread to set this to true.
426
- loop_secs(10.seconds) do
427
- break if j2.reload.next_in_strand
462
+ begin
463
+ loop_secs(60.seconds) do
464
+ if threads.any? { |x| x.thread_variable_get(:fail) }
465
+ raise "at least one job became orphaned or other error"
428
466
  end
467
+ end
468
+ ensure
469
+ threads.each(&:kill)
470
+ threads.each(&:join)
471
+ end
472
+ end
473
+
474
+ it "doesn't deadlock when transitioning from strand_a to strand_b" do
475
+ # To reproduce this deadlock efficiently, you'll probably want to add
476
+ # a sleep within delayed_jobs_after_delete_row_tr_fn.
477
+ # PERFORM pg_advisory_xact_lock(half_md5_as_bigint(OLD.strand));
478
+ # PERFORM pg_sleep(random() * 2);
429
479
 
430
- expect(j2.reload.next_in_strand).to eq(true)
480
+ threads = []
481
+
482
+ threads << Thread.new do
483
+ thread_body do
484
+ loop do
485
+ j1 = create_job(singleton: "myjobs", strand: "myjobs2", locked_by: "w1")
486
+ j2 = create_job(singleton: "myjobs", strand: "myjobs")
431
487
 
432
- j2.delete
488
+ j1.delete
489
+ j2.delete
490
+ end
433
491
  end
434
- rescue
435
- Thread.current.thread_variable_set(:fail, true)
436
- raise
437
492
  end
438
493
 
439
494
  threads << Thread.new do
440
- thread_body(
441
- { singleton: "myjobs", strand: "myjobs2", locked_by: "w1" },
442
- { singleton: "myjobs", strand: "myjobs" }
443
- )
495
+ thread_body do
496
+ loop do
497
+ j1 = create_job(singleton: "myjobs2", strand: "myjobs", locked_by: "w1")
498
+ j2 = create_job(singleton: "myjobs2", strand: "myjobs2")
499
+
500
+ j1.delete
501
+ j2.delete
502
+ end
503
+ end
444
504
  end
445
505
 
446
506
  threads << Thread.new do
447
- thread_body(
448
- { singleton: "myjobs2", strand: "myjobs", locked_by: "w1" },
449
- { singleton: "myjobs2", strand: "myjobs2" }
450
- )
507
+ thread_body do
508
+ loop do
509
+ loop_until_found(singleton: "myjobs", next_in_strand: true)
510
+ end
511
+ end
512
+ end
513
+
514
+ threads << Thread.new do
515
+ thread_body do
516
+ loop do
517
+ loop_until_found(singleton: "myjobs2", next_in_strand: true)
518
+ end
519
+ end
451
520
  end
452
521
 
453
522
  begin
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: inst-jobs
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.5
4
+ version: 3.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cody Cutrer
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2021-12-09 00:00:00.000000000 Z
13
+ date: 2021-12-20 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: activerecord
@@ -467,6 +467,8 @@ files:
467
467
  - db/migrate/20210929204903_update_conflicting_singleton_function_to_use_index.rb
468
468
  - db/migrate/20211101190934_update_after_delete_trigger_for_singleton_index.rb
469
469
  - db/migrate/20211207094200_update_after_delete_trigger_for_singleton_transition_cases.rb
470
+ - db/migrate/20211220112800_fix_singleton_race_condition_insert.rb
471
+ - db/migrate/20211220113000_fix_singleton_race_condition_delete.rb
470
472
  - exe/inst_jobs
471
473
  - lib/delayed/backend/active_record.rb
472
474
  - lib/delayed/backend/base.rb