switchman-inst-jobs 3.2.7 → 3.2.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: ac66ce458b7356068a5f576593f9692b5aa58d608584033a3aca941b31dc4a2d
-   data.tar.gz: c790f410b55117257f9b447987f68617b61222b10bf621385736b096affa25f8
+   metadata.gz: e4023807a9b569aff8d1751ded026466f0e135b8eae0c6ad9ff7bd80a1a7f818
+   data.tar.gz: 8a61cde9444723e15b3ec0c6034791e1887972a572755fea1e72b87fe004ec7d
  SHA512:
-   metadata.gz: 5d21bcfec5e444aeaf4bf839c13ab40453ea7b27c780e9153aeea23d5692800ff0e36dac604c7b7dec7c7723aea6a1baba675748a543240a69b37a3486665110
-   data.tar.gz: ea0797fc7294cf1be7b641ad874e4560a69a6a7bc04228ee85f7c284f5fcc1288139b6824c72ae5db1c521b19fa3aec55d36c7831dc819046a8af7f2d623886b
+   metadata.gz: 0a947509044626f1ccebbd8c1ec34db74e0086dd3df8f8ef7a6b3dfbc6c616bb76a3832cf3a168191b954d1241586bacf8a7909bbeb3a7be1eec21eb3426865e
+   data.tar.gz: fb103f3ec6eff9c2b356f592f133c8eaae99b08487623b3d0aee257f1cb1215620d66824eb221ec745abc96e9c19fdcb5c254bd5bffec78bfcd283d9cd2b9745
db/migrate/20220127091200_fix_singleton_unique_constraint.rb ADDED
@@ -0,0 +1,31 @@
+ # frozen_string_literal: true
+
+ class FixSingletonUniqueConstraint < ActiveRecord::Migration[5.2]
+   disable_ddl_transaction!
+
+   def up
+     rename_index :delayed_jobs, 'index_delayed_jobs_on_singleton_not_running', 'index_delayed_jobs_on_singleton_not_running_old'
+     rename_index :delayed_jobs, 'index_delayed_jobs_on_singleton_running', 'index_delayed_jobs_on_singleton_running_old'
+
+     # only one job can be queued in a singleton
+     add_index :delayed_jobs,
+               :singleton,
+               where: "singleton IS NOT NULL AND (locked_by IS NULL OR locked_by = '#{::Delayed::Backend::Base::ON_HOLD_LOCKED_BY}')",
+               unique: true,
+               name: 'index_delayed_jobs_on_singleton_not_running',
+               algorithm: :concurrently
+
+     # only one job can be running for a singleton
+     add_index :delayed_jobs,
+               :singleton,
+               where: "singleton IS NOT NULL AND locked_by IS NOT NULL AND locked_by <> '#{::Delayed::Backend::Base::ON_HOLD_LOCKED_BY}'",
+               unique: true,
+               name: 'index_delayed_jobs_on_singleton_running',
+               algorithm: :concurrently
+   end
+
+   def down
+     remove_index :delayed_jobs, name: 'index_delayed_jobs_on_singleton_not_running_old'
+     remove_index :delayed_jobs, name: 'index_delayed_jobs_on_singleton_running_old'
+   end
+ end
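
Note on the two indexes above: together they partition singleton rows into "queued" (locked_by is NULL or the on-hold marker) and "running" (locked by a worker), so each singleton can have at most one row of each kind. The console sketch below is not part of the gem; it only illustrates one way to spot-check that invariant on an existing delayed_jobs table (the query shape is an assumption, not documented API):

    # Sketch only: count singletons that would violate the new partial unique indexes.
    on_hold = ::Delayed::Backend::Base::ON_HOLD_LOCKED_BY
    queued_dupes = ::Delayed::Job.where.not(singleton: nil).
      where('locked_by IS NULL OR locked_by = ?', on_hold).
      group(:singleton).having('COUNT(*) > 1').count
    running_dupes = ::Delayed::Job.where.not(singleton: nil).
      where('locked_by IS NOT NULL AND locked_by <> ?', on_hold).
      group(:singleton).having('COUNT(*) > 1').count
    puts({ queued_duplicates: queued_dupes.size, running_duplicates: running_dupes.size })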

db/migrate/20220128084800_update_insert_trigger_for_singleton_unique_constraint_change.rb ADDED
@@ -0,0 +1,60 @@
+ # frozen_string_literal: true
+
+ class UpdateInsertTriggerForSingletonUniqueConstraintChange < ActiveRecord::Migration[5.2]
+   def change
+     reversible do |direction|
+       direction.up do
+         execute(<<~SQL)
+           CREATE OR REPLACE FUNCTION #{connection.quote_table_name('delayed_jobs_before_insert_row_tr_fn')} () RETURNS trigger AS $$
+           BEGIN
+             IF NEW.strand IS NOT NULL THEN
+               PERFORM pg_advisory_xact_lock(half_md5_as_bigint(NEW.strand));
+               IF (SELECT COUNT(*) FROM (
+                     SELECT 1 FROM delayed_jobs WHERE strand = NEW.strand AND next_in_strand=true LIMIT NEW.max_concurrent
+                   ) s) = NEW.max_concurrent THEN
+                 NEW.next_in_strand := false;
+               END IF;
+             END IF;
+             IF NEW.singleton IS NOT NULL THEN
+               PERFORM pg_advisory_xact_lock(half_md5_as_bigint(CONCAT('singleton:', NEW.singleton)));
+               -- this condition seems silly, but it forces postgres to use the two partial indexes on singleton,
+               -- rather than doing a seq scan
+               PERFORM 1 FROM delayed_jobs WHERE singleton = NEW.singleton AND (locked_by IS NULL OR locked_by = '#{::Delayed::Backend::Base::ON_HOLD_LOCKED_BY}' OR locked_by <> '#{::Delayed::Backend::Base::ON_HOLD_LOCKED_BY}');
+               IF FOUND THEN
+                 NEW.next_in_strand := false;
+               END IF;
+             END IF;
+             RETURN NEW;
+           END;
+           $$ LANGUAGE plpgsql SET search_path TO #{::Switchman::Shard.current.name};
+         SQL
+       end
+       direction.down do
+         execute(<<~SQL)
+           CREATE OR REPLACE FUNCTION #{connection.quote_table_name('delayed_jobs_before_insert_row_tr_fn')} () RETURNS trigger AS $$
+           BEGIN
+             IF NEW.strand IS NOT NULL THEN
+               PERFORM pg_advisory_xact_lock(half_md5_as_bigint(NEW.strand));
+               IF (SELECT COUNT(*) FROM (
+                     SELECT 1 FROM delayed_jobs WHERE strand = NEW.strand AND next_in_strand=true LIMIT NEW.max_concurrent
+                   ) s) = NEW.max_concurrent THEN
+                 NEW.next_in_strand := false;
+               END IF;
+             END IF;
+             IF NEW.singleton IS NOT NULL THEN
+               PERFORM pg_advisory_xact_lock(half_md5_as_bigint(CONCAT('singleton:', NEW.singleton)));
+               -- this condition seems silly, but it forces postgres to use the two partial indexes on singleton,
+               -- rather than doing a seq scan
+               PERFORM 1 FROM delayed_jobs WHERE singleton = NEW.singleton AND (locked_by IS NULL OR locked_by IS NOT NULL);
+               IF FOUND THEN
+                 NEW.next_in_strand := false;
+               END IF;
+             END IF;
+             RETURN NEW;
+           END;
+           $$ LANGUAGE plpgsql SET search_path TO #{::Switchman::Shard.current.name};
+         SQL
+       end
+     end
+   end
+ end
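
The locked_by predicate in the PERFORM above is always true for any row matching the singleton; it is spelled out that way only so the planner can answer the existence check from the two partial singleton indexes instead of a sequential scan, as the inline comment notes. A hedged way to verify the plan from a console, using a made-up singleton value:

    # Sketch only: inspect the plan for the existence check used by the insert trigger.
    on_hold = ::Delayed::Backend::Base::ON_HOLD_LOCKED_BY
    plan = ::Delayed::Job.connection.select_values(<<~SQL)
      EXPLAIN SELECT 1 FROM delayed_jobs
      WHERE singleton = 'example-singleton'
        AND (locked_by IS NULL OR locked_by = '#{on_hold}' OR locked_by <> '#{on_hold}')
    SQL
    puts plan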

db/migrate/20220128084900_update_delete_trigger_for_singleton_unique_constraint_change.rb ADDED
@@ -0,0 +1,209 @@
+ # frozen_string_literal: true
+
+ class UpdateDeleteTriggerForSingletonUniqueConstraintChange < ActiveRecord::Migration[5.2]
+   def up
+     execute(<<~SQL)
+       CREATE OR REPLACE FUNCTION #{connection.quote_table_name('delayed_jobs_after_delete_row_tr_fn')} () RETURNS trigger AS $$
+       DECLARE
+         next_strand varchar;
+         running_count integer;
+         should_lock boolean;
+         should_be_precise boolean;
+         update_query varchar;
+         skip_locked varchar;
+         transition boolean;
+       BEGIN
+         IF OLD.strand IS NOT NULL THEN
+           should_lock := true;
+           should_be_precise := OLD.id % (OLD.max_concurrent * 4) = 0;
+
+           IF NOT should_be_precise AND OLD.max_concurrent > 16 THEN
+             running_count := (SELECT COUNT(*) FROM (
+               SELECT 1 as one FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
+             ) subquery_for_count);
+             should_lock := running_count < OLD.max_concurrent;
+           END IF;
+
+           IF should_lock THEN
+             PERFORM pg_advisory_xact_lock(half_md5_as_bigint(OLD.strand));
+           END IF;
+
+           -- note that we don't really care if the row we're deleting has a singleton, or if it even
+           -- matches the row(s) we're going to update. we just need to make sure that whatever
+           -- singleton we grab isn't already running (which is a simple existence check, since
+           -- the unique indexes ensure there is at most one singleton running, and one queued)
+           update_query := 'UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
+             SELECT id FROM delayed_jobs j2
+               WHERE next_in_strand=false AND
+                 j2.strand=$1.strand AND
+                 (j2.singleton IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.singleton=j2.singleton AND j3.id<>j2.id AND (j3.locked_by IS NULL OR j3.locked_by = ''#{::Delayed::Backend::Base::ON_HOLD_LOCKED_BY}'' OR j3.locked_by <> ''#{::Delayed::Backend::Base::ON_HOLD_LOCKED_BY}'')))
+               ORDER BY j2.strand_order_override ASC, j2.id ASC
+               LIMIT ';
+
+           IF should_be_precise THEN
+             running_count := (SELECT COUNT(*) FROM (
+               SELECT 1 FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
+             ) s);
+             IF running_count < OLD.max_concurrent THEN
+               update_query := update_query || '($1.max_concurrent - $2)';
+             ELSE
+               -- we have too many running already; just bail
+               RETURN OLD;
+             END IF;
+           ELSE
+             update_query := update_query || '1';
+
+             -- n-strands don't require precise ordering; we can make this query more performant
+             IF OLD.max_concurrent > 1 THEN
+               skip_locked := ' SKIP LOCKED';
+             END IF;
+           END IF;
+
+           update_query := update_query || ' FOR UPDATE' || COALESCE(skip_locked, '') || ')';
+           EXECUTE update_query USING OLD, running_count;
+         END IF;
+
+         IF OLD.singleton IS NOT NULL THEN
+           PERFORM pg_advisory_xact_lock(half_md5_as_bigint(CONCAT('singleton:', OLD.singleton)));
+
+           transition := EXISTS (SELECT 1 FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL);
+
+           IF transition THEN
+             next_strand := (SELECT j1.strand FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL AND j1.strand IS NOT NULL LIMIT 1);
+
+             IF next_strand IS NOT NULL THEN
+               -- if the singleton has a new strand defined, we need to lock it to ensure we obey n_strand constraints --
+               IF NOT pg_try_advisory_xact_lock(half_md5_as_bigint(next_strand)) THEN
+                 -- a failure to acquire the lock means that another process already has it and will thus handle this singleton --
+                 RETURN OLD;
+               END IF;
+             END IF;
+           ELSIF OLD.strand IS NOT NULL THEN
+             -- if there is no transition and there is a strand then we have already handled this singleton in the case above --
+             RETURN OLD;
+           END IF;
+
+           -- handles transitioning a singleton from stranded to not stranded --
+           -- handles transitioning a singleton from unstranded to stranded --
+           -- handles transitioning a singleton from strand A to strand B --
+           -- these transitions are a relatively rare case, so we take a shortcut and --
+           -- only start the next singleton if its strand does not currently have any running jobs --
+           -- if it does, the next stranded job that finishes will start this singleton if it can --
+           UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
+             SELECT id FROM delayed_jobs j2
+               WHERE next_in_strand=false AND
+                 j2.singleton=OLD.singleton AND
+                 j2.locked_by IS NULL AND
+                 (j2.strand IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.strand=j2.strand AND j3.id<>j2.id))
+             FOR UPDATE
+           );
+         END IF;
+         RETURN OLD;
+       END;
+       $$ LANGUAGE plpgsql SET search_path TO #{::Switchman::Shard.current.name};
+     SQL
+   end
+
+   def down
+     execute(<<~SQL)
+       CREATE OR REPLACE FUNCTION #{connection.quote_table_name('delayed_jobs_after_delete_row_tr_fn')} () RETURNS trigger AS $$
+       DECLARE
+         next_strand varchar;
+         running_count integer;
+         should_lock boolean;
+         should_be_precise boolean;
+         update_query varchar;
+         skip_locked varchar;
+         transition boolean;
+       BEGIN
+         IF OLD.strand IS NOT NULL THEN
+           should_lock := true;
+           should_be_precise := OLD.id % (OLD.max_concurrent * 4) = 0;
+
+           IF NOT should_be_precise AND OLD.max_concurrent > 16 THEN
+             running_count := (SELECT COUNT(*) FROM (
+               SELECT 1 as one FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
+             ) subquery_for_count);
+             should_lock := running_count < OLD.max_concurrent;
+           END IF;
+
+           IF should_lock THEN
+             PERFORM pg_advisory_xact_lock(half_md5_as_bigint(OLD.strand));
+           END IF;
+
+           -- note that we don't really care if the row we're deleting has a singleton, or if it even
+           -- matches the row(s) we're going to update. we just need to make sure that whatever
+           -- singleton we grab isn't already running (which is a simple existence check, since
+           -- the unique indexes ensure there is at most one singleton running, and one queued)
+           update_query := 'UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
+             SELECT id FROM delayed_jobs j2
+               WHERE next_in_strand=false AND
+                 j2.strand=$1.strand AND
+                 (j2.singleton IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.singleton=j2.singleton AND j3.id<>j2.id AND (j3.locked_by IS NULL OR j3.locked_by IS NOT NULL)))
+               ORDER BY j2.strand_order_override ASC, j2.id ASC
+               LIMIT ';
+
+           IF should_be_precise THEN
+             running_count := (SELECT COUNT(*) FROM (
+               SELECT 1 FROM delayed_jobs WHERE strand = OLD.strand AND next_in_strand = 't' LIMIT OLD.max_concurrent
+             ) s);
+             IF running_count < OLD.max_concurrent THEN
+               update_query := update_query || '($1.max_concurrent - $2)';
+             ELSE
+               -- we have too many running already; just bail
+               RETURN OLD;
+             END IF;
+           ELSE
+             update_query := update_query || '1';
+
+             -- n-strands don't require precise ordering; we can make this query more performant
+             IF OLD.max_concurrent > 1 THEN
+               skip_locked := ' SKIP LOCKED';
+             END IF;
+           END IF;
+
+           update_query := update_query || ' FOR UPDATE' || COALESCE(skip_locked, '') || ')';
+           EXECUTE update_query USING OLD, running_count;
+         END IF;
+
+         IF OLD.singleton IS NOT NULL THEN
+           PERFORM pg_advisory_xact_lock(half_md5_as_bigint(CONCAT('singleton:', OLD.singleton)));
+
+           transition := EXISTS (SELECT 1 FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL);
+
+           IF transition THEN
+             next_strand := (SELECT j1.strand FROM delayed_jobs AS j1 WHERE j1.singleton = OLD.singleton AND j1.strand IS DISTINCT FROM OLD.strand AND locked_by IS NULL AND j1.strand IS NOT NULL LIMIT 1);
+
+             IF next_strand IS NOT NULL THEN
+               -- if the singleton has a new strand defined, we need to lock it to ensure we obey n_strand constraints --
+               IF NOT pg_try_advisory_xact_lock(half_md5_as_bigint(next_strand)) THEN
+                 -- a failure to acquire the lock means that another process already has it and will thus handle this singleton --
+                 RETURN OLD;
+               END IF;
+             END IF;
+           ELSIF OLD.strand IS NOT NULL THEN
+             -- if there is no transition and there is a strand then we have already handled this singleton in the case above --
+             RETURN OLD;
+           END IF;
+
+           -- handles transitioning a singleton from stranded to not stranded --
+           -- handles transitioning a singleton from unstranded to stranded --
+           -- handles transitioning a singleton from strand A to strand B --
+           -- these transitions are a relatively rare case, so we take a shortcut and --
+           -- only start the next singleton if its strand does not currently have any running jobs --
+           -- if it does, the next stranded job that finishes will start this singleton if it can --
+           UPDATE delayed_jobs SET next_in_strand=true WHERE id IN (
+             SELECT id FROM delayed_jobs j2
+               WHERE next_in_strand=false AND
+                 j2.singleton=OLD.singleton AND
+                 j2.locked_by IS NULL AND
+                 (j2.strand IS NULL OR NOT EXISTS (SELECT 1 FROM delayed_jobs j3 WHERE j3.strand=j2.strand AND j3.id<>j2.id))
+             FOR UPDATE
+           );
+         END IF;
+         RETURN OLD;
+       END;
+       $$ LANGUAGE plpgsql SET search_path TO #{::Switchman::Shard.current.name};
+     SQL
+   end
+ end
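
If you want to confirm the replacement function actually landed on a given jobs shard after migrating, one option is to read it back from the PostgreSQL catalogs (sketch only; assumes the current connection points at that shard's database):

    # Sketch only: dump the installed delete-trigger function body for manual inspection.
    puts ::Delayed::Job.connection.select_value(<<~SQL)
      SELECT prosrc FROM pg_proc WHERE proname = 'delayed_jobs_after_delete_row_tr_fn'
    SQL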

db/migrate/20220203063200_remove_old_singleton_index.rb ADDED
@@ -0,0 +1,31 @@
+ # frozen_string_literal: true
+
+ class RemoveOldSingletonIndex < ActiveRecord::Migration[5.2]
+   disable_ddl_transaction!
+
+   def up
+     remove_index :delayed_jobs, name: 'index_delayed_jobs_on_singleton_not_running_old'
+     remove_index :delayed_jobs, name: 'index_delayed_jobs_on_singleton_running_old'
+   end
+
+   def down
+     rename_index :delayed_jobs, 'index_delayed_jobs_on_singleton_not_running', 'index_delayed_jobs_on_singleton_not_running_old'
+     rename_index :delayed_jobs, 'index_delayed_jobs_on_singleton_running', 'index_delayed_jobs_on_singleton_running_old'
+
+     # only one job can be queued in a singleton
+     add_index :delayed_jobs,
+               :singleton,
+               where: 'singleton IS NOT NULL AND locked_by IS NULL',
+               unique: true,
+               name: 'index_delayed_jobs_on_singleton_not_running',
+               algorithm: :concurrently
+
+     # only one job can be running for a singleton
+     add_index :delayed_jobs,
+               :singleton,
+               where: 'singleton IS NOT NULL AND locked_by IS NOT NULL',
+               unique: true,
+               name: 'index_delayed_jobs_on_singleton_running',
+               algorithm: :concurrently
+   end
+ end
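
After this cleanup migration only the new singleton indexes should remain on delayed_jobs. A quick hedged console check (plain ActiveRecord calls, nothing gem-specific):

    # Sketch only: list the singleton-related indexes that are left on the table.
    puts ::Delayed::Job.connection.indexes(:delayed_jobs).map(&:name).grep(/singleton/).sort
    # expected: index_delayed_jobs_on_singleton_not_running, index_delayed_jobs_on_singleton_running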
@@ -19,13 +19,17 @@ module SwitchmanInstJobs
 
    # Ensure jobs get unblocked on the new shard if they exist
    ::Delayed::Worker.lifecycle.after(:perform) do |_worker, job|
-     if job.strand
+     if job.strand || job.singleton
+       column = job.strand ? :strand : :singleton
+
        ::Switchman::Shard.clear_cache
        ::Switchman::Shard.default.activate do
          current_job_shard = ::Switchman::Shard.lookup(job.shard_id).delayed_jobs_shard
          if current_job_shard != ::Switchman::Shard.current(:delayed_jobs)
            current_job_shard.activate(:delayed_jobs) do
-             j = ::Delayed::Job.where(strand: job.strand).next_in_strand_order.first
+             ::Delayed::Job.where(source: 'JobsMigrator::StrandBlocker', **{ column => job.try(column) }).delete_all
+
+             j = ::Delayed::Job.where(**{ column => job.try(column) }).next_in_strand_order.first
              j.update_column(:next_in_strand, true) if j && !j.next_in_strand
            end
          end
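
With the hook change above, any blocker row the migrator left behind (source 'JobsMigrator::StrandBlocker') for the finished job's strand or singleton is deleted before the next job is unblocked on the new jobs shard. A hedged one-liner to check for stragglers on a jobs shard (sketch only):

    # Sketch only: migration blocker jobs that have not been cleaned up yet.
    ::Delayed::Job.where(source: 'JobsMigrator::StrandBlocker').count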

lib/switchman_inst_jobs/jobs_migrator.rb CHANGED
@@ -89,7 +89,9 @@ module SwitchmanInstJobs
        migrate_everything
      end
 
-     def migrate_strands
+     def migrate_strands(batch_size: 1_000)
+       source_shard = ::Switchman::Shard.current(:delayed_jobs)
+
       # there are 4 scenarios to deal with here
       # 1) no running job, no jobs moved: do nothing
       # 2) running job, no jobs moved; create blocker with next_in_strand=false
@@ -98,60 +100,64 @@ module SwitchmanInstJobs
      #    those (= do nothing since it should already be false)
      # 4) no running job, jobs moved: set next_in_strand=true on the first of
      #    those (= do nothing since it should already be true)
+     handler = lambda { |scope, column, blocker_job_kwargs = {}|
+       shard_map = build_shard_map(scope, source_shard)
+       shard_map.each do |(target_shard, source_shard_ids)|
+         shard_scope = scope.where(shard_id: source_shard_ids)
 
-     source_shard = ::Switchman::Shard.current(:delayed_jobs)
-     strand_scope = ::Delayed::Job.shard(source_shard).where.not(strand: nil)
-     shard_map = build_shard_map(strand_scope, source_shard)
-     shard_map.each do |(target_shard, source_shard_ids)|
-       shard_scope = strand_scope.where(shard_id: source_shard_ids)
-
-       # 1) is taken care of because it should not show up here in strands
-       strands = shard_scope.distinct.order(:strand).pluck(:strand)
-
-       target_shard.activate(:delayed_jobs) do
-         strands.each do |strand|
-           transaction_on([source_shard, target_shard]) do
-             this_strand_scope = shard_scope.where(strand: strand)
-             # we want to copy all the jobs except the one that is still running.
-             jobs_scope = this_strand_scope.where(locked_by: nil)
-
-             # 2) and part of 3) are taken care of here by creating a blocker
-             # job with next_in_strand = false. as soon as the current
-             # running job is finished it should set next_in_strand
-             # We lock it to ensure that the jobs worker can't delete it until we are done moving the strand
-             # Since we only unlock it on the new jobs queue *after* deleting from the original
-             # the lock ensures the blocker always gets unlocked
-             first = this_strand_scope.where.not(locked_by: nil).next_in_strand_order.lock.first
-             if first
-               first_job = ::Delayed::Job.create!(strand: strand, next_in_strand: false)
-               first_job.payload_object = ::Delayed::PerformableMethod.new(Kernel, :sleep, args: [0])
-               first_job.queue = first.queue
-               first_job.tag = 'Kernel.sleep'
-               first_job.source = 'JobsMigrator::StrandBlocker'
-               first_job.max_attempts = 1
-               # If we ever have jobs left over from 9999 jobs moves of a single shard,
-               # something has gone terribly wrong
-               first_job.strand_order_override = -9999
-               first_job.save!
-               # the rest of 3) is taken care of here
-               # make sure that all the jobs moved over are NOT next in strand
-               ::Delayed::Job.where(next_in_strand: true, strand: strand, locked_by: nil).
-                 update_all(next_in_strand: false)
-             end
+         # 1) is taken care of because it should not show up here in strands
+         values = shard_scope.distinct.order(column).pluck(column)
 
-             # 4) is taken care of here, by leaving next_in_strand alone and
-             # it should execute on the new shard
-             batch_move_jobs(
-               target_shard: target_shard,
-               source_shard: source_shard,
-               scope: jobs_scope
-             ) do |job, new_job|
-               # This ensures jobs enqueued on the old jobs shard run before jobs on the new jobs queue
-               new_job.strand_order_override = job.strand_order_override - 1
+         target_shard.activate(:delayed_jobs) do
+           values.each do |value|
+             transaction_on([source_shard, target_shard]) do
+               value_scope = shard_scope.where(**{ column => value })
+               # we want to copy all the jobs except the one that is still running.
+               jobs_scope = value_scope.where(locked_by: nil)
+
+               # 2) and part of 3) are taken care of here by creating a blocker
+               # job with next_in_strand = false. as soon as the current
+               # running job is finished it should set next_in_strand
+               # We lock it to ensure that the jobs worker can't delete it until we are done moving the strand
+               # Since we only unlock it on the new jobs queue *after* deleting from the original
+               # the lock ensures the blocker always gets unlocked
+               first = value_scope.where.not(locked_by: nil).next_in_strand_order.lock.first
+               if first
+                 create_blocker_job(queue: first.queue, **{ column => value }, **blocker_job_kwargs)
+                 # the rest of 3) is taken care of here
+                 # make sure that all the jobs moved over are NOT next in strand
+                 ::Delayed::Job.where(next_in_strand: true, locked_by: nil, **{ column => value }).
+                   update_all(next_in_strand: false)
+               end
+
+               # 4) is taken care of here, by leaving next_in_strand alone and
+               # it should execute on the new shard
+               batch_move_jobs(
+                 target_shard: target_shard,
+                 source_shard: source_shard,
+                 scope: jobs_scope,
+                 batch_size: batch_size
+               ) do |job, new_job|
+                 # This ensures jobs enqueued on the old jobs shard run before jobs on the new jobs queue
+                 new_job.strand_order_override = job.strand_order_override - 1
+               end
              end
            end
          end
+       end
+     }
+
+     strand_scope = ::Delayed::Job.shard(source_shard).where.not(strand: nil)
+     singleton_scope = ::Delayed::Job.shard(source_shard).where('strand IS NULL AND singleton IS NOT NULL')
+     all_scope = ::Delayed::Job.shard(source_shard).where('strand IS NOT NULL OR singleton IS NOT NULL')
+
+     handler.call(strand_scope, :strand)
+     handler.call(singleton_scope, :singleton,
+                  { locked_at: DateTime.now, locked_by: ::Delayed::Backend::Base::ON_HOLD_BLOCKER })
 
+     shard_map = build_shard_map(all_scope, source_shard)
+     shard_map.each do |(target_shard, source_shard_ids)|
+       target_shard.activate(:delayed_jobs) do
          updated = ::Switchman::Shard.where(id: source_shard_ids, block_stranded: true).
            update_all(block_stranded: false)
          # If this is being manually re-run for some reason to clean something up, don't wait for nothing to happen
@@ -166,26 +172,40 @@ module SwitchmanInstJobs
      end
    end
 
-   def unblock_strands(target_shard)
-     target_shard.activate(:delayed_jobs) do
-       loop do
-         # We only want to unlock stranded jobs where they don't belong to a blocked shard (if they *do* belong)
-         # to a blocked shard, they must be part of a concurrent jobs migration from a different source shard to
-         # this target shard, so we shouldn't unlock them yet. We only ever unlock one job here to keep the
-         # logic cleaner; if the job is n-stranded, after the first one runs, the trigger will unlock larger
-         # batches
-         break if ::Delayed::Job.where(id: ::Delayed::Job.select('DISTINCT ON (strand) id').
-           where.not(strand: nil).
-           where.not(shard_id: ::Switchman::Shard.where(block_stranded: true).pluck(:id)).where(
+   def unblock_strands(target_shard, batch_size: 10_000)
+     block_stranded_ids = ::Switchman::Shard.where(block_stranded: true).pluck(:id)
+     query = lambda { |column, scope|
+       ::Delayed::Job.
+         where(id: ::Delayed::Job.select("DISTINCT ON (#{column}) id").
+           where(scope).
+           where.not(shard_id: block_stranded_ids).
+           where(
             ::Delayed::Job.select(1).from("#{::Delayed::Job.quoted_table_name} dj2").
               where("dj2.next_in_strand = true OR dj2.source = 'JobsMigrator::StrandBlocker'").
-             where('dj2.strand = delayed_jobs.strand').arel.exists.not
-         ).order(:strand, :strand_order_override, :id)).limit(500).update_all(next_in_strand: true).zero?
+             where("dj2.#{column} = delayed_jobs.#{column}").arel.exists.not
+           ).
+           order(column, :strand_order_override, :id)).limit(batch_size)
+     }
+
+     target_shard.activate(:delayed_jobs) do
+       # We only want to unlock stranded jobs where they don't belong to a blocked shard (if they *do* belong)
+       # to a blocked shard, they must be part of a concurrent jobs migration from a different source shard to
+       # this target shard, so we shouldn't unlock them yet. We only ever unlock one job here to keep the
+       # logic cleaner; if the job is n-stranded, after the first one runs, the trigger will unlock larger
+       # batches
+
+       loop do
+         break if query.call(:strand, 'strand IS NOT NULL').update_all(next_in_strand: true).zero?
+       end
+
+       loop do
+         break if query.call(:singleton,
+                             'strand IS NULL AND singleton IS NOT NULL').update_all(next_in_strand: true).zero?
        end
      end
    end
 
-   def migrate_everything
+   def migrate_everything(batch_size: 1_000)
      source_shard = ::Switchman::Shard.current(:delayed_jobs)
      scope = ::Delayed::Job.shard(source_shard).where('strand IS NULL')
 
@@ -194,13 +214,26 @@ module SwitchmanInstJobs
        batch_move_jobs(
          target_shard: target_shard,
          source_shard: source_shard,
-         scope: scope.where(shard_id: source_shard_ids).where(locked_by: nil)
+         scope: scope.where(shard_id: source_shard_ids).where(locked_by: nil),
+         batch_size: batch_size
        )
      end
    end
 
    private
 
+   def create_blocker_job(**kwargs)
+     first_job = ::Delayed::Job.create!(**kwargs, next_in_strand: false)
+     first_job.payload_object = ::Delayed::PerformableMethod.new(Kernel, :sleep, args: [0])
+     first_job.tag = 'Kernel.sleep'
+     first_job.source = 'JobsMigrator::StrandBlocker'
+     first_job.max_attempts = 1
+     # If we ever have jobs left over from 9999 jobs moves of a single shard,
+     # something has gone terribly wrong
+     first_job.strand_order_override = -9999
+     first_job.save!
+   end
+
    def build_shard_map(scope, source_shard)
      shard_ids = scope.distinct.pluck(:shard_id)
 
@@ -215,10 +248,10 @@ module SwitchmanInstJobs
      shard_map
    end
 
-   def batch_move_jobs(target_shard:, source_shard:, scope:)
+   def batch_move_jobs(target_shard:, source_shard:, scope:, batch_size:)
      while scope.exists?
        # Adapted from get_and_lock_next_available in delayed/backend/active_record.rb
-       target_jobs = scope.limit(1000).lock('FOR UPDATE SKIP LOCKED')
+       target_jobs = scope.limit(batch_size).lock('FOR UPDATE SKIP LOCKED')
 
        query = source_shard.activate(:delayed_jobs) do
          "WITH limited_jobs AS (#{target_jobs.to_sql}) " \

lib/switchman_inst_jobs/version.rb CHANGED
@@ -1,3 +1,3 @@
  module SwitchmanInstJobs
-   VERSION = '3.2.7'.freeze
+   VERSION = '3.2.8'.freeze
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: switchman-inst-jobs
  version: !ruby/object:Gem::Version
-   version: 3.2.7
+   version: 3.2.8
  platform: ruby
  authors:
  - Bryan Petty
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2021-12-20 00:00:00.000000000 Z
+ date: 2022-02-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: inst-jobs
@@ -305,6 +305,10 @@ files:
  - db/migrate/20211207094200_update_after_delete_trigger_for_singleton_transition_cases.rb
  - db/migrate/20211220112800_fix_singleton_race_condition_insert.rb
  - db/migrate/20211220113000_fix_singleton_race_condition_delete.rb
+ - db/migrate/20220127091200_fix_singleton_unique_constraint.rb
+ - db/migrate/20220128084800_update_insert_trigger_for_singleton_unique_constraint_change.rb
+ - db/migrate/20220128084900_update_delete_trigger_for_singleton_unique_constraint_change.rb
+ - db/migrate/20220203063200_remove_old_singleton_index.rb
  - lib/switchman-inst-jobs.rb
  - lib/switchman_inst_jobs.rb
  - lib/switchman_inst_jobs/active_record/connection_adapters/postgresql_adapter.rb