gitlab-sidekiq-fetcher 0.5.1.pre.alpha → 0.5.5
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.gitlab-ci.yml +7 -7
- data/README.md +11 -0
- data/gitlab-sidekiq-fetcher.gemspec +10 -10
- data/lib/sidekiq-reliable-fetch.rb +1 -0
- data/lib/sidekiq/base_reliable_fetch.rb +117 -94
- data/lib/sidekiq/interrupted_set.rb +47 -0
- data/spec/base_reliable_fetch_spec.rb +31 -7
- data/spec/fetch_shared_examples.rb +146 -8
- data/tests/README.md +8 -6
- data/tests/{retry_test → interruption}/config.rb +0 -1
- data/tests/{retry_test/no_retry_test.rb → interruption/test_kill_signal.rb} +5 -4
- data/tests/{retry_test/retry_test.rb → interruption/test_term_signal.rb} +5 -4
- data/tests/interruption/worker.rb +15 -0
- data/tests/{reliability_test → reliability}/config.rb +0 -0
- data/tests/{reliability_test → reliability}/reliability_test.rb +0 -0
- data/tests/{reliability_test → reliability}/worker.rb +0 -0
- metadata +13 -14
- data/tests/retry_test/no_retry_worker.rb +0 -21
- data/tests/retry_test/worker.rb +0 -23
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: eb384451139d1638854cd94a673b12e0f7fd5a42c663535da0145fd153ba4c51
+  data.tar.gz: 43851081b6406fe8a876824ef034c548665382707db0aaf41c84c186a98419b0
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 617032e226ffc898b1c411b0f7598169272bc8521156b9eadcb35b48a139cd854f1b12a2fd98e7fd174128727f0b8c60917780cd2b6dec82cb1370a6260b20ee
+  data.tar.gz: 5e56e3840a5f0a4f437ec298de7b68ef796dcfa7d31e69117398ceb22b4ed07fbb4eac4e711b344f5134fd748f9afa476d0d4ffc216cc02cd4eec5b9db90a9c2
data/.gitignore
CHANGED
data/.gitlab-ci.yml
CHANGED
@@ -25,7 +25,7 @@ rspec:
 .integration:
   stage: test
   script:
-    - cd tests/
+    - cd tests/reliability
     - bundle exec ruby reliability_test.rb
   services:
     - redis:alpine
@@ -47,19 +47,19 @@ integration_basic:
   variables:
     JOB_FETCHER: basic
 
-…
+kill_interruption:
   stage: test
   script:
-    - cd tests/
-    - bundle exec ruby …
+    - cd tests/interruption
+    - bundle exec ruby test_kill_signal.rb
   services:
     - redis:alpine
 
-…
+term_interruption:
   stage: test
   script:
-    - cd tests/
-    - bundle exec ruby …
+    - cd tests/interruption
+    - bundle exec ruby test_term_signal.rb
   services:
     - redis:alpine
 
data/README.md
CHANGED
@@ -10,6 +10,17 @@ There are two strategies implemented: [Reliable fetch](http://redis.io/commands/
 semi-reliable fetch that uses regular `brpop` and `lpush` to pick the job and put it to working queue. The main benefit of "Reliable" strategy is that `rpoplpush` is atomic, eliminating a race condition in which jobs can be lost.
 However, it comes at a cost because `rpoplpush` can't watch multiple lists at the same time so we need to iterate over the entire queue list which significantly increases pressure on Redis when there are more than a few queues. The "semi-reliable" strategy is much more reliable than the default Sidekiq fetcher, though. Compared to the reliable fetch strategy, it does not increase pressure on Redis significantly.
 
+### Interruption handling
+
+Sidekiq expects every job to report either success or failure. In the failure case, Sidekiq puts a `retry_count` counter
+into the job and keeps re-running it until the counter reaches the maximum allowed value. When the job has
+not been given a chance to finish its work (to report success or failure), for example when it was killed forcibly or requeued after receiving a TERM signal, the standard retry mechanism never comes into play and the job would be retried indefinitely. This is why the reliable fetcher maintains a special counter, `interrupted_count`,
+which is used to limit the number of such retries. In both cases, the reliable fetcher increments `interrupted_count` and rejects the job from running again once the counter exceeds `max_retries_after_interruption` (default: 3).
+Such a job is put into the `interrupted` queue. This queue mostly behaves like the Sidekiq Dead queue, so it only stores a limited number of jobs for a limited time. As with the Dead queue, the limits are configurable via the `interrupted_max_jobs` (default: 10_000) and `interrupted_timeout_in_seconds` (default: 3 months) Sidekiq option keys.
+
+You can also disable the special handling of interrupted jobs by setting `max_retries_after_interruption` to `-1`.
+In this case, interrupted jobs are retried without any limit from the reliable fetcher and are not put into the interrupted queue.
+
 
 ## Installation
 
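To make these knobs concrete, here is a hedged configuration sketch. The option keys are the ones documented above; `ImportWorker` is a hypothetical worker class, and the global assignments mirror how the gem's own specs set `Sidekiq.options`:

```ruby
# Sketch only: configuring interruption handling.
# `ImportWorker` is hypothetical; the option keys come from the README above.

# Global knobs (e.g. in a Sidekiq initializer):
Sidekiq.options[:max_retries_after_interruption] = 3                  # default
Sidekiq.options[:interrupted_max_jobs] = 10_000                       # default
Sidekiq.options[:interrupted_timeout_in_seconds] = 90 * 24 * 60 * 60  # default: 3 months

# Per-worker override, read by the fetcher via the worker's sidekiq_options:
class ImportWorker
  include Sidekiq::Worker
  sidekiq_options max_retries_after_interruption: -1 # retry forever, never quarantine
end
```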
data/gitlab-sidekiq-fetcher.gemspec
CHANGED
@@ -1,14 +1,14 @@
 Gem::Specification.new do |s|
-  s.name        = …
-  s.version     = …
-  s.authors     = …
-  s.email       = …
-  s.license     = …
-  s.homepage    = …
-  s.summary     = …
-  s.description = …
+  s.name        = 'gitlab-sidekiq-fetcher'
+  s.version     = '0.5.5'
+  s.authors     = ['TEA', 'GitLab']
+  s.email       = 'valery@gitlab.com'
+  s.license     = 'LGPL-3.0'
+  s.homepage    = 'https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/'
+  s.summary     = 'Reliable fetch extension for Sidekiq'
+  s.description = 'Redis reliable queue pattern implemented in Sidekiq'
   s.require_paths = ['lib']
-  s.files       = …
-  s.test_files  = …
+  s.files       = `git ls-files`.split($\)
+  s.test_files  = []
   s.add_dependency 'sidekiq', '~> 5'
 end
data/lib/sidekiq/base_reliable_fetch.rb
CHANGED
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-
+require_relative 'interrupted_set'
 
 module Sidekiq
   class BaseReliableFetch
@@ -18,6 +18,13 @@ module Sidekiq
     # Defines the COUNT parameter that will be passed to Redis SCAN command
     SCAN_COUNT = 1000
 
+    # How many times a job can be interrupted
+    DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION = 3
+
+    # Regexes for matching working queue keys
+    WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*:[0-9a-f]*)\z/.freeze
+    LEGACY_WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*)\z/.freeze
+
     UnitOfWork = Struct.new(:queue, :job) do
       def acknowledge
         Sidekiq.redis { |conn| conn.lrem(Sidekiq::BaseReliableFetch.working_queue_name(queue), 1, job) }
@@ -65,162 +72,178 @@ module Sidekiq
       end
     end
 
-    def self.…
+    def self.hostname
+      Socket.gethostname
+    end
+
+    def self.process_nonce
+      @@process_nonce ||= SecureRandom.hex(6)
     end
 
-    def self.…
+    def self.identity
+      @@identity ||= "#{hostname}:#{$$}:#{process_nonce}"
     end
 
     def self.heartbeat
       Sidekiq.redis do |conn|
-        conn.set(heartbeat_key(…
+        conn.set(heartbeat_key(identity), 1, ex: HEARTBEAT_LIFESPAN)
       end
 
-      Sidekiq.logger.debug("Heartbeat for …
+      Sidekiq.logger.debug("Heartbeat for #{identity}")
     end
 
     def self.bulk_requeue(inprogress, _options)
       return if inprogress.empty?
 
-      Sidekiq.logger.debug('Re-queueing terminated jobs')
-
       Sidekiq.redis do |conn|
         inprogress.each do |unit_of_work|
           conn.multi do |multi|
+            preprocess_interrupted_job(unit_of_work.job, unit_of_work.queue, multi)
+
             multi.lrem(working_queue_name(unit_of_work.queue), 1, unit_of_work.job)
           end
         end
       end
-
-      Sidekiq.logger.info("Pushed #{inprogress.size} jobs back to Redis")
     rescue => e
       Sidekiq.logger.warn("Failed to requeue #{inprogress.size} jobs: #{e.message}")
     end
 
-    def self.…
+    def self.clean_working_queue!(original_queue, working_queue)
+      Sidekiq.redis do |conn|
+        while job = conn.rpop(working_queue)
+          preprocess_interrupted_job(job, original_queue)
+        end
+      end
     end
 
-    …
+    def self.preprocess_interrupted_job(job, queue, conn = nil)
+      msg = Sidekiq.load_json(job)
+      msg['interrupted_count'] = msg['interrupted_count'].to_i + 1
 
-      …
-      @queues = options[:queues].map { |q| "queue:#{q}" }
+      if interruption_exhausted?(msg)
+        send_to_quarantine(msg, conn)
+      else
+        requeue_job(queue, msg, conn)
+      end
     end
 
-    def …
+    def self.extract_queue_and_identity(key)
+      # New identity format is "{hostname}:{pid}:{randomhex}"
+      # Old identity format is "{hostname}:{pid}"
+      # Queue names may also have colons (namespaced).
+      # Expressing this in a single regex is unreadable.
 
-      …
+      # Test the newer expected format first, only checking the older if necessary
+      original_queue, identity = key.scan(WORKING_QUEUE_REGEX).flatten
+      return original_queue, identity unless original_queue.nil? || identity.nil?
 
-      raise NotImplementedError,
-            "#{self.class} does not implement #{__method__}"
+      key.scan(LEGACY_WORKING_QUEUE_REGEX).flatten
     end
 
-    def …
+    # Detect "old" jobs and requeue them because the worker they were assigned
+    # to probably failed miserably.
+    def self.clean_working_queues!
+      Sidekiq.logger.info('Cleaning working queues')
 
       Sidekiq.redis do |conn|
-        count …
-
-        while job = conn.rpop(working_queue)
-          msg = begin
-            Sidekiq.load_json(job)
-          rescue => e
-            Sidekiq.logger.info("Skipped job: #{job} as we couldn't parse it")
-            next
-          end
-
-          msg['retry_count'] = msg['retry_count'].to_i + 1
-
-          if retries_exhausted?(msg)
-            send_to_morgue(msg)
-          else
-            job = Sidekiq.dump_json(msg)
+        conn.scan_each(match: "#{WORKING_QUEUE_PREFIX}:queue:*", count: SCAN_COUNT) do |key|
+          original_queue, identity = extract_queue_and_identity(key)
 
-            …
+          next if original_queue.nil? || identity.nil?
 
-          end
+          clean_working_queue!(original_queue, key) if worker_dead?(identity, conn)
         end
-
-        Sidekiq.logger.info("Requeued #{count} dead jobs to #{original_queue}")
       end
     end
 
-    def …
+    def self.worker_dead?(identity, conn)
+      !conn.get(heartbeat_key(identity))
+    end
+
+    def self.heartbeat_key(identity)
+      "reliable-fetcher-heartbeat-#{identity.gsub(':', '-')}"
+    end
 
-    …
+    def self.working_queue_name(queue)
+      "#{WORKING_QUEUE_PREFIX}:#{queue}:#{identity}"
+    end
 
-    …
+    def self.interruption_exhausted?(msg)
+      return false if max_retries_after_interruption(msg['class']) < 0
 
-      msg['…
+      msg['interrupted_count'].to_i >= max_retries_after_interruption(msg['class'])
     end
 
-    def …
+    def self.max_retries_after_interruption(worker_class)
+      max_retries_after_interruption = nil
+
+      max_retries_after_interruption ||= begin
+        Object.const_get(worker_class).sidekiq_options[:max_retries_after_interruption]
+      rescue NameError
       end
+
+      max_retries_after_interruption ||= Sidekiq.options[:max_retries_after_interruption]
+      max_retries_after_interruption ||= DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION
+      max_retries_after_interruption
     end
 
-    def …
+    def self.send_to_quarantine(msg, multi_connection = nil)
       Sidekiq.logger.warn(
         class: msg['class'],
         jid: msg['jid'],
-        message: %(Reliable Fetcher: adding dead #{msg['class']} job #{msg['jid']})
+        message: %(Reliable Fetcher: adding dead #{msg['class']} job #{msg['jid']} to interrupted queue)
       )
 
-      Sidekiq::…
+      job = Sidekiq.dump_json(msg)
+      Sidekiq::InterruptedSet.new.put(job, connection: multi_connection)
     end
 
-    # …
-    # to …
-    def …
+    # If you want this method to run in the scope of a multi connection,
+    # you need to pass the connection in.
+    def self.requeue_job(queue, msg, conn)
+      with_connection(conn) do |conn|
+        conn.lpush(queue, Sidekiq.dump_json(msg))
+      end
 
-      Sidekiq.…
+      Sidekiq.logger.info(
+        message: "Pushed job #{msg['jid']} back to queue #{queue}",
+        jid: msg['jid'],
+        queue: queue
+      )
+    end
 
-    …
+    # Yield block with an existing connection or create another one
+    def self.with_connection(conn, &block)
+      return yield(conn) if conn
 
-      end
-    end
+      Sidekiq.redis { |conn| yield(conn) }
     end
 
-    …
+    attr_reader :cleanup_interval, :last_try_to_take_lease_at, :lease_interval,
+                :queues, :use_semi_reliable_fetch,
+                :strictly_ordered_queues
+
+    def initialize(options)
+      @cleanup_interval = options.fetch(:cleanup_interval, DEFAULT_CLEANUP_INTERVAL)
+      @lease_interval = options.fetch(:lease_interval, DEFAULT_LEASE_INTERVAL)
+      @last_try_to_take_lease_at = 0
+      @strictly_ordered_queues = !!options[:strict]
+      @queues = options[:queues].map { |q| "queue:#{q}" }
     end
 
+    def retrieve_work
+      self.class.clean_working_queues! if take_lease
+
+      retrieve_unit_of_work
+    end
+
+    def retrieve_unit_of_work
+      raise NotImplementedError,
+            "#{self.class} does not implement #{__method__}"
+    end
+
+    private
+
     def take_lease
       return unless allowed_to_take_a_lease?
 
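As an aside on the two regexes introduced above, the snippet below shows how `extract_queue_and_identity` splits a working-queue key. The literal `working` prefix is an assumption, since the value of `WORKING_QUEUE_PREFIX` is not visible in this diff, and the keys are made-up examples:

```ruby
# Illustration only. Assumes WORKING_QUEUE_PREFIX == 'working'; the constant's
# value is not shown in this diff.
WORKING_QUEUE_PREFIX = 'working'
WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*:[0-9a-f]*)\z/.freeze
LEGACY_WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*)\z/.freeze

# New-format key: a hostname:pid:nonce identity; the queue name itself may contain colons.
key = 'working:queue:deep:namespace:assigned:myhost:12345:0a1b2c3d4e5f'
queue, identity = key.scan(WORKING_QUEUE_REGEX).flatten
queue    # => "queue:deep:namespace:assigned"
identity # => "myhost:12345:0a1b2c3d4e5f"

# A legacy key (no nonce) fails the first regex and falls through to the second.
legacy_key = 'working:queue:assigned:myhost:12345'
legacy_key.scan(WORKING_QUEUE_REGEX).flatten        # => []
legacy_key.scan(LEGACY_WORKING_QUEUE_REGEX).flatten # => ["queue:assigned", "myhost:12345"]
```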
data/lib/sidekiq/interrupted_set.rb
ADDED
@@ -0,0 +1,47 @@
+require 'sidekiq/api'
+
+module Sidekiq
+  class InterruptedSet < ::Sidekiq::JobSet
+    DEFAULT_MAX_CAPACITY = 10_000
+    DEFAULT_MAX_TIMEOUT = 90 * 24 * 60 * 60 # 3 months
+
+    def initialize
+      super "interrupted"
+    end
+
+    def put(message, opts = {})
+      now = Time.now.to_f
+
+      with_multi_connection(opts[:connection]) do |conn|
+        conn.zadd(name, now.to_s, message)
+        conn.zremrangebyscore(name, '-inf', now - self.class.timeout)
+        conn.zremrangebyrank(name, 0, - self.class.max_jobs)
+      end
+
+      true
+    end
+
+    # Yield block inside an existing multi connection or create a new one
+    def with_multi_connection(conn, &block)
+      return yield(conn) if conn
+
+      Sidekiq.redis do |c|
+        c.multi do |multi|
+          yield(multi)
+        end
+      end
+    end
+
+    def retry_all
+      each(&:retry) while size > 0
+    end
+
+    def self.max_jobs
+      Sidekiq.options[:interrupted_max_jobs] || DEFAULT_MAX_CAPACITY
+    end
+
+    def self.timeout
+      Sidekiq.options[:interrupted_timeout_in_seconds] || DEFAULT_MAX_TIMEOUT
+    end
+  end
+end
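A brief usage sketch of the new set; everything here calls only methods visible in this diff (`put` is invoked by the fetcher itself, and `size` comes from the inherited `Sidekiq::JobSet`):

```ruby
# Sketch: inspecting and draining the interrupted set from a console.
require 'sidekiq'
require 'sidekiq/interrupted_set'

set = Sidekiq::InterruptedSet.new
set.size       # number of quarantined, interruption-exhausted jobs
set.retry_all  # re-enqueue every quarantined job for another attempt
```

Note how `put` trims on every insert: `zremrangebyscore` drops entries older than `timeout`, and `zremrangebyrank` caps the set at `max_jobs`, mirroring how Sidekiq's Dead set bounds itself.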
data/spec/base_reliable_fetch_spec.rb
CHANGED
@@ -5,10 +5,11 @@ require 'sidekiq/reliable_fetch'
 require 'sidekiq/semi_reliable_fetch'
 
 describe Sidekiq::BaseReliableFetch do
+  let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
+
   before { Sidekiq.redis(&:flushdb) }
 
   describe 'UnitOfWork' do
-    let(:job) { Sidekiq.dump_json({ class: 'Bob', args: [1, 2, 'foo'] }) }
     let(:fetcher) { Sidekiq::ReliableFetch.new(queues: ['foo']) }
 
     describe '#requeue' do
@@ -39,19 +40,42 @@ describe Sidekiq::BaseReliableFetch do
   end
 
   describe '.bulk_requeue' do
+    let!(:queue1) { Sidekiq::Queue.new('foo') }
+    let!(:queue2) { Sidekiq::Queue.new('bar') }
+
     it 'requeues the bulk' do
-      …
-      …
+      uow = described_class::UnitOfWork
+      jobs = [ uow.new('queue:foo', job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
+      described_class.bulk_requeue(jobs, queues: [])
 
-      expect(queue1.size).to eq …
-      expect(queue2.size).to eq …
+      expect(queue1.size).to eq 2
+      expect(queue2.size).to eq 1
     end
 
+    it 'puts jobs into interrupted queue' do
       uow = described_class::UnitOfWork
-      …
+      interrupted_job = Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo'], interrupted_count: 3)
+      jobs = [ uow.new('queue:foo', interrupted_job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
+      described_class.bulk_requeue(jobs, queues: [])
+
+      expect(queue1.size).to eq 1
+      expect(queue2.size).to eq 1
+      expect(Sidekiq::InterruptedSet.new.size).to eq 1
+    end
+
+    it 'does not put jobs into interrupted queue if it is disabled' do
+      Sidekiq.options[:max_retries_after_interruption] = -1
+
+      uow = described_class::UnitOfWork
+      interrupted_job = Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo'], interrupted_count: 3)
+      jobs = [ uow.new('queue:foo', interrupted_job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
       described_class.bulk_requeue(jobs, queues: [])
 
       expect(queue1.size).to eq 2
       expect(queue2.size).to eq 1
+      expect(Sidekiq::InterruptedSet.new.size).to eq 0
+
+      Sidekiq.options[:max_retries_after_interruption] = 3
     end
   end
 
@@ -63,7 +87,7 @@ describe Sidekiq::BaseReliableFetch do
     Sidekiq.redis do |conn|
       sleep 0.2 # Give the time to heartbeat thread to make a loop
 
-      heartbeat_key = described_class.heartbeat_key(…
+      heartbeat_key = described_class.heartbeat_key(described_class.identity)
       heartbeat = conn.get(heartbeat_key)
 
       expect(heartbeat).not_to be_nil
data/spec/fetch_shared_examples.rb
CHANGED
@@ -4,7 +4,7 @@ shared_examples 'a Sidekiq fetcher' do
   before { Sidekiq.redis(&:flushdb) }
 
   describe '#retrieve_work' do
-    let(:job) { Sidekiq.dump_json(…
+    let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
     let(:fetcher) { described_class.new(queues: ['assigned']) }
 
     it 'retrieves the job and puts it to working queue' do
@@ -24,17 +24,18 @@ shared_examples 'a Sidekiq fetcher' do
       expect(fetcher.retrieve_work).to be_nil
     end
 
-    it 'requeues jobs from dead working queue with incremented …
+    it 'requeues jobs from dead working queue with incremented interrupted_count' do
       Sidekiq.redis do |conn|
         conn.rpush(other_process_working_queue_name('assigned'), job)
       end
 
       expected_job = Sidekiq.load_json(job)
-      expected_job['…
+      expected_job['interrupted_count'] = 1
       expected_job = Sidekiq.dump_json(expected_job)
 
       uow = fetcher.retrieve_work
 
+      expect(uow).to_not be_nil
       expect(uow.job).to eq expected_job
 
       Sidekiq.redis do |conn|
@@ -42,6 +43,40 @@ shared_examples 'a Sidekiq fetcher' do
       end
     end
 
+    it 'ignores working queue keys in unknown formats' do
+      # Add a spurious non-numeric char segment at the end; this simulates any other
+      # incorrect form in general
+      malformed_key = "#{other_process_working_queue_name('assigned')}:X"
+      Sidekiq.redis do |conn|
+        conn.rpush(malformed_key, job)
+      end
+
+      uow = fetcher.retrieve_work
+
+      Sidekiq.redis do |conn|
+        expect(conn.llen(malformed_key)).to eq 1
+      end
+    end
+
+    it 'requeues jobs from legacy dead working queue with incremented interrupted_count' do
+      Sidekiq.redis do |conn|
+        conn.rpush(legacy_other_process_working_queue_name('assigned'), job)
+      end
+
+      expected_job = Sidekiq.load_json(job)
+      expected_job['interrupted_count'] = 1
+      expected_job = Sidekiq.dump_json(expected_job)
+
+      uow = fetcher.retrieve_work
+
+      expect(uow).to_not be_nil
+      expect(uow.job).to eq expected_job
+
+      Sidekiq.redis do |conn|
+        expect(conn.llen(legacy_other_process_working_queue_name('assigned'))).to eq 0
+      end
+    end
+
     it 'does not requeue jobs from live working queue' do
       working_queue = live_other_process_working_queue_name('assigned')
 
@@ -61,8 +96,7 @@ shared_examples 'a Sidekiq fetcher' do
     it 'does not clean up orphaned jobs more than once per cleanup interval' do
       Sidekiq.redis = Sidekiq::RedisConnection.create(url: REDIS_URL, size: 10)
 
-      …
-        .to receive(:clean_working_queues!).once
+      expect(described_class).to receive(:clean_working_queues!).once
 
       threads = 10.times.map do
         Thread.new do
@@ -98,6 +132,104 @@ shared_examples 'a Sidekiq fetcher' do
 
       expect(jobs).to include 'this_job_should_not_stuck'
     end
+
+    context 'with namespaced queues' do
+      let (:queue) { 'namespace:assigned' }
+      let (:fetcher) { described_class.new(queues: [queue]) }
+
+      it 'requeues jobs from dead namespaced working queue with incremented interrupted_count' do
+        Sidekiq.redis do |conn|
+          conn.rpush(other_process_working_queue_name(queue), job)
+        end
+
+        expected_job = Sidekiq.load_json(job)
+        expected_job['interrupted_count'] = 1
+        expected_job = Sidekiq.dump_json(expected_job)
+
+        uow = fetcher.retrieve_work
+
+        expect(uow).to_not be_nil
+        expect(uow.job).to eq expected_job
+
+        Sidekiq.redis do |conn|
+          expect(conn.llen(other_process_working_queue_name(queue))).to eq 0
+        end
+      end
+
+      it 'does not requeue jobs in a namespaced queue from live working queue' do
+        working_queue = live_other_process_working_queue_name(queue)
+
+        Sidekiq.redis do |conn|
+          conn.rpush(working_queue, job)
+        end
+
+        uow = fetcher.retrieve_work
+
+        expect(uow).to be_nil
+
+        Sidekiq.redis do |conn|
+          expect(conn.llen(working_queue)).to eq 1
+        end
+      end
+    end
+
+    context 'with deeper namespaced queues' do
+      let (:queue) { 'deep:namespace:assigned' }
+      let (:fetcher) { described_class.new(queues: [queue]) }
+
+      it 'requeues jobs from dead namespaced working queue with incremented interrupted_count' do
+        Sidekiq.redis do |conn|
+          conn.rpush(other_process_working_queue_name(queue), job)
+        end
+
+        expected_job = Sidekiq.load_json(job)
+        expected_job['interrupted_count'] = 1
+        expected_job = Sidekiq.dump_json(expected_job)
+
+        uow = fetcher.retrieve_work
+
+        expect(uow).to_not be_nil
+        expect(uow.job).to eq expected_job
+
+        Sidekiq.redis do |conn|
+          expect(conn.llen(other_process_working_queue_name(queue))).to eq 0
+        end
+      end
+
+      it 'does not requeue jobs in a deeper namespaced queue from live working queue' do
+        working_queue = live_other_process_working_queue_name(queue)
+
+        Sidekiq.redis do |conn|
+          conn.rpush(working_queue, job)
+        end
+
+        uow = fetcher.retrieve_work
+
+        expect(uow).to be_nil
+
+        Sidekiq.redis do |conn|
+          expect(conn.llen(working_queue)).to eq 1
+        end
+      end
+    end
+
+    context 'with short cleanup interval' do
+      let(:short_interval) { 1 }
+      let(:fetcher) { described_class.new(queues: queues, lease_interval: short_interval, cleanup_interval: short_interval) }
+
+      it 'requeues when there is no heartbeat' do
+        Sidekiq.redis { |conn| conn.rpush('queue:assigned', job) }
+        # Use of retrieve_work twice with a sleep ensures we have exercised the
+        # `identity` method to create the working queue key name and that it
+        # matches the patterns used in the cleanup
+        uow = fetcher.retrieve_work
        sleep(short_interval + 1)
+        uow = fetcher.retrieve_work
+
+        # Will only receive a UnitOfWork if the job was detected as failed and requeued
+        expect(uow).to_not be_nil
+      end
+    end
   end
 end
 
@@ -107,17 +239,23 @@ def working_queue_size(queue_name)
   end
 end
 
-def …
+def legacy_other_process_working_queue_name(queue)
   "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}"
 end
 
+
+def other_process_working_queue_name(queue)
+  "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}:#{::SecureRandom.hex(6)}"
+end
+
 def live_other_process_working_queue_name(queue)
   pid = ::Process.pid + 1
   hostname = Socket.gethostname
+  nonce = SecureRandom.hex(6)
 
   Sidekiq.redis do |conn|
-    conn.set(Sidekiq::BaseReliableFetch.heartbeat_key(hostname…
+    conn.set(Sidekiq::BaseReliableFetch.heartbeat_key("#{hostname}-#{pid}-#{nonce}"), 1)
   end
 
-  "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}"
+  "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}:#{nonce}"
 end
data/tests/README.md
CHANGED
@@ -18,18 +18,20 @@ You need to have redis server running on default HTTP port `6379`. To use other
 This tool spawns configured number of Sidekiq workers and when the amount of processed jobs is about half of origin
 number it will kill all the workers with `kill -9` and then it will spawn new workers again until all the jobs are processed. To track the process and counters we use Redis keys/counters.
 
-# How to run
+# How to run interruption tests
 
 ```
-cd …
-bundle exec ruby retry_test.rb
+cd tests/interruption
 
-# …
-bundle exec ruby …
+# Verify "KILL" signal
+bundle exec ruby test_kill_signal.rb
+
+# Verify "TERM" signal
+bundle exec ruby test_term_signal.rb
 ```
 
 It requires Redis to be running on 6379 port.
 
 ## How it works
 
-It spawns Sidekiq workers then creates a job that will kill itself after a moment. The reliable fetcher will bring it back. The purpose is to verify that job is run no more then …
+It spawns Sidekiq workers, then creates a job that will kill itself after a moment. The reliable fetcher will bring it back. The purpose is to verify that the job is run no more than the allowed number of times.
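Both test scripts below call `spawn_workers`, `stop_workers`, and a three-argument `assert` from `tests/support/utils.rb`, a file this diff does not include. A hypothetical sketch of compatible helpers, inferred purely from those call sites:

```ruby
# Hypothetical reconstruction of tests/support/utils.rb; the real file is not
# shown in this diff. Only the signatures are inferred from the call sites.

# Start `count` Sidekiq processes and return their pids.
def spawn_workers(count)
  Array.new(count) { spawn('bundle', 'exec', 'sidekiq', '-r', './config.rb') }
end

# Terminate the spawned workers and reap them.
def stop_workers(pids)
  pids.each { |pid| Process.kill('TERM', pid) }
  pids.each { |pid| Process.wait(pid) }
end

# Report whether `actual` matches `expected`, prefixed with `message`.
def assert(message, actual, expected)
  status = actual == expected ? 'SUCCESS' : 'FAILURE'
  puts "#{status}: #{message} (expected #{expected}, got #{actual})"
end
```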
data/tests/interruption/test_kill_signal.rb
RENAMED from data/tests/retry_test/no_retry_test.rb
@@ -4,21 +4,22 @@ require 'sidekiq'
 require_relative 'config'
 require_relative '../support/utils'
 
-…
+EXPECTED_NUM_TIMES_BEEN_RUN = 3
+NUM_WORKERS = EXPECTED_NUM_TIMES_BEEN_RUN + 1
 
 Sidekiq.redis(&:flushdb)
 
 pids = spawn_workers(NUM_WORKERS)
 
-…
+RetryTestWorker.perform_async
 
 sleep 300
 
 Sidekiq.redis do |redis|
   times_has_been_run = redis.get('times_has_been_run').to_i
-  assert 'The job has been run', times_has_been_run, …
+  assert 'The job has been run', times_has_been_run, EXPECTED_NUM_TIMES_BEEN_RUN
 end
 
-assert 'Found …
+assert 'Found interruption exhausted jobs', Sidekiq::InterruptedSet.new.size, 1
 
 stop_workers(pids)
data/tests/interruption/test_term_signal.rb
RENAMED from data/tests/retry_test/retry_test.rb
@@ -4,21 +4,22 @@ require 'sidekiq'
 require_relative 'config'
 require_relative '../support/utils'
 
-…
+EXPECTED_NUM_TIMES_BEEN_RUN = 3
+NUM_WORKERS = EXPECTED_NUM_TIMES_BEEN_RUN + 1
 
 Sidekiq.redis(&:flushdb)
 
 pids = spawn_workers(NUM_WORKERS)
 
-…
+RetryTestWorker.perform_async('TERM', 60)
 
 sleep 300
 
 Sidekiq.redis do |redis|
   times_has_been_run = redis.get('times_has_been_run').to_i
-  assert 'The job has been run', times_has_been_run, …
+  assert 'The job has been run', times_has_been_run, EXPECTED_NUM_TIMES_BEEN_RUN
 end
 
-assert 'Found …
+assert 'Found interruption exhausted jobs', Sidekiq::InterruptedSet.new.size, 1
 
 stop_workers(pids)
data/tests/interruption/worker.rb
ADDED
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+class RetryTestWorker
+  include Sidekiq::Worker
+
+  def perform(signal = 'KILL', wait_seconds = 1)
+    Sidekiq.redis do |redis|
+      redis.incr('times_has_been_run')
+    end
+
+    Process.kill(signal, Process.pid)
+
+    sleep wait_seconds
+  end
+end
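The two renamed test scripts differ only in how they enqueue this worker, which is what separates the two interruption paths: `KILL` terminates the process instantly, so the orphaned-job cleanup (`clean_working_queues!`) must recover the job, while `TERM` plus a long sleep leaves the job in progress during a graceful shutdown, so `bulk_requeue` picks it up:

```ruby
# From test_kill_signal.rb: defaults, the worker SIGKILLs itself immediately.
RetryTestWorker.perform_async

# From test_term_signal.rb: graceful TERM, then a 60s sleep keeps the job
# "in progress" while Sidekiq shuts down and requeues it.
RetryTestWorker.perform_async('TERM', 60)
```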
data/tests/reliability/config.rb
RENAMED from data/tests/reliability_test/config.rb (file without changes)

data/tests/reliability/reliability_test.rb
RENAMED from data/tests/reliability_test/reliability_test.rb (file without changes)

data/tests/reliability/worker.rb
RENAMED from data/tests/reliability_test/worker.rb (file without changes)
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: gitlab-sidekiq-fetcher
 version: !ruby/object:Gem::Version
-  version: 0.5.1.pre.alpha
+  version: 0.5.5
 platform: ruby
 authors:
 - TEA
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: …
+date: 2021-02-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sidekiq
@@ -42,6 +42,7 @@ files:
 - gitlab-sidekiq-fetcher.gemspec
 - lib/sidekiq-reliable-fetch.rb
 - lib/sidekiq/base_reliable_fetch.rb
+- lib/sidekiq/interrupted_set.rb
 - lib/sidekiq/reliable_fetch.rb
 - lib/sidekiq/semi_reliable_fetch.rb
 - spec/base_reliable_fetch_spec.rb
@@ -50,14 +51,13 @@ files:
 - spec/semi_reliable_fetch_spec.rb
 - spec/spec_helper.rb
 - tests/README.md
-- tests/…
-- tests/…
-- tests/…
-- tests/…
-- tests/…
-- tests/…
-- tests/retry_test/worker.rb
+- tests/interruption/config.rb
+- tests/interruption/test_kill_signal.rb
+- tests/interruption/test_term_signal.rb
+- tests/interruption/worker.rb
+- tests/reliability/config.rb
+- tests/reliability/reliability_test.rb
+- tests/reliability/worker.rb
 - tests/support/utils.rb
 homepage: https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/
 licenses:
@@ -74,12 +74,11 @@ required_ruby_version: !ruby/object:Gem::Requirement
   version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - "…
+  - - ">="
   - !ruby/object:Gem::Version
-    version: …
+    version: '0'
 requirements: []
-…
-rubygems_version: 2.5.2
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Reliable fetch extension for Sidekiq
data/tests/retry_test/no_retry_worker.rb
DELETED
@@ -1,21 +0,0 @@
-# frozen_string_literal: true
-
-class NoRetryTestWorker
-  include Sidekiq::Worker
-
-  sidekiq_options retry: false
-
-  sidekiq_retry_in do |count, exception|
-    1 # retry in one second
-  end
-
-  def perform
-    sleep 1
-
-    Sidekiq.redis do |redis|
-      redis.incr('times_has_been_run')
-    end
-
-    Process.kill('KILL', Process.pid) # Job suicide, OOM killer imitation
-  end
-end
data/tests/retry_test/worker.rb
DELETED
@@ -1,23 +0,0 @@
-# frozen_string_literal: true
-
-class RetryTestWorker
-  include Sidekiq::Worker
-
-  EXPECTED_NUM_TIMES_BEEN_RUN = 2
-
-  sidekiq_options retry: EXPECTED_NUM_TIMES_BEEN_RUN
-
-  sidekiq_retry_in do |count, exception|
-    1 # retry in one second
-  end
-
-  def perform
-    sleep 1
-
-    Sidekiq.redis do |redis|
-      redis.incr('times_has_been_run')
-    end
-
-    Process.kill('KILL', Process.pid) # Job suicide, OOM killer imitation
-  end
-end