gitlab-sidekiq-fetcher 0.5.1 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.gitlab-ci.yml +7 -9
- data/Gemfile +1 -1
- data/Gemfile.lock +9 -11
- data/README.md +13 -0
- data/gitlab-sidekiq-fetcher.gemspec +11 -11
- data/lib/sidekiq-reliable-fetch.rb +1 -0
- data/lib/sidekiq/base_reliable_fetch.rb +87 -78
- data/lib/sidekiq/interrupted_set.rb +47 -0
- data/lib/sidekiq/reliable_fetch.rb +4 -6
- data/spec/base_reliable_fetch_spec.rb +34 -9
- data/spec/fetch_shared_examples.rb +22 -7
- data/tests/README.md +8 -6
- data/tests/{retry_test → interruption}/config.rb +0 -1
- data/tests/{retry_test/no_retry_test.rb → interruption/test_kill_signal.rb} +5 -4
- data/tests/{retry_test/retry_test.rb → interruption/test_term_signal.rb} +5 -4
- data/tests/interruption/worker.rb +15 -0
- data/tests/{reliability_test → reliability}/config.rb +0 -0
- data/tests/{reliability_test → reliability}/reliability_test.rb +1 -1
- data/tests/reliability/worker.rb +14 -0
- data/tests/support/utils.rb +1 -1
- metadata +13 -13
- data/tests/reliability_test/worker.rb +0 -26
- data/tests/retry_test/no_retry_worker.rb +0 -21
- data/tests/retry_test/worker.rb +0 -23
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: efdc9461358b538f2b0692cc179639b802a6f0bf4959b73e1d0821a4c40f0713
|
4
|
+
data.tar.gz: ccbe252394f22e6fafb198ddba9481e23776e7cfa2848807e1e5c34a5269c76d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 010da1750797c367b01cd8e1d8df281fa70759e9035f0ebaea71f53349f3ebbe54bb04dea74694d50bb595938b049733119ffd5ee4c85d5fd42d719e71a07809
|
7
|
+
data.tar.gz: d9f89a5dc1e6d6117df1f3704159ea79160a038d8a7a47939d1c772f07c4f3be2426389cd63d14a0ecf469035cc583ba3ea943cf5a14c2232ee3cb44bf09b21c
|
data/.gitignore
CHANGED
data/.gitlab-ci.yml
CHANGED
@@ -25,7 +25,7 @@ rspec:
|
|
25
25
|
.integration:
|
26
26
|
stage: test
|
27
27
|
script:
|
28
|
-
- cd tests/
|
28
|
+
- cd tests/reliability
|
29
29
|
- bundle exec ruby reliability_test.rb
|
30
30
|
services:
|
31
31
|
- redis:alpine
|
@@ -40,30 +40,28 @@ integration_reliable:
|
|
40
40
|
variables:
|
41
41
|
JOB_FETCHER: reliable
|
42
42
|
|
43
|
-
|
44
43
|
integration_basic:
|
45
44
|
extends: .integration
|
46
45
|
allow_failure: yes
|
47
46
|
variables:
|
48
47
|
JOB_FETCHER: basic
|
49
48
|
|
50
|
-
|
49
|
+
kill_interruption:
|
51
50
|
stage: test
|
52
51
|
script:
|
53
|
-
- cd tests/
|
54
|
-
- bundle exec ruby
|
52
|
+
- cd tests/interruption
|
53
|
+
- bundle exec ruby test_kill_signal.rb
|
55
54
|
services:
|
56
55
|
- redis:alpine
|
57
56
|
|
58
|
-
|
57
|
+
term_interruption:
|
59
58
|
stage: test
|
60
59
|
script:
|
61
|
-
- cd tests/
|
62
|
-
- bundle exec ruby
|
60
|
+
- cd tests/interruption
|
61
|
+
- bundle exec ruby test_term_signal.rb
|
63
62
|
services:
|
64
63
|
- redis:alpine
|
65
64
|
|
66
|
-
|
67
65
|
# rubocop:
|
68
66
|
# script:
|
69
67
|
# - bundle exec rubocop
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -2,7 +2,7 @@ GEM
|
|
2
2
|
remote: https://rubygems.org/
|
3
3
|
specs:
|
4
4
|
coderay (1.1.2)
|
5
|
-
connection_pool (2.2.
|
5
|
+
connection_pool (2.2.3)
|
6
6
|
diff-lcs (1.3)
|
7
7
|
docile (1.3.1)
|
8
8
|
json (2.1.0)
|
@@ -10,10 +10,8 @@ GEM
|
|
10
10
|
pry (0.11.3)
|
11
11
|
coderay (~> 1.1.0)
|
12
12
|
method_source (~> 0.9.0)
|
13
|
-
rack (2.
|
14
|
-
|
15
|
-
rack
|
16
|
-
redis (4.0.2)
|
13
|
+
rack (2.2.3)
|
14
|
+
redis (4.2.1)
|
17
15
|
rspec (3.8.0)
|
18
16
|
rspec-core (~> 3.8.0)
|
19
17
|
rspec-expectations (~> 3.8.0)
|
@@ -27,10 +25,10 @@ GEM
|
|
27
25
|
diff-lcs (>= 1.2.0, < 2.0)
|
28
26
|
rspec-support (~> 3.8.0)
|
29
27
|
rspec-support (3.8.0)
|
30
|
-
sidekiq (
|
31
|
-
connection_pool (
|
32
|
-
rack
|
33
|
-
redis (>=
|
28
|
+
sidekiq (6.1.0)
|
29
|
+
connection_pool (>= 2.2.2)
|
30
|
+
rack (~> 2.0)
|
31
|
+
redis (>= 4.2.0)
|
34
32
|
simplecov (0.16.1)
|
35
33
|
docile (~> 1.1)
|
36
34
|
json (>= 1.8, < 3)
|
@@ -43,8 +41,8 @@ PLATFORMS
|
|
43
41
|
DEPENDENCIES
|
44
42
|
pry
|
45
43
|
rspec (~> 3)
|
46
|
-
sidekiq (~>
|
44
|
+
sidekiq (~> 6.1)
|
47
45
|
simplecov
|
48
46
|
|
49
47
|
BUNDLED WITH
|
50
|
-
1.17.
|
48
|
+
1.17.2
|
data/README.md
CHANGED
@@ -6,10 +6,23 @@ fetches from Redis.
|
|
6
6
|
|
7
7
|
It's based on https://github.com/TEA-ebook/sidekiq-reliable-fetch.
|
8
8
|
|
9
|
+
**IMPORTANT NOTE:** Since version `0.7.0` this gem works only with `sidekiq >= 6.1` (which introduced Fetch API breaking changes). Please use version `~> 0.5` if you use an older version of `sidekiq`.
|
10
|
+
|
9
11
|
There are two strategies implemented: [Reliable fetch](http://redis.io/commands/rpoplpush#pattern-reliable-queue) using `rpoplpush` command and
|
10
12
|
semi-reliable fetch that uses regular `brpop` and `lpush` to pick the job and put it to working queue. The main benefit of "Reliable" strategy is that `rpoplpush` is atomic, eliminating a race condition in which jobs can be lost.
|
11
13
|
However, it comes at a cost because `rpoplpush` can't watch multiple lists at the same time so we need to iterate over the entire queue list which significantly increases pressure on Redis when there are more than a few queues. The "semi-reliable" strategy is much more reliable than the default Sidekiq fetcher, though. Compared to the reliable fetch strategy, it does not increase pressure on Redis significantly.
|
12
14
|
|
15
|
+
### Interruption handling
|
16
|
+
|
17
|
+
Sidekiq expects any job to report success or to fail. In the latter case, Sidekiq puts a `retry_count` counter
|
18
|
+
into the job and keeps re-running the job until the counter reaches the maximum allowed value. When the job has
|
19
|
+
not been given a chance to finish its work (to report success or failure), for example, when it was killed forcibly or when the job was requeued after receiving a TERM signal, the standard retry mechanism does not come into play and the job will be retried indefinitely. This is why Reliable Fetcher maintains a special counter `interrupted_count`
|
20
|
+
which is used to limit the number of such retries. In both cases, Reliable Fetcher increments the `interrupted_count` counter and rejects the job from running again when the counter reaches `max_retries_after_interruption` (default: 3).
|
21
|
+
Such a job will be put into the `interrupted` queue. This queue mostly behaves like the Sidekiq Dead queue, so it only stores a limited number of jobs for a limited time. As with the Dead queue, all the limits are configurable via the `interrupted_max_jobs` (default: 10_000) and `interrupted_timeout_in_seconds` (default: 3 months) Sidekiq option keys.
|
22
|
+
|
23
|
+
You can also disable special handling of interrupted jobs by setting `max_retries_after_interruption` to `-1`.
|
24
|
+
In this case, interrupted jobs will be run without any limits from Reliable Fetcher and they won't be put into Interrupted queue.
|
25
|
+
|
13
26
|
|
14
27
|
## Installation
|
15
28
|
|
@@ -1,14 +1,14 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
|
-
s.name
|
3
|
-
s.version
|
4
|
-
s.authors
|
5
|
-
s.email
|
6
|
-
s.license
|
7
|
-
s.homepage
|
8
|
-
s.summary
|
9
|
-
s.description
|
2
|
+
s.name = 'gitlab-sidekiq-fetcher'
|
3
|
+
s.version = '0.7.1'
|
4
|
+
s.authors = ['TEA', 'GitLab']
|
5
|
+
s.email = 'valery@gitlab.com'
|
6
|
+
s.license = 'LGPL-3.0'
|
7
|
+
s.homepage = 'https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/'
|
8
|
+
s.summary = 'Reliable fetch extension for Sidekiq'
|
9
|
+
s.description = 'Redis reliable queue pattern implemented in Sidekiq'
|
10
10
|
s.require_paths = ['lib']
|
11
|
-
s.files
|
12
|
-
s.test_files
|
13
|
-
s.add_dependency 'sidekiq', '~>
|
11
|
+
s.files = `git ls-files`.split($\)
|
12
|
+
s.test_files = []
|
13
|
+
s.add_dependency 'sidekiq', '~> 6.1'
|
14
14
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
3
|
+
require_relative 'interrupted_set'
|
4
4
|
|
5
5
|
module Sidekiq
|
6
6
|
class BaseReliableFetch
|
@@ -18,6 +18,9 @@ module Sidekiq
|
|
18
18
|
# Defines the COUNT parameter that will be passed to Redis SCAN command
|
19
19
|
SCAN_COUNT = 1000
|
20
20
|
|
21
|
+
# How many times a job can be interrupted
|
22
|
+
DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION = 3
|
23
|
+
|
21
24
|
UnitOfWork = Struct.new(:queue, :job) do
|
22
25
|
def acknowledge
|
23
26
|
Sidekiq.redis { |conn| conn.lrem(Sidekiq::BaseReliableFetch.working_queue_name(queue), 1, job) }
|
@@ -38,11 +41,13 @@ module Sidekiq
|
|
38
41
|
end
|
39
42
|
|
40
43
|
def self.setup_reliable_fetch!(config)
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
44
|
+
fetch_strategy = if config.options[:semi_reliable_fetch]
|
45
|
+
Sidekiq::SemiReliableFetch
|
46
|
+
else
|
47
|
+
Sidekiq::ReliableFetch
|
48
|
+
end
|
49
|
+
|
50
|
+
config.options[:fetch] = fetch_strategy.new(config.options)
|
46
51
|
|
47
52
|
Sidekiq.logger.info('GitLab reliable fetch activated!')
|
48
53
|
|
@@ -81,23 +86,8 @@ module Sidekiq
|
|
81
86
|
Sidekiq.logger.debug("Heartbeat for hostname: #{hostname} and pid: #{pid}")
|
82
87
|
end
|
83
88
|
|
84
|
-
def self.
|
85
|
-
|
86
|
-
|
87
|
-
Sidekiq.logger.debug('Re-queueing terminated jobs')
|
88
|
-
|
89
|
-
Sidekiq.redis do |conn|
|
90
|
-
inprogress.each do |unit_of_work|
|
91
|
-
conn.multi do |multi|
|
92
|
-
multi.lpush(unit_of_work.queue, unit_of_work.job)
|
93
|
-
multi.lrem(working_queue_name(unit_of_work.queue), 1, unit_of_work.job)
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
Sidekiq.logger.info("Pushed #{inprogress.size} jobs back to Redis")
|
99
|
-
rescue => e
|
100
|
-
Sidekiq.logger.warn("Failed to requeue #{inprogress.size} jobs: #{e.message}")
|
89
|
+
def self.worker_dead?(hostname, pid, conn)
|
90
|
+
!conn.get(heartbeat_key(hostname, pid))
|
101
91
|
end
|
102
92
|
|
103
93
|
def self.heartbeat_key(hostname, pid)
|
@@ -113,6 +103,8 @@ module Sidekiq
|
|
113
103
|
:strictly_ordered_queues
|
114
104
|
|
115
105
|
def initialize(options)
|
106
|
+
raise ArgumentError, 'missing queue list' unless options[:queues]
|
107
|
+
|
116
108
|
@cleanup_interval = options.fetch(:cleanup_interval, DEFAULT_CLEANUP_INTERVAL)
|
117
109
|
@lease_interval = options.fetch(:lease_interval, DEFAULT_LEASE_INTERVAL)
|
118
110
|
@last_try_to_take_lease_at = 0
|
@@ -128,99 +120,116 @@ module Sidekiq
|
|
128
120
|
|
129
121
|
def retrieve_unit_of_work
|
130
122
|
raise NotImplementedError,
|
131
|
-
|
123
|
+
"#{self.class} does not implement #{__method__}"
|
132
124
|
end
|
133
125
|
|
134
|
-
|
135
|
-
|
136
|
-
def clean_working_queue!(working_queue)
|
137
|
-
original_queue = working_queue.gsub(/#{WORKING_QUEUE_PREFIX}:|:[^:]*:[0-9]*\z/, '')
|
126
|
+
def bulk_requeue(inprogress, _options)
|
127
|
+
return if inprogress.empty?
|
138
128
|
|
139
129
|
Sidekiq.redis do |conn|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
msg = begin
|
144
|
-
Sidekiq.load_json(job)
|
145
|
-
rescue => e
|
146
|
-
Sidekiq.logger.info("Skipped job: #{job} as we couldn't parse it")
|
147
|
-
next
|
148
|
-
end
|
149
|
-
|
150
|
-
msg['retry_count'] = msg['retry_count'].to_i + 1
|
151
|
-
|
152
|
-
if retries_exhausted?(msg)
|
153
|
-
send_to_morgue(msg)
|
154
|
-
else
|
155
|
-
job = Sidekiq.dump_json(msg)
|
156
|
-
|
157
|
-
conn.lpush(original_queue, job)
|
130
|
+
inprogress.each do |unit_of_work|
|
131
|
+
conn.multi do |multi|
|
132
|
+
preprocess_interrupted_job(unit_of_work.job, unit_of_work.queue, multi)
|
158
133
|
|
159
|
-
|
134
|
+
multi.lrem(self.class.working_queue_name(unit_of_work.queue), 1, unit_of_work.job)
|
160
135
|
end
|
161
136
|
end
|
162
|
-
|
163
|
-
Sidekiq.logger.info("Requeued #{count} dead jobs to #{original_queue}")
|
164
137
|
end
|
138
|
+
rescue => e
|
139
|
+
Sidekiq.logger.warn("Failed to requeue #{inprogress.size} jobs: #{e.message}")
|
165
140
|
end
|
166
141
|
|
167
|
-
|
168
|
-
# `retry` parameter can be empty when job is running the first time and when
|
169
|
-
# it's not specified in worker class explicitly.
|
170
|
-
# In that case, the default parameter gets injected into the job when
|
171
|
-
# it fails the first time in JobRetry#local.
|
172
|
-
# We should handle the case when `retry` is explicitly set to false
|
173
|
-
return true if msg['retry'] === false
|
174
|
-
|
175
|
-
max_retries_default = Sidekiq.options.fetch(:max_retries, Sidekiq::JobRetry::DEFAULT_MAX_RETRY_ATTEMPTS)
|
176
|
-
|
177
|
-
max_retry_attempts = retry_attempts_from(msg['retry'], max_retries_default)
|
142
|
+
private
|
178
143
|
|
179
|
-
|
180
|
-
|
144
|
+
def preprocess_interrupted_job(job, queue, conn = nil)
|
145
|
+
msg = Sidekiq.load_json(job)
|
146
|
+
msg['interrupted_count'] = msg['interrupted_count'].to_i + 1
|
181
147
|
|
182
|
-
|
183
|
-
|
184
|
-
msg_retry
|
148
|
+
if interruption_exhausted?(msg)
|
149
|
+
send_to_quarantine(msg, conn)
|
185
150
|
else
|
186
|
-
|
151
|
+
requeue_job(queue, msg, conn)
|
187
152
|
end
|
188
153
|
end
|
189
154
|
|
190
|
-
|
191
|
-
|
192
|
-
|
155
|
+
# If you want this method to be run in a scope of multi connection
|
156
|
+
# you need to pass it
|
157
|
+
def requeue_job(queue, msg, conn)
|
158
|
+
with_connection(conn) do |conn|
|
159
|
+
conn.lpush(queue, Sidekiq.dump_json(msg))
|
160
|
+
end
|
161
|
+
|
162
|
+
Sidekiq.logger.info(
|
163
|
+
message: "Pushed job #{msg['jid']} back to queue #{queue}",
|
193
164
|
jid: msg['jid'],
|
194
|
-
|
165
|
+
queue: queue
|
195
166
|
)
|
196
|
-
|
197
|
-
payload = Sidekiq.dump_json(msg)
|
198
|
-
Sidekiq::DeadSet.new.kill(payload, notify_failure: false)
|
199
167
|
end
|
200
168
|
|
201
169
|
# Detect "old" jobs and requeue them because the worker they were assigned
|
202
170
|
# to probably failed miserably.
|
203
171
|
def clean_working_queues!
|
204
|
-
Sidekiq.logger.info(
|
172
|
+
Sidekiq.logger.info('Cleaning working queues')
|
205
173
|
|
206
174
|
Sidekiq.redis do |conn|
|
207
175
|
conn.scan_each(match: "#{WORKING_QUEUE_PREFIX}:queue:*", count: SCAN_COUNT) do |key|
|
208
176
|
# Example: "working:name_of_the_job:queue:{hostname}:{PID}"
|
209
177
|
hostname, pid = key.scan(/:([^:]*):([0-9]*)\z/).flatten
|
210
178
|
|
211
|
-
|
179
|
+
next if hostname.nil? || pid.nil?
|
212
180
|
|
213
|
-
clean_working_queue!(key) if worker_dead?(hostname, pid)
|
181
|
+
clean_working_queue!(key) if self.class.worker_dead?(hostname, pid, conn)
|
214
182
|
end
|
215
183
|
end
|
216
184
|
end
|
217
185
|
|
218
|
-
def
|
186
|
+
def clean_working_queue!(working_queue)
|
187
|
+
original_queue = working_queue.gsub(/#{WORKING_QUEUE_PREFIX}:|:[^:]*:[0-9]*\z/, '')
|
188
|
+
|
219
189
|
Sidekiq.redis do |conn|
|
220
|
-
|
190
|
+
while job = conn.rpop(working_queue)
|
191
|
+
preprocess_interrupted_job(job, original_queue)
|
192
|
+
end
|
221
193
|
end
|
222
194
|
end
|
223
195
|
|
196
|
+
def interruption_exhausted?(msg)
|
197
|
+
return false if max_retries_after_interruption(msg['class']) < 0
|
198
|
+
|
199
|
+
msg['interrupted_count'].to_i >= max_retries_after_interruption(msg['class'])
|
200
|
+
end
|
201
|
+
|
202
|
+
def max_retries_after_interruption(worker_class)
|
203
|
+
max_retries_after_interruption = nil
|
204
|
+
|
205
|
+
max_retries_after_interruption ||= begin
|
206
|
+
Object.const_get(worker_class).sidekiq_options[:max_retries_after_interruption]
|
207
|
+
rescue NameError
|
208
|
+
end
|
209
|
+
|
210
|
+
max_retries_after_interruption ||= Sidekiq.options[:max_retries_after_interruption]
|
211
|
+
max_retries_after_interruption ||= DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION
|
212
|
+
max_retries_after_interruption
|
213
|
+
end
|
214
|
+
|
215
|
+
def send_to_quarantine(msg, multi_connection = nil)
|
216
|
+
Sidekiq.logger.warn(
|
217
|
+
class: msg['class'],
|
218
|
+
jid: msg['jid'],
|
219
|
+
message: %(Reliable Fetcher: adding dead #{msg['class']} job #{msg['jid']} to interrupted queue)
|
220
|
+
)
|
221
|
+
|
222
|
+
job = Sidekiq.dump_json(msg)
|
223
|
+
Sidekiq::InterruptedSet.new.put(job, connection: multi_connection)
|
224
|
+
end
|
225
|
+
|
226
|
+
# Yield block with an existing connection or creates another one
|
227
|
+
def with_connection(conn)
|
228
|
+
return yield(conn) if conn
|
229
|
+
|
230
|
+
Sidekiq.redis { |redis_conn| yield(redis_conn) }
|
231
|
+
end
|
232
|
+
|
224
233
|
def take_lease
|
225
234
|
return unless allowed_to_take_a_lease?
|
226
235
|
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'sidekiq/api'
|
2
|
+
|
3
|
+
module Sidekiq
|
4
|
+
class InterruptedSet < ::Sidekiq::JobSet
|
5
|
+
DEFAULT_MAX_CAPACITY = 10_000
|
6
|
+
DEFAULT_MAX_TIMEOUT = 90 * 24 * 60 * 60 # 3 months
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
super "interrupted"
|
10
|
+
end
|
11
|
+
|
12
|
+
def put(message, opts = {})
|
13
|
+
now = Time.now.to_f
|
14
|
+
|
15
|
+
with_multi_connection(opts[:connection]) do |conn|
|
16
|
+
conn.zadd(name, now.to_s, message)
|
17
|
+
conn.zremrangebyscore(name, '-inf', now - self.class.timeout)
|
18
|
+
conn.zremrangebyrank(name, 0, - self.class.max_jobs)
|
19
|
+
end
|
20
|
+
|
21
|
+
true
|
22
|
+
end
|
23
|
+
|
24
|
+
# Yield block inside an existing multi connection or creates new one
|
25
|
+
def with_multi_connection(conn, &block)
|
26
|
+
return yield(conn) if conn
|
27
|
+
|
28
|
+
Sidekiq.redis do |c|
|
29
|
+
c.multi do |multi|
|
30
|
+
yield(multi)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def retry_all
|
36
|
+
each(&:retry) while size > 0
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.max_jobs
|
40
|
+
Sidekiq.options[:interrupted_max_jobs] || DEFAULT_MAX_CAPACITY
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.timeout
|
44
|
+
Sidekiq.options[:interrupted_timeout_in_seconds] || DEFAULT_MAX_TIMEOUT
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -6,23 +6,21 @@ module Sidekiq
|
|
6
6
|
# we inject a regular sleep into the loop.
|
7
7
|
RELIABLE_FETCH_IDLE_TIMEOUT = 5 # seconds
|
8
8
|
|
9
|
-
attr_reader :
|
9
|
+
attr_reader :queues_size
|
10
10
|
|
11
11
|
def initialize(options)
|
12
12
|
super
|
13
13
|
|
14
|
+
@queues = queues.uniq if strictly_ordered_queues
|
14
15
|
@queues_size = queues.size
|
15
|
-
@queues_iterator = queues.cycle
|
16
16
|
end
|
17
17
|
|
18
18
|
private
|
19
19
|
|
20
20
|
def retrieve_unit_of_work
|
21
|
-
|
22
|
-
|
23
|
-
queues_size.times do
|
24
|
-
queue = queues_iterator.next
|
21
|
+
queues_list = strictly_ordered_queues ? queues : queues.shuffle
|
25
22
|
|
23
|
+
queues_list.each do |queue|
|
26
24
|
work = Sidekiq.redis do |conn|
|
27
25
|
conn.rpoplpush(queue, self.class.working_queue_name(queue))
|
28
26
|
end
|
@@ -5,10 +5,11 @@ require 'sidekiq/reliable_fetch'
|
|
5
5
|
require 'sidekiq/semi_reliable_fetch'
|
6
6
|
|
7
7
|
describe Sidekiq::BaseReliableFetch do
|
8
|
+
let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
|
9
|
+
|
8
10
|
before { Sidekiq.redis(&:flushdb) }
|
9
11
|
|
10
12
|
describe 'UnitOfWork' do
|
11
|
-
let(:job) { Sidekiq.dump_json({ class: 'Bob', args: [1, 2, 'foo'] }) }
|
12
13
|
let(:fetcher) { Sidekiq::ReliableFetch.new(queues: ['foo']) }
|
13
14
|
|
14
15
|
describe '#requeue' do
|
@@ -38,25 +39,49 @@ describe Sidekiq::BaseReliableFetch do
|
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
|
-
describe '
|
42
|
+
describe '#bulk_requeue' do
|
43
|
+
let(:options) { { queues: %w[foo bar] } }
|
44
|
+
let!(:queue1) { Sidekiq::Queue.new('foo') }
|
45
|
+
let!(:queue2) { Sidekiq::Queue.new('bar') }
|
46
|
+
|
42
47
|
it 'requeues the bulk' do
|
43
|
-
|
44
|
-
|
48
|
+
uow = described_class::UnitOfWork
|
49
|
+
jobs = [ uow.new('queue:foo', job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
|
50
|
+
described_class.new(options).bulk_requeue(jobs, nil)
|
45
51
|
|
46
|
-
expect(queue1.size).to eq
|
47
|
-
expect(queue2.size).to eq
|
52
|
+
expect(queue1.size).to eq 2
|
53
|
+
expect(queue2.size).to eq 1
|
54
|
+
end
|
48
55
|
|
56
|
+
it 'puts jobs into interrupted queue' do
|
49
57
|
uow = described_class::UnitOfWork
|
50
|
-
|
51
|
-
|
58
|
+
interrupted_job = Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo'], interrupted_count: 3)
|
59
|
+
jobs = [ uow.new('queue:foo', interrupted_job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
|
60
|
+
described_class.new(options).bulk_requeue(jobs, nil)
|
61
|
+
|
62
|
+
expect(queue1.size).to eq 1
|
63
|
+
expect(queue2.size).to eq 1
|
64
|
+
expect(Sidekiq::InterruptedSet.new.size).to eq 1
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'does not put jobs into interrupted queue if it is disabled' do
|
68
|
+
Sidekiq.options[:max_retries_after_interruption] = -1
|
69
|
+
|
70
|
+
uow = described_class::UnitOfWork
|
71
|
+
interrupted_job = Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo'], interrupted_count: 3)
|
72
|
+
jobs = [ uow.new('queue:foo', interrupted_job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
|
73
|
+
described_class.new(options).bulk_requeue(jobs, nil)
|
52
74
|
|
53
75
|
expect(queue1.size).to eq 2
|
54
76
|
expect(queue2.size).to eq 1
|
77
|
+
expect(Sidekiq::InterruptedSet.new.size).to eq 0
|
78
|
+
|
79
|
+
Sidekiq.options[:max_retries_after_interruption] = 3
|
55
80
|
end
|
56
81
|
end
|
57
82
|
|
58
83
|
it 'sets heartbeat' do
|
59
|
-
config = double(:sidekiq_config, options: {})
|
84
|
+
config = double(:sidekiq_config, options: { queues: %w[foo bar] })
|
60
85
|
|
61
86
|
heartbeat_thread = described_class.setup_reliable_fetch!(config)
|
62
87
|
|
@@ -4,8 +4,8 @@ shared_examples 'a Sidekiq fetcher' do
|
|
4
4
|
before { Sidekiq.redis(&:flushdb) }
|
5
5
|
|
6
6
|
describe '#retrieve_work' do
|
7
|
-
let(:job) { Sidekiq.dump_json(
|
8
|
-
let(:fetcher) { described_class.new(queues:
|
7
|
+
let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
|
8
|
+
let(:fetcher) { described_class.new(queues: queues) }
|
9
9
|
|
10
10
|
it 'retrieves the job and puts it to working queue' do
|
11
11
|
Sidekiq.redis { |conn| conn.rpush('queue:assigned', job) }
|
@@ -24,13 +24,13 @@ shared_examples 'a Sidekiq fetcher' do
|
|
24
24
|
expect(fetcher.retrieve_work).to be_nil
|
25
25
|
end
|
26
26
|
|
27
|
-
it 'requeues jobs from dead working queue with incremented
|
27
|
+
it 'requeues jobs from dead working queue with incremented interrupted_count' do
|
28
28
|
Sidekiq.redis do |conn|
|
29
29
|
conn.rpush(other_process_working_queue_name('assigned'), job)
|
30
30
|
end
|
31
31
|
|
32
32
|
expected_job = Sidekiq.load_json(job)
|
33
|
-
expected_job['
|
33
|
+
expected_job['interrupted_count'] = 1
|
34
34
|
expected_job = Sidekiq.dump_json(expected_job)
|
35
35
|
|
36
36
|
uow = fetcher.retrieve_work
|
@@ -42,6 +42,22 @@ shared_examples 'a Sidekiq fetcher' do
|
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
45
|
+
it 'ignores working queue keys in unknown formats' do
|
46
|
+
# Add a spurious non-numeric char segment at the end; this simulates any other
|
47
|
+
# incorrect form in general
|
48
|
+
malformed_key = "#{other_process_working_queue_name('assigned')}:X"
|
49
|
+
Sidekiq.redis do |conn|
|
50
|
+
conn.rpush(malformed_key, job)
|
51
|
+
end
|
52
|
+
|
53
|
+
uow = fetcher.retrieve_work
|
54
|
+
|
55
|
+
Sidekiq.redis do |conn|
|
56
|
+
expect(conn.llen(malformed_key)).to eq 1
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
|
45
61
|
it 'does not requeue jobs from live working queue' do
|
46
62
|
working_queue = live_other_process_working_queue_name('assigned')
|
47
63
|
|
@@ -61,12 +77,11 @@ shared_examples 'a Sidekiq fetcher' do
|
|
61
77
|
it 'does not clean up orphaned jobs more than once per cleanup interval' do
|
62
78
|
Sidekiq.redis = Sidekiq::RedisConnection.create(url: REDIS_URL, size: 10)
|
63
79
|
|
64
|
-
|
65
|
-
.to receive(:clean_working_queues!).once
|
80
|
+
expect(fetcher).to receive(:clean_working_queues!).once
|
66
81
|
|
67
82
|
threads = 10.times.map do
|
68
83
|
Thread.new do
|
69
|
-
|
84
|
+
fetcher.retrieve_work
|
70
85
|
end
|
71
86
|
end
|
72
87
|
|
data/tests/README.md
CHANGED
@@ -18,18 +18,20 @@ You need to have redis server running on default HTTP port `6379`. To use other
|
|
18
18
|
This tool spawns the configured number of Sidekiq workers and when the number of processed jobs is about half of the original
|
19
19
|
number it will kill all the workers with `kill -9` and then it will spawn new workers again until all the jobs are processed. To track the process and counters we use Redis keys/counters.
|
20
20
|
|
21
|
-
# How to run
|
21
|
+
# How to run interruption tests
|
22
22
|
|
23
23
|
```
|
24
|
-
cd
|
25
|
-
bundle exec ruby retry_test.rb
|
24
|
+
cd tests/interruption
|
26
25
|
|
27
|
-
#
|
28
|
-
bundle exec ruby
|
26
|
+
# Verify "KILL" signal
|
27
|
+
bundle exec ruby test_kill_signal.rb
|
28
|
+
|
29
|
+
# Verify "TERM" signal
|
30
|
+
bundle exec ruby test_term_signal.rb
|
29
31
|
```
|
30
32
|
|
31
33
|
It requires Redis to be running on 6379 port.
|
32
34
|
|
33
35
|
## How it works
|
34
36
|
|
35
|
-
It spawns Sidekiq workers then creates a job that will kill itself after a moment. The reliable fetcher will bring it back. The purpose is to verify that job is run no more then
|
37
|
+
It spawns Sidekiq workers then creates a job that will kill itself after a moment. The reliable fetcher will bring it back. The purpose is to verify that the job is run no more than the allowed number of times.
|
@@ -4,21 +4,22 @@ require 'sidekiq'
|
|
4
4
|
require_relative 'config'
|
5
5
|
require_relative '../support/utils'
|
6
6
|
|
7
|
-
|
7
|
+
EXPECTED_NUM_TIMES_BEEN_RUN = 3
|
8
|
+
NUM_WORKERS = EXPECTED_NUM_TIMES_BEEN_RUN + 1
|
8
9
|
|
9
10
|
Sidekiq.redis(&:flushdb)
|
10
11
|
|
11
12
|
pids = spawn_workers(NUM_WORKERS)
|
12
13
|
|
13
|
-
|
14
|
+
RetryTestWorker.perform_async
|
14
15
|
|
15
16
|
sleep 300
|
16
17
|
|
17
18
|
Sidekiq.redis do |redis|
|
18
19
|
times_has_been_run = redis.get('times_has_been_run').to_i
|
19
|
-
assert 'The job has been run', times_has_been_run,
|
20
|
+
assert 'The job has been run', times_has_been_run, EXPECTED_NUM_TIMES_BEEN_RUN
|
20
21
|
end
|
21
22
|
|
22
|
-
assert 'Found
|
23
|
+
assert 'Found interruption exhausted jobs', Sidekiq::InterruptedSet.new.size, 1
|
23
24
|
|
24
25
|
stop_workers(pids)
|
@@ -4,21 +4,22 @@ require 'sidekiq'
|
|
4
4
|
require_relative 'config'
|
5
5
|
require_relative '../support/utils'
|
6
6
|
|
7
|
-
|
7
|
+
EXPECTED_NUM_TIMES_BEEN_RUN = 3
|
8
|
+
NUM_WORKERS = EXPECTED_NUM_TIMES_BEEN_RUN + 1
|
8
9
|
|
9
10
|
Sidekiq.redis(&:flushdb)
|
10
11
|
|
11
12
|
pids = spawn_workers(NUM_WORKERS)
|
12
13
|
|
13
|
-
|
14
|
+
RetryTestWorker.perform_async('TERM', 60)
|
14
15
|
|
15
16
|
sleep 300
|
16
17
|
|
17
18
|
Sidekiq.redis do |redis|
|
18
19
|
times_has_been_run = redis.get('times_has_been_run').to_i
|
19
|
-
assert 'The job has been run', times_has_been_run,
|
20
|
+
assert 'The job has been run', times_has_been_run, EXPECTED_NUM_TIMES_BEEN_RUN
|
20
21
|
end
|
21
22
|
|
22
|
-
assert 'Found
|
23
|
+
assert 'Found interruption exhausted jobs', Sidekiq::InterruptedSet.new.size, 1
|
23
24
|
|
24
25
|
stop_workers(pids)
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class RetryTestWorker
|
4
|
+
include Sidekiq::Worker
|
5
|
+
|
6
|
+
def perform(signal = 'KILL', wait_seconds = 1)
|
7
|
+
Sidekiq.redis do |redis|
|
8
|
+
redis.incr('times_has_been_run')
|
9
|
+
end
|
10
|
+
|
11
|
+
Process.kill(signal, Process.pid)
|
12
|
+
|
13
|
+
sleep wait_seconds
|
14
|
+
end
|
15
|
+
end
|
File without changes
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class ReliabilityTestWorker
|
4
|
+
include Sidekiq::Worker
|
5
|
+
|
6
|
+
def perform
|
7
|
+
# To mimic long running job and to increase the probability of losing the job
|
8
|
+
sleep 1
|
9
|
+
|
10
|
+
Sidekiq.redis do |redis|
|
11
|
+
redis.lpush(REDIS_FINISHED_LIST, jid)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/tests/support/utils.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gitlab-sidekiq-fetcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TEA
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-02-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: sidekiq
|
@@ -17,14 +17,14 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: '
|
20
|
+
version: '6.1'
|
21
21
|
type: :runtime
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
25
|
- - "~>"
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version: '
|
27
|
+
version: '6.1'
|
28
28
|
description: Redis reliable queue pattern implemented in Sidekiq
|
29
29
|
email: valery@gitlab.com
|
30
30
|
executables: []
|
@@ -42,6 +42,7 @@ files:
|
|
42
42
|
- gitlab-sidekiq-fetcher.gemspec
|
43
43
|
- lib/sidekiq-reliable-fetch.rb
|
44
44
|
- lib/sidekiq/base_reliable_fetch.rb
|
45
|
+
- lib/sidekiq/interrupted_set.rb
|
45
46
|
- lib/sidekiq/reliable_fetch.rb
|
46
47
|
- lib/sidekiq/semi_reliable_fetch.rb
|
47
48
|
- spec/base_reliable_fetch_spec.rb
|
@@ -50,14 +51,13 @@ files:
|
|
50
51
|
- spec/semi_reliable_fetch_spec.rb
|
51
52
|
- spec/spec_helper.rb
|
52
53
|
- tests/README.md
|
53
|
-
- tests/
|
54
|
-
- tests/
|
55
|
-
- tests/
|
56
|
-
- tests/
|
57
|
-
- tests/
|
58
|
-
- tests/
|
59
|
-
- tests/
|
60
|
-
- tests/retry_test/worker.rb
|
54
|
+
- tests/interruption/config.rb
|
55
|
+
- tests/interruption/test_kill_signal.rb
|
56
|
+
- tests/interruption/test_term_signal.rb
|
57
|
+
- tests/interruption/worker.rb
|
58
|
+
- tests/reliability/config.rb
|
59
|
+
- tests/reliability/reliability_test.rb
|
60
|
+
- tests/reliability/worker.rb
|
61
61
|
- tests/support/utils.rb
|
62
62
|
homepage: https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/
|
63
63
|
licenses:
|
@@ -78,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '0'
|
80
80
|
requirements: []
|
81
|
-
rubygems_version: 3.
|
81
|
+
rubygems_version: 3.1.4
|
82
82
|
signing_key:
|
83
83
|
specification_version: 4
|
84
84
|
summary: Reliable fetch extension for Sidekiq
|
@@ -1,26 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class ReliabilityTestWorker
|
4
|
-
include Sidekiq::Worker
|
5
|
-
|
6
|
-
def perform
|
7
|
-
# To mimic long running job and to increase the probability of losing the job
|
8
|
-
sleep 1
|
9
|
-
|
10
|
-
Sidekiq.redis do |redis|
|
11
|
-
redis.lpush(REDIS_FINISHED_LIST, get_sidekiq_job_id)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
def get_sidekiq_job_id
|
16
|
-
context_data = Thread.current[:sidekiq_context]&.first
|
17
|
-
|
18
|
-
return unless context_data
|
19
|
-
|
20
|
-
index = context_data.index('JID-')
|
21
|
-
|
22
|
-
return unless index
|
23
|
-
|
24
|
-
context_data[index + 4..-1]
|
25
|
-
end
|
26
|
-
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class NoRetryTestWorker
|
4
|
-
include Sidekiq::Worker
|
5
|
-
|
6
|
-
sidekiq_options retry: false
|
7
|
-
|
8
|
-
sidekiq_retry_in do |count, exception|
|
9
|
-
1 # retry in one second
|
10
|
-
end
|
11
|
-
|
12
|
-
def perform
|
13
|
-
sleep 1
|
14
|
-
|
15
|
-
Sidekiq.redis do |redis|
|
16
|
-
redis.incr('times_has_been_run')
|
17
|
-
end
|
18
|
-
|
19
|
-
Process.kill('KILL', Process.pid) # Job suicide, OOM killer imitation
|
20
|
-
end
|
21
|
-
end
|
data/tests/retry_test/worker.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class RetryTestWorker
|
4
|
-
include Sidekiq::Worker
|
5
|
-
|
6
|
-
EXPECTED_NUM_TIMES_BEEN_RUN = 2
|
7
|
-
|
8
|
-
sidekiq_options retry: EXPECTED_NUM_TIMES_BEEN_RUN
|
9
|
-
|
10
|
-
sidekiq_retry_in do |count, exception|
|
11
|
-
1 # retry in one second
|
12
|
-
end
|
13
|
-
|
14
|
-
def perform
|
15
|
-
sleep 1
|
16
|
-
|
17
|
-
Sidekiq.redis do |redis|
|
18
|
-
redis.incr('times_has_been_run')
|
19
|
-
end
|
20
|
-
|
21
|
-
Process.kill('KILL', Process.pid) # Job suicide, OOM killer imitation
|
22
|
-
end
|
23
|
-
end
|