gitlab-sidekiq-fetcher 0.7.1 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: efdc9461358b538f2b0692cc179639b802a6f0bf4959b73e1d0821a4c40f0713
4
- data.tar.gz: ccbe252394f22e6fafb198ddba9481e23776e7cfa2848807e1e5c34a5269c76d
3
+ metadata.gz: c7be23d59956ffa44288a1c870bcca66fd0119682f810325d71a3ebaa8b76e80
4
+ data.tar.gz: 013a7124f61044572ad93335e95c18357c60804dd89024d987485b2d87775787
5
5
  SHA512:
6
- metadata.gz: 010da1750797c367b01cd8e1d8df281fa70759e9035f0ebaea71f53349f3ebbe54bb04dea74694d50bb595938b049733119ffd5ee4c85d5fd42d719e71a07809
7
- data.tar.gz: d9f89a5dc1e6d6117df1f3704159ea79160a038d8a7a47939d1c772f07c4f3be2426389cd63d14a0ecf469035cc583ba3ea943cf5a14c2232ee3cb44bf09b21c
6
+ metadata.gz: 92653bc5f9b5729f4dd50a8243a869c20d9621a1a9d25c46d729e735895e0f2d4d940c5a766803f1a5fd908ab0d9340f27d255dbd99f31bab4923e2f539c1882
7
+ data.tar.gz: d763b8b0ee3c2522752130fac86b83e67e8513faf919dd361aad9896aac684809650959b1627dd38e0440343c063f95b807b67fc31a10217ce1f15c428759803
data/README.md CHANGED
@@ -8,6 +8,8 @@ It's based on https://github.com/TEA-ebook/sidekiq-reliable-fetch.
8
8
 
9
9
  **IMPORTANT NOTE:** Since version `0.7.0` this gem works only with `sidekiq >= 6.1` (which introduced Fetch API breaking changes). Please use version `~> 0.5` if you use an older version of `sidekiq`.
10
10
 
11
+ **UPGRADE NOTE:** If upgrading from 0.7.0, strongly consider a fully deployed step on 0.7.1 before moving to 0.8.0; that release fixes a bug in the queue name validation that will be hit if Sidekiq nodes running 0.7.0 see working queues named by 0.8.0. See https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/-/merge_requests/22
12
+
11
13
  There are two strategies implemented: [Reliable fetch](http://redis.io/commands/rpoplpush#pattern-reliable-queue) using `rpoplpush` command and
12
14
  semi-reliable fetch that uses regular `brpop` and `lpush` to pick the job and put it to working queue. The main benefit of "Reliable" strategy is that `rpoplpush` is atomic, eliminating a race condition in which jobs can be lost.
13
15
  However, it comes at a cost because `rpoplpush` can't watch multiple lists at the same time so we need to iterate over the entire queue list which significantly increases pressure on Redis when there are more than a few queues. The "semi-reliable" strategy is much more reliable than the default Sidekiq fetcher, though. Compared to the reliable fetch strategy, it does not increase pressure on Redis significantly.
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'gitlab-sidekiq-fetcher'
3
- s.version = '0.7.1'
3
+ s.version = '0.8.0'
4
4
  s.authors = ['TEA', 'GitLab']
5
5
  s.email = 'valery@gitlab.com'
6
6
  s.license = 'LGPL-3.0'
@@ -21,6 +21,10 @@ module Sidekiq
21
21
  # How much time a job can be interrupted
22
22
  DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION = 3
23
23
 
24
+ # Regexes for matching working queue keys
25
+ WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*:[0-9a-f]*)\z/.freeze
26
+ LEGACY_WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*)\z/.freeze
27
+
24
28
  UnitOfWork = Struct.new(:queue, :job) do
25
29
  def acknowledge
26
30
  Sidekiq.redis { |conn| conn.lrem(Sidekiq::BaseReliableFetch.working_queue_name(queue), 1, job) }
@@ -70,32 +74,36 @@ module Sidekiq
70
74
  end
71
75
  end
72
76
 
73
- def self.pid
74
- @pid ||= ::Process.pid
77
+ def self.hostname
78
+ Socket.gethostname
75
79
  end
76
80
 
77
- def self.hostname
78
- @hostname ||= Socket.gethostname
81
+ def self.process_nonce
82
+ @@process_nonce ||= SecureRandom.hex(6)
83
+ end
84
+
85
+ def self.identity
86
+ @@identity ||= "#{hostname}:#{$$}:#{process_nonce}"
79
87
  end
80
88
 
81
89
  def self.heartbeat
82
90
  Sidekiq.redis do |conn|
83
- conn.set(heartbeat_key(hostname, pid), 1, ex: HEARTBEAT_LIFESPAN)
91
+ conn.set(heartbeat_key(identity), 1, ex: HEARTBEAT_LIFESPAN)
84
92
  end
85
93
 
86
- Sidekiq.logger.debug("Heartbeat for hostname: #{hostname} and pid: #{pid}")
94
+ Sidekiq.logger.debug("Heartbeat for #{identity}")
87
95
  end
88
96
 
89
- def self.worker_dead?(hostname, pid, conn)
90
- !conn.get(heartbeat_key(hostname, pid))
97
+ def self.worker_dead?(identity, conn)
98
+ !conn.get(heartbeat_key(identity))
91
99
  end
92
100
 
93
- def self.heartbeat_key(hostname, pid)
94
- "reliable-fetcher-heartbeat-#{hostname}-#{pid}"
101
+ def self.heartbeat_key(identity)
102
+ "reliable-fetcher-heartbeat-#{identity.gsub(':', '-')}"
95
103
  end
96
104
 
97
105
  def self.working_queue_name(queue)
98
- "#{WORKING_QUEUE_PREFIX}:#{queue}:#{hostname}:#{pid}"
106
+ "#{WORKING_QUEUE_PREFIX}:#{queue}:#{identity}"
99
107
  end
100
108
 
101
109
  attr_reader :cleanup_interval, :last_try_to_take_lease_at, :lease_interval,
@@ -166,6 +174,19 @@ module Sidekiq
166
174
  )
167
175
  end
168
176
 
177
+ def extract_queue_and_identity(key)
178
+ # New identity format is "{hostname}:{pid}:{randomhex}"
179
+ # Old identity format is "{hostname}:{pid}"
180
+ # Queue names may also have colons (namespaced).
181
+ # Expressing this in a single regex is unreadable
182
+
183
+ # Test the newer expected format first, only checking the older if necessary
184
+ original_queue, identity = key.scan(WORKING_QUEUE_REGEX).flatten
185
+ return original_queue, identity unless original_queue.nil? || identity.nil?
186
+
187
+ key.scan(LEGACY_WORKING_QUEUE_REGEX).flatten
188
+ end
189
+
169
190
  # Detect "old" jobs and requeue them because the worker they were assigned
170
191
  # to probably failed miserably.
171
192
  def clean_working_queues!
@@ -173,19 +194,16 @@ module Sidekiq
173
194
 
174
195
  Sidekiq.redis do |conn|
175
196
  conn.scan_each(match: "#{WORKING_QUEUE_PREFIX}:queue:*", count: SCAN_COUNT) do |key|
176
- # Example: "working:name_of_the_job:queue:{hostname}:{PID}"
177
- hostname, pid = key.scan(/:([^:]*):([0-9]*)\z/).flatten
197
+ original_queue, identity = extract_queue_and_identity(key)
178
198
 
179
- next if hostname.nil? || pid.nil?
199
+ next if original_queue.nil? || identity.nil?
180
200
 
181
- clean_working_queue!(key) if self.class.worker_dead?(hostname, pid, conn)
201
+ clean_working_queue!(original_queue, key) if self.class.worker_dead?(identity, conn)
182
202
  end
183
203
  end
184
204
  end
185
205
 
186
- def clean_working_queue!(working_queue)
187
- original_queue = working_queue.gsub(/#{WORKING_QUEUE_PREFIX}:|:[^:]*:[0-9]*\z/, '')
188
-
206
+ def clean_working_queue!(original_queue, working_queue)
189
207
  Sidekiq.redis do |conn|
190
208
  while job = conn.rpop(working_queue)
191
209
  preprocess_interrupted_job(job, original_queue)
@@ -88,7 +88,7 @@ describe Sidekiq::BaseReliableFetch do
88
88
  Sidekiq.redis do |conn|
89
89
  sleep 0.2 # Give the time to heartbeat thread to make a loop
90
90
 
91
- heartbeat_key = described_class.heartbeat_key(Socket.gethostname, ::Process.pid)
91
+ heartbeat_key = described_class.heartbeat_key(described_class.identity)
92
92
  heartbeat = conn.get(heartbeat_key)
93
93
 
94
94
  expect(heartbeat).not_to be_nil
@@ -7,111 +7,163 @@ shared_examples 'a Sidekiq fetcher' do
7
7
  let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
8
8
  let(:fetcher) { described_class.new(queues: queues) }
9
9
 
10
- it 'retrieves the job and puts it to working queue' do
11
- Sidekiq.redis { |conn| conn.rpush('queue:assigned', job) }
12
-
13
- uow = fetcher.retrieve_work
10
+ it 'does not clean up orphaned jobs more than once per cleanup interval' do
11
+ Sidekiq.redis = Sidekiq::RedisConnection.create(url: REDIS_URL, size: 10)
14
12
 
15
- expect(working_queue_size('assigned')).to eq 1
16
- expect(uow.queue_name).to eq 'assigned'
17
- expect(uow.job).to eq job
18
- expect(Sidekiq::Queue.new('assigned').size).to eq 0
19
- end
13
+ expect(fetcher).to receive(:clean_working_queues!).once
20
14
 
21
- it 'does not retrieve a job from foreign queue' do
22
- Sidekiq.redis { |conn| conn.rpush('queue:not_assigned', job) }
15
+ threads = 10.times.map do
16
+ Thread.new do
17
+ fetcher.retrieve_work
18
+ end
19
+ end
23
20
 
24
- expect(fetcher.retrieve_work).to be_nil
21
+ threads.map(&:join)
25
22
  end
26
23
 
27
- it 'requeues jobs from dead working queue with incremented interrupted_count' do
24
+ it 'retrieves by order when strictly order is enabled' do
25
+ fetcher = described_class.new(strict: true, queues: ['first', 'second'])
26
+
28
27
  Sidekiq.redis do |conn|
29
- conn.rpush(other_process_working_queue_name('assigned'), job)
28
+ conn.rpush('queue:first', ['msg3', 'msg2', 'msg1'])
29
+ conn.rpush('queue:second', 'msg4')
30
30
  end
31
31
 
32
- expected_job = Sidekiq.load_json(job)
33
- expected_job['interrupted_count'] = 1
34
- expected_job = Sidekiq.dump_json(expected_job)
32
+ jobs = (1..4).map { fetcher.retrieve_work.job }
35
33
 
36
- uow = fetcher.retrieve_work
34
+ expect(jobs).to eq ['msg1', 'msg2', 'msg3', 'msg4']
35
+ end
37
36
 
38
- expect(uow.job).to eq expected_job
37
+ it 'does not starve any queue when queues are not strictly ordered' do
38
+ fetcher = described_class.new(queues: ['first', 'second'])
39
39
 
40
40
  Sidekiq.redis do |conn|
41
- expect(conn.llen(other_process_working_queue_name('assigned'))).to eq 0
41
+ conn.rpush('queue:first', (1..200).map { |i| "msg#{i}" })
42
+ conn.rpush('queue:second', 'this_job_should_not_stuck')
42
43
  end
44
+
45
+ jobs = (1..100).map { fetcher.retrieve_work.job }
46
+
47
+ expect(jobs).to include 'this_job_should_not_stuck'
43
48
  end
44
49
 
45
- it 'ignores working queue keys in unknown formats' do
46
- # Add a spurious non-numeric char segment at the end; this simulates any other
47
- # incorrect form in general
48
- malformed_key = "#{other_process_working_queue_name('assigned')}:X"
49
- Sidekiq.redis do |conn|
50
- conn.rpush(malformed_key, job)
51
- end
50
+ shared_examples "basic queue handling" do |queue|
51
+ let (:fetcher) { described_class.new(queues: [queue]) }
52
52
 
53
- uow = fetcher.retrieve_work
53
+ it 'retrieves the job and puts it to working queue' do
54
+ Sidekiq.redis { |conn| conn.rpush("queue:#{queue}", job) }
54
55
 
55
- Sidekiq.redis do |conn|
56
- expect(conn.llen(malformed_key)).to eq 1
56
+ uow = fetcher.retrieve_work
57
+
58
+ expect(working_queue_size(queue)).to eq 1
59
+ expect(uow.queue_name).to eq queue
60
+ expect(uow.job).to eq job
61
+ expect(Sidekiq::Queue.new(queue).size).to eq 0
57
62
  end
58
- end
59
63
 
64
+ it 'does not retrieve a job from foreign queue' do
65
+ Sidekiq.redis { |conn| conn.rpush("'queue:#{queue}:not", job) }
66
+ expect(fetcher.retrieve_work).to be_nil
60
67
 
61
- it 'does not requeue jobs from live working queue' do
62
- working_queue = live_other_process_working_queue_name('assigned')
68
+ Sidekiq.redis { |conn| conn.rpush("'queue:not_#{queue}", job) }
69
+ expect(fetcher.retrieve_work).to be_nil
63
70
 
64
- Sidekiq.redis do |conn|
65
- conn.rpush(working_queue, job)
71
+ Sidekiq.redis { |conn| conn.rpush("'queue:random_name", job) }
72
+ expect(fetcher.retrieve_work).to be_nil
66
73
  end
67
74
 
68
- uow = fetcher.retrieve_work
75
+ it 'requeues jobs from legacy dead working queue with incremented interrupted_count' do
76
+ Sidekiq.redis do |conn|
77
+ conn.rpush(legacy_other_process_working_queue_name(queue), job)
78
+ end
69
79
 
70
- expect(uow).to be_nil
80
+ expected_job = Sidekiq.load_json(job)
81
+ expected_job['interrupted_count'] = 1
82
+ expected_job = Sidekiq.dump_json(expected_job)
71
83
 
72
- Sidekiq.redis do |conn|
73
- expect(conn.llen(working_queue)).to eq 1
84
+ uow = fetcher.retrieve_work
85
+
86
+ expect(uow).to_not be_nil
87
+ expect(uow.job).to eq expected_job
88
+
89
+ Sidekiq.redis do |conn|
90
+ expect(conn.llen(legacy_other_process_working_queue_name(queue))).to eq 0
91
+ end
74
92
  end
75
- end
76
93
 
77
- it 'does not clean up orphaned jobs more than once per cleanup interval' do
78
- Sidekiq.redis = Sidekiq::RedisConnection.create(url: REDIS_URL, size: 10)
94
+ it 'ignores working queue keys in unknown formats' do
95
+ # Add a spurious non-numeric char segment at the end; this simulates any other
96
+ # incorrect form in general
97
+ malformed_key = "#{other_process_working_queue_name(queue)}:X"
98
+ Sidekiq.redis do |conn|
99
+ conn.rpush(malformed_key, job)
100
+ end
79
101
 
80
- expect(fetcher).to receive(:clean_working_queues!).once
102
+ uow = fetcher.retrieve_work
81
103
 
82
- threads = 10.times.map do
83
- Thread.new do
84
- fetcher.retrieve_work
104
+ Sidekiq.redis do |conn|
105
+ expect(conn.llen(malformed_key)).to eq 1
85
106
  end
86
107
  end
87
108
 
88
- threads.map(&:join)
89
- end
109
+ it 'requeues jobs from dead working queue with incremented interrupted_count' do
110
+ Sidekiq.redis do |conn|
111
+ conn.rpush(other_process_working_queue_name(queue), job)
112
+ end
90
113
 
91
- it 'retrieves by order when strictly order is enabled' do
92
- fetcher = described_class.new(strict: true, queues: ['first', 'second'])
114
+ expected_job = Sidekiq.load_json(job)
115
+ expected_job['interrupted_count'] = 1
116
+ expected_job = Sidekiq.dump_json(expected_job)
93
117
 
94
- Sidekiq.redis do |conn|
95
- conn.rpush('queue:first', ['msg3', 'msg2', 'msg1'])
96
- conn.rpush('queue:second', 'msg4')
118
+ uow = fetcher.retrieve_work
119
+
120
+ expect(uow).to_not be_nil
121
+ expect(uow.job).to eq expected_job
122
+
123
+ Sidekiq.redis do |conn|
124
+ expect(conn.llen(other_process_working_queue_name(queue))).to eq 0
125
+ end
97
126
  end
98
127
 
99
- jobs = (1..4).map { fetcher.retrieve_work.job }
128
+ it 'does not requeue jobs from live working queue' do
129
+ working_queue = live_other_process_working_queue_name(queue)
100
130
 
101
- expect(jobs).to eq ['msg1', 'msg2', 'msg3', 'msg4']
102
- end
131
+ Sidekiq.redis do |conn|
132
+ conn.rpush(working_queue, job)
133
+ end
103
134
 
104
- it 'does not starve any queue when queues are not strictly ordered' do
105
- fetcher = described_class.new(queues: ['first', 'second'])
135
+ uow = fetcher.retrieve_work
106
136
 
107
- Sidekiq.redis do |conn|
108
- conn.rpush('queue:first', (1..200).map { |i| "msg#{i}" })
109
- conn.rpush('queue:second', 'this_job_should_not_stuck')
137
+ expect(uow).to be_nil
138
+
139
+ Sidekiq.redis do |conn|
140
+ expect(conn.llen(working_queue)).to eq 1
141
+ end
110
142
  end
143
+ end
111
144
 
112
- jobs = (1..100).map { fetcher.retrieve_work.job }
145
+ context 'with various queues' do
146
+ %w[assigned namespace:assigned namespace:deeper:assigned].each do |queue|
147
+ it_behaves_like "basic queue handling", queue
148
+ end
149
+ end
113
150
 
114
- expect(jobs).to include 'this_job_should_not_stuck'
151
+ context 'with short cleanup interval' do
152
+ let(:short_interval) { 1 }
153
+ let(:fetcher) { described_class.new(queues: queues, lease_interval: short_interval, cleanup_interval: short_interval) }
154
+
155
+ it 'requeues when there is no heartbeat' do
156
+ Sidekiq.redis { |conn| conn.rpush('queue:assigned', job) }
157
+ # Use of retrieve_work twice with a sleep ensures we have exercised the
158
+ # `identity` method to create the working queue key name and that it
159
+ # matches the patterns used in the cleanup
160
+ uow = fetcher.retrieve_work
161
+ sleep(short_interval + 1)
162
+ uow = fetcher.retrieve_work
163
+
164
+ # Will only receive a UnitOfWork if the job was detected as failed and requeued
165
+ expect(uow).to_not be_nil
166
+ end
115
167
  end
116
168
  end
117
169
  end
@@ -122,17 +174,22 @@ def working_queue_size(queue_name)
122
174
  end
123
175
  end
124
176
 
125
- def other_process_working_queue_name(queue)
177
+ def legacy_other_process_working_queue_name(queue)
126
178
  "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}"
127
179
  end
128
180
 
181
+ def other_process_working_queue_name(queue)
182
+ "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}:#{::SecureRandom.hex(6)}"
183
+ end
184
+
129
185
  def live_other_process_working_queue_name(queue)
130
186
  pid = ::Process.pid + 1
131
187
  hostname = Socket.gethostname
188
+ nonce = SecureRandom.hex(6)
132
189
 
133
190
  Sidekiq.redis do |conn|
134
- conn.set(Sidekiq::BaseReliableFetch.heartbeat_key(hostname, pid), 1)
191
+ conn.set(Sidekiq::BaseReliableFetch.heartbeat_key("#{hostname}-#{pid}-#{nonce}"), 1)
135
192
  end
136
193
 
137
- "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}"
194
+ "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}:#{nonce}"
138
195
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab-sidekiq-fetcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - TEA
8
8
  - GitLab
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-02-18 00:00:00.000000000 Z
12
+ date: 2021-03-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: sidekiq
@@ -63,7 +63,7 @@ homepage: https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/
63
63
  licenses:
64
64
  - LGPL-3.0
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -79,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
79
  version: '0'
80
80
  requirements: []
81
81
  rubygems_version: 3.1.4
82
- signing_key:
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: Reliable fetch extension for Sidekiq
85
85
  test_files: []