gitlab-sidekiq-fetcher 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: efdc9461358b538f2b0692cc179639b802a6f0bf4959b73e1d0821a4c40f0713
4
- data.tar.gz: ccbe252394f22e6fafb198ddba9481e23776e7cfa2848807e1e5c34a5269c76d
3
+ metadata.gz: c7be23d59956ffa44288a1c870bcca66fd0119682f810325d71a3ebaa8b76e80
4
+ data.tar.gz: 013a7124f61044572ad93335e95c18357c60804dd89024d987485b2d87775787
5
5
  SHA512:
6
- metadata.gz: 010da1750797c367b01cd8e1d8df281fa70759e9035f0ebaea71f53349f3ebbe54bb04dea74694d50bb595938b049733119ffd5ee4c85d5fd42d719e71a07809
7
- data.tar.gz: d9f89a5dc1e6d6117df1f3704159ea79160a038d8a7a47939d1c772f07c4f3be2426389cd63d14a0ecf469035cc583ba3ea943cf5a14c2232ee3cb44bf09b21c
6
+ metadata.gz: 92653bc5f9b5729f4dd50a8243a869c20d9621a1a9d25c46d729e735895e0f2d4d940c5a766803f1a5fd908ab0d9340f27d255dbd99f31bab4923e2f539c1882
7
+ data.tar.gz: d763b8b0ee3c2522752130fac86b83e67e8513faf919dd361aad9896aac684809650959b1627dd38e0440343c063f95b807b67fc31a10217ce1f15c428759803
data/README.md CHANGED
@@ -8,6 +8,8 @@ It's based on https://github.com/TEA-ebook/sidekiq-reliable-fetch.
8
8
 
9
9
  **IMPORTANT NOTE:** Since version `0.7.0` this gem works only with `sidekiq >= 6.1` (which introduced Fetch API breaking changes). Please use version `~> 0.5` if you use older version of the `sidekiq` .
10
10
 
11
+ **UPGRADE NOTE:** If upgrading from 0.7.0, strongly consider a full deployment of 0.7.1 before upgrading to 0.8.0; 0.7.1 fixes a bug in the queue name validation that will be hit if sidekiq nodes running 0.7.0 see working queues named in the 0.8.0 format. See https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/-/merge_requests/22
12
+
11
13
  There are two strategies implemented: [Reliable fetch](http://redis.io/commands/rpoplpush#pattern-reliable-queue) using `rpoplpush` command and
12
14
  semi-reliable fetch that uses regular `brpop` and `lpush` to pick the job and put it to working queue. The main benefit of "Reliable" strategy is that `rpoplpush` is atomic, eliminating a race condition in which jobs can be lost.
13
15
  However, it comes at a cost because `rpoplpush` can't watch multiple lists at the same time so we need to iterate over the entire queue list which significantly increases pressure on Redis when there are more than a few queues. The "semi-reliable" strategy is much more reliable than the default Sidekiq fetcher, though. Compared to the reliable fetch strategy, it does not increase pressure on Redis significantly.
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'gitlab-sidekiq-fetcher'
3
- s.version = '0.7.1'
3
+ s.version = '0.8.0'
4
4
  s.authors = ['TEA', 'GitLab']
5
5
  s.email = 'valery@gitlab.com'
6
6
  s.license = 'LGPL-3.0'
@@ -21,6 +21,10 @@ module Sidekiq
21
21
  # How much time a job can be interrupted
22
22
  DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION = 3
23
23
 
24
+ # Regexes for matching working queue keys
25
+ WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*:[0-9a-f]*)\z/.freeze
26
+ LEGACY_WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*)\z/.freeze
27
+
24
28
  UnitOfWork = Struct.new(:queue, :job) do
25
29
  def acknowledge
26
30
  Sidekiq.redis { |conn| conn.lrem(Sidekiq::BaseReliableFetch.working_queue_name(queue), 1, job) }
@@ -70,32 +74,36 @@ module Sidekiq
70
74
  end
71
75
  end
72
76
 
73
- def self.pid
74
- @pid ||= ::Process.pid
77
+ def self.hostname
78
+ Socket.gethostname
75
79
  end
76
80
 
77
- def self.hostname
78
- @hostname ||= Socket.gethostname
81
+ def self.process_nonce
82
+ @@process_nonce ||= SecureRandom.hex(6)
83
+ end
84
+
85
+ def self.identity
86
+ @@identity ||= "#{hostname}:#{$$}:#{process_nonce}"
79
87
  end
80
88
 
81
89
  def self.heartbeat
82
90
  Sidekiq.redis do |conn|
83
- conn.set(heartbeat_key(hostname, pid), 1, ex: HEARTBEAT_LIFESPAN)
91
+ conn.set(heartbeat_key(identity), 1, ex: HEARTBEAT_LIFESPAN)
84
92
  end
85
93
 
86
- Sidekiq.logger.debug("Heartbeat for hostname: #{hostname} and pid: #{pid}")
94
+ Sidekiq.logger.debug("Heartbeat for #{identity}")
87
95
  end
88
96
 
89
- def self.worker_dead?(hostname, pid, conn)
90
- !conn.get(heartbeat_key(hostname, pid))
97
+ def self.worker_dead?(identity, conn)
98
+ !conn.get(heartbeat_key(identity))
91
99
  end
92
100
 
93
- def self.heartbeat_key(hostname, pid)
94
- "reliable-fetcher-heartbeat-#{hostname}-#{pid}"
101
+ def self.heartbeat_key(identity)
102
+ "reliable-fetcher-heartbeat-#{identity.gsub(':', '-')}"
95
103
  end
96
104
 
97
105
  def self.working_queue_name(queue)
98
- "#{WORKING_QUEUE_PREFIX}:#{queue}:#{hostname}:#{pid}"
106
+ "#{WORKING_QUEUE_PREFIX}:#{queue}:#{identity}"
99
107
  end
100
108
 
101
109
  attr_reader :cleanup_interval, :last_try_to_take_lease_at, :lease_interval,
@@ -166,6 +174,19 @@ module Sidekiq
166
174
  )
167
175
  end
168
176
 
177
+ def extract_queue_and_identity(key)
178
+ # New identity format is "{hostname}:{pid}:{randomhex}"
179
+ # Old identity format is "{hostname}:{pid}"
180
+ # Queue names may also have colons (namespaced).
181
+ # Expressing this in a single regex is unreadable
182
+
183
+ # Test the newer expected format first, only checking the older if necessary
184
+ original_queue, identity = key.scan(WORKING_QUEUE_REGEX).flatten
185
+ return original_queue, identity unless original_queue.nil? || identity.nil?
186
+
187
+ key.scan(LEGACY_WORKING_QUEUE_REGEX).flatten
188
+ end
189
+
169
190
  # Detect "old" jobs and requeue them because the worker they were assigned
170
191
  # to probably failed miserably.
171
192
  def clean_working_queues!
@@ -173,19 +194,16 @@ module Sidekiq
173
194
 
174
195
  Sidekiq.redis do |conn|
175
196
  conn.scan_each(match: "#{WORKING_QUEUE_PREFIX}:queue:*", count: SCAN_COUNT) do |key|
176
- # Example: "working:name_of_the_job:queue:{hostname}:{PID}"
177
- hostname, pid = key.scan(/:([^:]*):([0-9]*)\z/).flatten
197
+ original_queue, identity = extract_queue_and_identity(key)
178
198
 
179
- next if hostname.nil? || pid.nil?
199
+ next if original_queue.nil? || identity.nil?
180
200
 
181
- clean_working_queue!(key) if self.class.worker_dead?(hostname, pid, conn)
201
+ clean_working_queue!(original_queue, key) if self.class.worker_dead?(identity, conn)
182
202
  end
183
203
  end
184
204
  end
185
205
 
186
- def clean_working_queue!(working_queue)
187
- original_queue = working_queue.gsub(/#{WORKING_QUEUE_PREFIX}:|:[^:]*:[0-9]*\z/, '')
188
-
206
+ def clean_working_queue!(original_queue, working_queue)
189
207
  Sidekiq.redis do |conn|
190
208
  while job = conn.rpop(working_queue)
191
209
  preprocess_interrupted_job(job, original_queue)
@@ -88,7 +88,7 @@ describe Sidekiq::BaseReliableFetch do
88
88
  Sidekiq.redis do |conn|
89
89
  sleep 0.2 # Give the time to heartbeat thread to make a loop
90
90
 
91
- heartbeat_key = described_class.heartbeat_key(Socket.gethostname, ::Process.pid)
91
+ heartbeat_key = described_class.heartbeat_key(described_class.identity)
92
92
  heartbeat = conn.get(heartbeat_key)
93
93
 
94
94
  expect(heartbeat).not_to be_nil
@@ -7,111 +7,163 @@ shared_examples 'a Sidekiq fetcher' do
7
7
  let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
8
8
  let(:fetcher) { described_class.new(queues: queues) }
9
9
 
10
- it 'retrieves the job and puts it to working queue' do
11
- Sidekiq.redis { |conn| conn.rpush('queue:assigned', job) }
12
-
13
- uow = fetcher.retrieve_work
10
+ it 'does not clean up orphaned jobs more than once per cleanup interval' do
11
+ Sidekiq.redis = Sidekiq::RedisConnection.create(url: REDIS_URL, size: 10)
14
12
 
15
- expect(working_queue_size('assigned')).to eq 1
16
- expect(uow.queue_name).to eq 'assigned'
17
- expect(uow.job).to eq job
18
- expect(Sidekiq::Queue.new('assigned').size).to eq 0
19
- end
13
+ expect(fetcher).to receive(:clean_working_queues!).once
20
14
 
21
- it 'does not retrieve a job from foreign queue' do
22
- Sidekiq.redis { |conn| conn.rpush('queue:not_assigned', job) }
15
+ threads = 10.times.map do
16
+ Thread.new do
17
+ fetcher.retrieve_work
18
+ end
19
+ end
23
20
 
24
- expect(fetcher.retrieve_work).to be_nil
21
+ threads.map(&:join)
25
22
  end
26
23
 
27
- it 'requeues jobs from dead working queue with incremented interrupted_count' do
24
+ it 'retrieves by order when strictly order is enabled' do
25
+ fetcher = described_class.new(strict: true, queues: ['first', 'second'])
26
+
28
27
  Sidekiq.redis do |conn|
29
- conn.rpush(other_process_working_queue_name('assigned'), job)
28
+ conn.rpush('queue:first', ['msg3', 'msg2', 'msg1'])
29
+ conn.rpush('queue:second', 'msg4')
30
30
  end
31
31
 
32
- expected_job = Sidekiq.load_json(job)
33
- expected_job['interrupted_count'] = 1
34
- expected_job = Sidekiq.dump_json(expected_job)
32
+ jobs = (1..4).map { fetcher.retrieve_work.job }
35
33
 
36
- uow = fetcher.retrieve_work
34
+ expect(jobs).to eq ['msg1', 'msg2', 'msg3', 'msg4']
35
+ end
37
36
 
38
- expect(uow.job).to eq expected_job
37
+ it 'does not starve any queue when queues are not strictly ordered' do
38
+ fetcher = described_class.new(queues: ['first', 'second'])
39
39
 
40
40
  Sidekiq.redis do |conn|
41
- expect(conn.llen(other_process_working_queue_name('assigned'))).to eq 0
41
+ conn.rpush('queue:first', (1..200).map { |i| "msg#{i}" })
42
+ conn.rpush('queue:second', 'this_job_should_not_stuck')
42
43
  end
44
+
45
+ jobs = (1..100).map { fetcher.retrieve_work.job }
46
+
47
+ expect(jobs).to include 'this_job_should_not_stuck'
43
48
  end
44
49
 
45
- it 'ignores working queue keys in unknown formats' do
46
- # Add a spurious non-numeric char segment at the end; this simulates any other
47
- # incorrect form in general
48
- malformed_key = "#{other_process_working_queue_name('assigned')}:X"
49
- Sidekiq.redis do |conn|
50
- conn.rpush(malformed_key, job)
51
- end
50
+ shared_examples "basic queue handling" do |queue|
51
+ let (:fetcher) { described_class.new(queues: [queue]) }
52
52
 
53
- uow = fetcher.retrieve_work
53
+ it 'retrieves the job and puts it to working queue' do
54
+ Sidekiq.redis { |conn| conn.rpush("queue:#{queue}", job) }
54
55
 
55
- Sidekiq.redis do |conn|
56
- expect(conn.llen(malformed_key)).to eq 1
56
+ uow = fetcher.retrieve_work
57
+
58
+ expect(working_queue_size(queue)).to eq 1
59
+ expect(uow.queue_name).to eq queue
60
+ expect(uow.job).to eq job
61
+ expect(Sidekiq::Queue.new(queue).size).to eq 0
57
62
  end
58
- end
59
63
 
64
+ it 'does not retrieve a job from foreign queue' do
65
+ Sidekiq.redis { |conn| conn.rpush("'queue:#{queue}:not", job) }
66
+ expect(fetcher.retrieve_work).to be_nil
60
67
 
61
- it 'does not requeue jobs from live working queue' do
62
- working_queue = live_other_process_working_queue_name('assigned')
68
+ Sidekiq.redis { |conn| conn.rpush("'queue:not_#{queue}", job) }
69
+ expect(fetcher.retrieve_work).to be_nil
63
70
 
64
- Sidekiq.redis do |conn|
65
- conn.rpush(working_queue, job)
71
+ Sidekiq.redis { |conn| conn.rpush("'queue:random_name", job) }
72
+ expect(fetcher.retrieve_work).to be_nil
66
73
  end
67
74
 
68
- uow = fetcher.retrieve_work
75
+ it 'requeues jobs from legacy dead working queue with incremented interrupted_count' do
76
+ Sidekiq.redis do |conn|
77
+ conn.rpush(legacy_other_process_working_queue_name(queue), job)
78
+ end
69
79
 
70
- expect(uow).to be_nil
80
+ expected_job = Sidekiq.load_json(job)
81
+ expected_job['interrupted_count'] = 1
82
+ expected_job = Sidekiq.dump_json(expected_job)
71
83
 
72
- Sidekiq.redis do |conn|
73
- expect(conn.llen(working_queue)).to eq 1
84
+ uow = fetcher.retrieve_work
85
+
86
+ expect(uow).to_not be_nil
87
+ expect(uow.job).to eq expected_job
88
+
89
+ Sidekiq.redis do |conn|
90
+ expect(conn.llen(legacy_other_process_working_queue_name(queue))).to eq 0
91
+ end
74
92
  end
75
- end
76
93
 
77
- it 'does not clean up orphaned jobs more than once per cleanup interval' do
78
- Sidekiq.redis = Sidekiq::RedisConnection.create(url: REDIS_URL, size: 10)
94
+ it 'ignores working queue keys in unknown formats' do
95
+ # Add a spurious non-numeric char segment at the end; this simulates any other
96
+ # incorrect form in general
97
+ malformed_key = "#{other_process_working_queue_name(queue)}:X"
98
+ Sidekiq.redis do |conn|
99
+ conn.rpush(malformed_key, job)
100
+ end
79
101
 
80
- expect(fetcher).to receive(:clean_working_queues!).once
102
+ uow = fetcher.retrieve_work
81
103
 
82
- threads = 10.times.map do
83
- Thread.new do
84
- fetcher.retrieve_work
104
+ Sidekiq.redis do |conn|
105
+ expect(conn.llen(malformed_key)).to eq 1
85
106
  end
86
107
  end
87
108
 
88
- threads.map(&:join)
89
- end
109
+ it 'requeues jobs from dead working queue with incremented interrupted_count' do
110
+ Sidekiq.redis do |conn|
111
+ conn.rpush(other_process_working_queue_name(queue), job)
112
+ end
90
113
 
91
- it 'retrieves by order when strictly order is enabled' do
92
- fetcher = described_class.new(strict: true, queues: ['first', 'second'])
114
+ expected_job = Sidekiq.load_json(job)
115
+ expected_job['interrupted_count'] = 1
116
+ expected_job = Sidekiq.dump_json(expected_job)
93
117
 
94
- Sidekiq.redis do |conn|
95
- conn.rpush('queue:first', ['msg3', 'msg2', 'msg1'])
96
- conn.rpush('queue:second', 'msg4')
118
+ uow = fetcher.retrieve_work
119
+
120
+ expect(uow).to_not be_nil
121
+ expect(uow.job).to eq expected_job
122
+
123
+ Sidekiq.redis do |conn|
124
+ expect(conn.llen(other_process_working_queue_name(queue))).to eq 0
125
+ end
97
126
  end
98
127
 
99
- jobs = (1..4).map { fetcher.retrieve_work.job }
128
+ it 'does not requeue jobs from live working queue' do
129
+ working_queue = live_other_process_working_queue_name(queue)
100
130
 
101
- expect(jobs).to eq ['msg1', 'msg2', 'msg3', 'msg4']
102
- end
131
+ Sidekiq.redis do |conn|
132
+ conn.rpush(working_queue, job)
133
+ end
103
134
 
104
- it 'does not starve any queue when queues are not strictly ordered' do
105
- fetcher = described_class.new(queues: ['first', 'second'])
135
+ uow = fetcher.retrieve_work
106
136
 
107
- Sidekiq.redis do |conn|
108
- conn.rpush('queue:first', (1..200).map { |i| "msg#{i}" })
109
- conn.rpush('queue:second', 'this_job_should_not_stuck')
137
+ expect(uow).to be_nil
138
+
139
+ Sidekiq.redis do |conn|
140
+ expect(conn.llen(working_queue)).to eq 1
141
+ end
110
142
  end
143
+ end
111
144
 
112
- jobs = (1..100).map { fetcher.retrieve_work.job }
145
+ context 'with various queues' do
146
+ %w[assigned namespace:assigned namespace:deeper:assigned].each do |queue|
147
+ it_behaves_like "basic queue handling", queue
148
+ end
149
+ end
113
150
 
114
- expect(jobs).to include 'this_job_should_not_stuck'
151
+ context 'with short cleanup interval' do
152
+ let(:short_interval) { 1 }
153
+ let(:fetcher) { described_class.new(queues: queues, lease_interval: short_interval, cleanup_interval: short_interval) }
154
+
155
+ it 'requeues when there is no heartbeat' do
156
+ Sidekiq.redis { |conn| conn.rpush('queue:assigned', job) }
157
+ # Use of retrieve_work twice with a sleep ensures we have exercised the
158
+ # `identity` method to create the working queue key name and that it
159
+ # matches the patterns used in the cleanup
160
+ uow = fetcher.retrieve_work
161
+ sleep(short_interval + 1)
162
+ uow = fetcher.retrieve_work
163
+
164
+ # Will only receive a UnitOfWork if the job was detected as failed and requeued
165
+ expect(uow).to_not be_nil
166
+ end
115
167
  end
116
168
  end
117
169
  end
@@ -122,17 +174,22 @@ def working_queue_size(queue_name)
122
174
  end
123
175
  end
124
176
 
125
- def other_process_working_queue_name(queue)
177
+ def legacy_other_process_working_queue_name(queue)
126
178
  "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}"
127
179
  end
128
180
 
181
+ def other_process_working_queue_name(queue)
182
+ "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}:#{::SecureRandom.hex(6)}"
183
+ end
184
+
129
185
  def live_other_process_working_queue_name(queue)
130
186
  pid = ::Process.pid + 1
131
187
  hostname = Socket.gethostname
188
+ nonce = SecureRandom.hex(6)
132
189
 
133
190
  Sidekiq.redis do |conn|
134
- conn.set(Sidekiq::BaseReliableFetch.heartbeat_key(hostname, pid), 1)
191
+ conn.set(Sidekiq::BaseReliableFetch.heartbeat_key("#{hostname}-#{pid}-#{nonce}"), 1)
135
192
  end
136
193
 
137
- "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}"
194
+ "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}:#{nonce}"
138
195
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab-sidekiq-fetcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - TEA
8
8
  - GitLab
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-02-18 00:00:00.000000000 Z
12
+ date: 2021-03-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: sidekiq
@@ -63,7 +63,7 @@ homepage: https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/
63
63
  licenses:
64
64
  - LGPL-3.0
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -79,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
79
  version: '0'
80
80
  requirements: []
81
81
  rubygems_version: 3.1.4
82
- signing_key:
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: Reliable fetch extension for Sidekiq
85
85
  test_files: []