karafka 2.0.0.beta4 → 2.0.0.beta5
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.github/workflows/ci.yml +18 -1
- data/CHANGELOG.md +15 -0
- data/Gemfile.lock +1 -1
- data/bin/benchmarks +2 -2
- data/bin/integrations +10 -3
- data/bin/{stress → stress_many} +0 -0
- data/bin/stress_one +13 -0
- data/docker-compose.yml +23 -18
- data/lib/karafka/active_job/routing/extensions.rb +1 -1
- data/lib/karafka/app.rb +2 -1
- data/lib/karafka/base_consumer.rb +26 -19
- data/lib/karafka/connection/client.rb +24 -4
- data/lib/karafka/connection/listener.rb +49 -11
- data/lib/karafka/connection/pauses_manager.rb +8 -0
- data/lib/karafka/connection/rebalance_manager.rb +20 -19
- data/lib/karafka/contracts/config.rb +17 -4
- data/lib/karafka/contracts/server_cli_options.rb +1 -1
- data/lib/karafka/errors.rb +3 -0
- data/lib/karafka/pro/active_job/consumer.rb +1 -8
- data/lib/karafka/pro/base_consumer.rb +10 -13
- data/lib/karafka/pro/loader.rb +11 -6
- data/lib/karafka/pro/processing/coordinator.rb +12 -0
- data/lib/karafka/pro/processing/jobs_builder.rb +3 -2
- data/lib/karafka/pro/processing/scheduler.rb +56 -0
- data/lib/karafka/processing/coordinator.rb +84 -0
- data/lib/karafka/processing/coordinators_buffer.rb +58 -0
- data/lib/karafka/processing/executor.rb +6 -16
- data/lib/karafka/processing/executors_buffer.rb +46 -15
- data/lib/karafka/processing/jobs/consume.rb +4 -2
- data/lib/karafka/processing/jobs_builder.rb +3 -2
- data/lib/karafka/processing/result.rb +0 -5
- data/lib/karafka/processing/scheduler.rb +22 -0
- data/lib/karafka/routing/consumer_group.rb +1 -1
- data/lib/karafka/routing/topic.rb +9 -0
- data/lib/karafka/setup/config.rb +18 -10
- data/lib/karafka/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +9 -5
- metadata.gz.sig +4 -1
- data/lib/karafka/pro/scheduler.rb +0 -54
- data/lib/karafka/scheduler.rb +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2c8e680ffdf69f88899a715c84cc484e8f568f4a93da9284195f4bf55a283ee1
+  data.tar.gz: 974356226a10ba2c77de770351a47180716533021a89040bcdc1aae57f452121
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2427aaae1b1b07430df7c9f042d290bbae8380fb1f6ec7c26eecee92b8fe79e13ea9f3a99a36bf89b314ffba809c556618b22c0a87f0c0c83bb73cf8af72321b
+  data.tar.gz: 55e18448b5645acd38c4194967ea7df657c142d82a105699f7b204f222f8dfb2dbd14cce82b1f424ec177afb78049b3e7588642013674a3c2923a8848b6b87e7
checksums.yaml.gz.sig
CHANGED
Binary file
data/.github/workflows/ci.yml
CHANGED
@@ -8,6 +8,10 @@ on:
   schedule:
     - cron: '0 1 * * *'

+env:
+  BUNDLE_RETRY: 6
+  BUNDLE_JOBS: 4
+
 jobs:
   diffend:
     runs-on: ubuntu-latest
@@ -17,13 +21,16 @@ jobs:
       - uses: actions/checkout@v2
         with:
          fetch-depth: 0
+
      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.1
          bundler-cache: true
+
      - name: Install Diffend plugin
        run: bundle plugin install diffend
+
      - name: Bundle Secure
        run: bundle secure

@@ -101,7 +108,17 @@ jobs:
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: ${{matrix.ruby}}
-
+
+      - name: Install latest Bundler
+        run: |
+          gem install bundler --no-document
+          gem update --system --no-document
+          bundle config set without 'tools benchmarks docs'
+
+      - name: Bundle install
+        run: |
+          bundle config set without development
+          bundle install

      - name: Ensure all needed Kafka topics are created and wait if not
        run: |
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,20 @@
 # Karafka framework changelog

+## 2.0.0-beta5 (2022-07-05)
+- Always resume processing of a revoked partition upon assignment.
+- Improve specs stability.
+- Fix a case where revocation job would be executed on partition for which we never did any work.
+- Introduce a jobs group coordinator for easier jobs management.
+- Improve stability of resuming paused partitions that were revoked and re-assigned.
+- Optimize reaction time on partition ownership changes.
+- Fix a bug where despite setting long max wait time, we would return messages prior to it while not reaching the desired max messages count.
+- Add more integration specs related to polling limits.
+- Remove auto-detection of re-assigned partitions upon rebalance as for too fast rebalances it could not be accurate enough. It would also mess up in case of rebalances that would happen right after a `#seek` was issued for a partition.
+- Optimize the removal of pre-buffered lost partitions data.
+- Always run `#revoked` when rebalance with revocation happens.
+- Evict executors upon rebalance, to prevent race-conditions.
+- Align topics names for integration specs.
+
 ## 2.0.0-beta4 (2022-06-20)
 - Rename job internal api methods from `#prepare` to `#before_call` and from `#teardown` to `#after_call` to abstract away jobs execution from any type of executors and consumers logic
 - Remove ability of running `before_consume` and `after_consume` completely. Those should be for internal usage only.
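Most of the beta5 entries above revolve around the new per topic partition jobs coordinator that the diffs below wire in. As a rough orientation only, the sketch below illustrates the counting idea behind such a coordinator; the method names (increment, decrement, revoke, success?) echo the calls visible in the diffs, but this is not the actual Karafka::Processing::Coordinator implementation.

# Minimal, illustrative job-counting coordinator (a sketch, not the Karafka class).
class MiniCoordinator
  def initialize
    @mutex = Mutex.new
    @running_jobs = 0
    @failure = false
    @revoked = false
  end

  # Called once per job before it is pushed to the workers queue
  def increment
    @mutex.synchronize { @running_jobs += 1 }
  end

  # Called from an ensure block when a job finishes (success or failure)
  def decrement
    @mutex.synchronize { @running_jobs -= 1 }
  end

  def failure!
    @mutex.synchronize { @failure = true }
  end

  def revoke
    @mutex.synchronize { @revoked = true }
  end

  def revoked?
    @revoked
  end

  # The batch is successful only when nothing failed and all counted jobs finished
  def success?
    @mutex.synchronize { !@failure && @running_jobs.zero? }
  end
end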
data/Gemfile.lock
CHANGED
data/bin/benchmarks
CHANGED
@@ -39,8 +39,8 @@ if ENV['SEED']

   # We do not populate data of benchmarks_0_10 as we use it with life-stream data only
   %w[
-
-
+    benchmarks_00_01
+    benchmarks_00_05
   ].each do |topic_name|
     partitions_count = topic_name.split('_').last.to_i

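The benchmark topics follow the same naming convention as the integration topics: the numeric suffix carries the partition count, which is why the script above can derive it straight from the name. A tiny standalone illustration:

# The last underscore-separated segment of a topic name encodes its partition count,
# e.g. "benchmarks_00_05" => 5 partitions, "benchmarks_00_10" => 10 partitions.
%w[benchmarks_00_01 benchmarks_00_05 benchmarks_00_10].each do |topic_name|
  partitions_count = topic_name.split('_').last.to_i
  puts "#{topic_name} => #{partitions_count} partitions"
end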
data/bin/integrations
CHANGED
@@ -21,6 +21,9 @@ ROOT_PATH = Pathname.new(File.expand_path(File.join(File.dirname(__FILE__), '../
 # of CPU
 CONCURRENCY = ENV.key?('CI') ? 5 : Etc.nprocessors * 2

+# How may bytes do we want to keep from the stdout in the buffer for when we need to print it
+MAX_BUFFER_OUTPUT = 10_240
+
 # Abstraction around a single test scenario execution process
 class Scenario
   # How long a scenario can run before we kill it
@@ -84,9 +87,9 @@ class Scenario
     # We read it so it won't grow as we use our default logger that prints to both test.log and
     # to stdout. Otherwise after reaching the buffer size, it would hang
     buffer = ''
-    @stdout.read_nonblock(
+    @stdout.read_nonblock(MAX_BUFFER_OUTPUT, buffer, exception: false)
     @stdout_tail << buffer
-    @stdout_tail = @stdout_tail[-
+    @stdout_tail = @stdout_tail[-MAX_BUFFER_OUTPUT..-1] || @stdout_tail

     !@wait_thr.alive?
   end
@@ -114,11 +117,15 @@ class Scenario
     if success?
       print "\e[#{32}m#{'.'}\e[0m"
     else
+      buffer = ''
+
+      @stderr.read_nonblock(MAX_BUFFER_OUTPUT, buffer, exception: false)
+
       puts
       puts "\e[#{31}m#{'[FAILED]'}\e[0m #{name}"
       puts "Exit code: #{exit_code}"
       puts @stdout_tail
-      puts
+      puts buffer
       puts
     end
   end
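Besides introducing MAX_BUFFER_OUTPUT, the change drains stderr the same way stdout is drained, so a failed scenario can print its error output. The underlying pattern is a non-blocking read that keeps only the most recent bytes, which prevents a chatty child process from filling the pipe and hanging. A rough standalone sketch of that pattern (the spawned command here is made up for the demo):

require 'open3'

MAX_BUFFER_OUTPUT = 10_240

# Spawn a deliberately chatty child process (hypothetical command, just for the demo)
stdin, stdout, wait_thr = Open3.popen2('ruby -e "500.times { puts :x.to_s * 80 }"')
stdin.close

stdout_tail = +''

loop do
  chunk = +''
  # With exception: false, read_nonblock returns :wait_readable instead of raising
  # when nothing is available yet, and nil on EOF
  result = stdout.read_nonblock(MAX_BUFFER_OUTPUT, chunk, exception: false)

  if result.is_a?(String)
    stdout_tail << chunk
    # Keep only the last MAX_BUFFER_OUTPUT bytes of output
    stdout_tail = stdout_tail[-MAX_BUFFER_OUTPUT..-1] || stdout_tail
  elsif result.nil?
    break # EOF - the child closed its stdout
  else
    break unless wait_thr.alive?

    sleep(0.1)
  end
end

puts "kept #{stdout_tail.bytesize} bytes of output"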
data/bin/{stress → stress_many}
RENAMED
File without changes
data/bin/stress_one
ADDED
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Runs a single integration spec in an endless loop
+# This allows us to ensure (after long enough time) that the integration spec is stable and
+# that there are no anomalies when running it for a long period of time
+
+set -e
+
+while :
+do
+  reset
+  bin/scenario $1
+done
data/docker-compose.yml
CHANGED
@@ -16,26 +16,31 @@ services:
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
      KAFKA_CREATE_TOPICS:
-        "
-
-
-
-
-
-
-
-
-
+        "integrations_00_02:2:1,\
+        integrations_01_02:2:1,\
+        integrations_02_02:2:1,\
+        integrations_03_02:2:1,\
+        integrations_04_02:2:1,\
+        integrations_05_02:2:1,\
+        integrations_06_02:2:1,\
+        integrations_07_02:2:1,\
+        integrations_08_02:2:1,\
+        integrations_09_02:2:1,\
        integrations_10_02:2:1,\
        integrations_11_02:2:1,\
        integrations_12_02:2:1,\
-
-
-
-
-
-
-
-
+        integrations_13_02:2:1,\
+        integrations_14_02:2:1,\
+        integrations_15_02:2:1,\
+        integrations_16_02:2:1,\
+        integrations_00_03:3:1,\
+        integrations_01_03:3:1,\
+        integrations_02_03:3:1,\
+        integrations_03_03:3:1,\
+        integrations_00_10:10:1,\
+        integrations_01_10:10:1,\
+        benchmarks_00_01:1:1,\
+        benchmarks_00_05:5:1,\
+        benchmarks_00_10:10:1"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
data/lib/karafka/active_job/routing/extensions.rb
CHANGED
@@ -13,7 +13,7 @@ module Karafka
        # @param block [Proc] block that we can use for some extra configuration
        def active_job_topic(name, &block)
          topic(name) do
-            consumer App.config.internal.active_job.
+            consumer App.config.internal.active_job.consumer_class

            next unless block

data/lib/karafka/app.rb
CHANGED
data/lib/karafka/base_consumer.rb
CHANGED
@@ -10,17 +10,11 @@ module Karafka
    attr_accessor :messages
    # @return [Karafka::Connection::Client] kafka connection client
    attr_accessor :client
-    # @return [Karafka::
-    attr_accessor :
+    # @return [Karafka::Processing::Coordinator] coordinator
+    attr_accessor :coordinator
    # @return [Waterdrop::Producer] producer instance
    attr_accessor :producer

-    def initialize
-      # We re-use one to save on object allocation
-      # It also allows us to transfer the consumption notion to another batch
-      @consumption = Processing::Result.new
-    end
-
    # Can be used to run preparation code
    #
    # @private
@@ -41,9 +35,9 @@ module Karafka
        consume
      end

-      @consumption.success!
+      @coordinator.consumption(self).success!
    rescue StandardError => e
-      @consumption.failure!
+      @coordinator.consumption(self).failure!

      Karafka.monitor.instrument(
        'error.occurred',
@@ -51,14 +45,19 @@ module Karafka
        caller: self,
        type: 'consumer.consume.error'
      )
+    ensure
+      # We need to decrease number of jobs that this coordinator coordinates as it has finished
+      @coordinator.decrement
    end

    # @private
    # @note This should not be used by the end users as it is part of the lifecycle of things but
    #   not as part of the public api.
    def on_after_consume
-      if
-
+      return if revoked?
+
+      if @coordinator.success?
+        coordinator.pause_tracker.reset

        # Mark as consumed only if manual offset management is not on
        return if topic.manual_offset_management?
@@ -75,6 +74,10 @@ module Karafka
    #
    # @private
    def on_revoked
+      coordinator.revoke
+
+      resume
+
      Karafka.monitor.instrument('consumer.revoked', caller: self) do
        revoked
      end
@@ -132,9 +135,11 @@ module Karafka
    #   processed but rather at the next one. This applies to both sync and async versions of this
    #   method.
    def mark_as_consumed(message)
-
+      unless client.mark_as_consumed(message)
+        coordinator.revoke

-
+        return false
+      end

      @seek_offset = message.offset + 1

@@ -147,9 +152,11 @@ module Karafka
    # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
    #   that we were not able and that we have lost the partition.
    def mark_as_consumed!(message)
-
+      unless client.mark_as_consumed!(message)
+        coordinator.revoke

-
+        return false
+      end

      @seek_offset = message.offset + 1

@@ -163,7 +170,7 @@ module Karafka
    # @param timeout [Integer, nil] how long in milliseconds do we want to pause or nil to use the
    #   default exponential pausing strategy defined for retries
    def pause(offset, timeout = nil)
-      timeout ? pause_tracker.pause(timeout) : pause_tracker.pause
+      timeout ? coordinator.pause_tracker.pause(timeout) : coordinator.pause_tracker.pause

      client.pause(
        messages.metadata.topic,
@@ -176,7 +183,7 @@ module Karafka
    def resume
      # This is sufficient to expire a partition pause, as with it will be resumed by the listener
      # thread before the next poll.
-      pause_tracker.expire
+      coordinator.pause_tracker.expire
    end

    # Seeks in the context of current topic and partition
@@ -196,7 +203,7 @@ module Karafka
    # @note We know that partition got revoked because when we try to mark message as consumed,
    #   unless if is successful, it will return false
    def revoked?
-
+      coordinator.revoked?
    end
  end
end
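A practical consequence of the changes above: #mark_as_consumed and #mark_as_consumed! now report a lost partition by flagging the coordinator as revoked and returning false instead of raising. A consumer that commits per message can use that return value to stop early. This is an illustrative usage sketch only; the EventsConsumer class and its persist! helper are made up, not part of the diff.

# Illustrative consumer, not part of this changeset
class EventsConsumer < Karafka::BaseConsumer
  def consume
    messages.each do |message|
      persist!(message.payload)

      # false means the partition was lost during a rebalance; the coordinator is
      # already flagged as revoked, so we simply stop working on this batch
      return unless mark_as_consumed(message)
    end
  end

  private

  # Stand-in for whatever the application actually does with a message
  def persist!(payload)
    Karafka.logger.info("stored: #{payload.inspect}")
  end
end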
data/lib/karafka/connection/client.rb
CHANGED
@@ -36,6 +36,12 @@ module Karafka
        # Marks if we need to offset. If we did not store offsets, we should not commit the offset
        # position as it will crash rdkafka
        @offsetting = false
+        # We need to keep track of what we have paused for resuming
+        # In case we loose partition, we still need to resume it, otherwise it won't be fetched
+        # again if we get reassigned to it later on. We need to keep them as after revocation we
+        # no longer may be able to fetch them from Kafka. We could build them but it is easier
+        # to just keep them here and use if needed when cannot be obtained
+        @paused_tpls = Hash.new { |h, k| h[k] = {} }
      end

      # Fetches messages within boundaries defined by the settings (time, size, topics, etc).
@@ -45,12 +51,13 @@ module Karafka
      # @note This method should not be executed from many threads at the same time
      def batch_poll
        time_poll = TimeTrackers::Poll.new(@subscription_group.max_wait_time)
-        time_poll.start

        @buffer.clear
        @rebalance_manager.clear

        loop do
+          time_poll.start
+
          # Don't fetch more messages if we do not have any time left
          break if time_poll.exceeded?
          # Don't fetch more messages if we've fetched max as we've wanted
@@ -69,7 +76,11 @@ module Karafka
          # If partition revocation happens, we need to remove messages from revoked partitions
          # as well as ensure we do not have duplicated due to the offset reset for partitions
          # that we got assigned
-
+          # We also do early break, so the information about rebalance is used as soon as possible
+          if @rebalance_manager.changed?
+            remove_revoked_and_duplicated_messages
+            break
+          end

          # Finally once we've (potentially) removed revoked, etc, if no messages were returned
          # we can break.
@@ -144,10 +155,14 @@ module Karafka

        internal_commit_offsets(async: false)

+        # Here we do not use our cached tpls because we should not try to pause something we do
+        # not own anymore.
        tpl = topic_partition_list(topic, partition)

        return unless tpl

+        @paused_tpls[topic][partition] = tpl
+
        @kafka.pause(tpl)

        @kafka.seek(pause_msg)
@@ -169,9 +184,13 @@ module Karafka
        # We can skip performance penalty since resuming should not happen too often
        internal_commit_offsets(async: false)

-
+        # If we were not able, let's try to reuse the one we have (if we have)
+        tpl = topic_partition_list(topic, partition) || @paused_tpls[topic][partition]

        return unless tpl
+        # If we did not have it, it means we never paused this partition, thus no resume should
+        # happen in the first place
+        return unless @paused_tpls[topic].delete(partition)

        @kafka.resume(tpl)
      ensure
@@ -214,6 +233,7 @@ module Karafka
        @mutex.synchronize do
          @closed = false
          @offsetting = false
+          @paused_tpls.clear
          @kafka = build_consumer
        end
      end
@@ -369,7 +389,7 @@ module Karafka
      # we are no longer responsible in a given process for processing those messages and they
      # should have been picked up by a different process.
      def remove_revoked_and_duplicated_messages
-        @rebalance_manager.
+        @rebalance_manager.lost_partitions.each do |topic, partitions|
          partitions.each do |partition|
            @buffer.delete(topic, partition)
          end
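The new @paused_tpls cache exists because, after a revocation, the client may no longer be able to rebuild the topic partition list for something it paused, yet that list is still required to resume the partition if it gets re-assigned. Stripped of rdkafka, the bookkeeping is just an auto-vivifying nested hash; a minimal sketch:

# Auto-vivifying nested hash: first access to a topic creates its partitions hash
paused_tpls = Hash.new { |hash, topic| hash[topic] = {} }

# On pause we remember whatever list object we paused with (in Karafka this is an
# Rdkafka::Consumer::TopicPartitionList; a plain hash stands in for it here)
paused_tpls['events'][0] = { 'events' => [0] }

# On resume we prefer a freshly built list and fall back to the cached one
fresh_tpl = nil # pretend it could not be rebuilt after the rebalance
tpl = fresh_tpl || paused_tpls['events'][0]

# Resuming also evicts the cache entry; for a partition we never paused,
# delete returns nil and the resume is skipped entirely
puts 'resuming events/0' if tpl && paused_tpls['events'].delete(0)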
data/lib/karafka/connection/listener.rb
CHANGED
@@ -21,12 +21,12 @@ module Karafka
        @id = SecureRandom.uuid
        @subscription_group = subscription_group
        @jobs_queue = jobs_queue
-        @jobs_builder = ::Karafka::App.config.internal.jobs_builder
-        @
+        @jobs_builder = ::Karafka::App.config.internal.processing.jobs_builder
+        @coordinators = Processing::CoordinatorsBuffer.new
        @client = Client.new(@subscription_group)
        @executors = Processing::ExecutorsBuffer.new(@client, subscription_group)
        # We reference scheduler here as it is much faster than fetching this each time
-        @scheduler = ::Karafka::App.config.internal.scheduler
+        @scheduler = ::Karafka::App.config.internal.processing.scheduler
        # We keep one buffer for messages to preserve memory and not allocate extra objects
        # We can do this that way because we always first schedule jobs using messages before we
        # fetch another batch.
@@ -79,6 +79,10 @@ module Karafka
          poll_and_remap_messages
        end

+        # This will ensure, that in the next poll, we continue processing (if we get them back)
+        # partitions that we have paused
+        resume_assigned_partitions
+
        # If there were revoked partitions, we need to wait on their jobs to finish before
        # distributing consuming jobs as upon revoking, we might get assigned to the same
        # partitions, thus getting their jobs. The revoking jobs need to finish before
@@ -86,6 +90,9 @@ module Karafka
        build_and_schedule_revoke_lost_partitions_jobs

        # We wait only on jobs from our subscription group. Other groups are independent.
+        # This will block on revoked jobs until they are finished. Those are not meant to last
+        # long and should not have any bigger impact on the system. Doing this in a blocking way
+        # simplifies the overall design and prevents from race conditions
        wait

        build_and_schedule_consumption_jobs
@@ -136,7 +143,7 @@ module Karafka

      # Resumes processing of partitions that were paused due to an error.
      def resume_paused_partitions
-        @
+        @coordinators.resume do |topic, partition|
          @client.resume(topic, partition)
        end
      end
@@ -152,9 +159,23 @@ module Karafka

        revoked_partitions.each do |topic, partitions|
          partitions.each do |partition|
-
-
-
+            # We revoke the coordinator here, so we do not have to revoke it in the revoke job
+            # itself (this happens prior to scheduling those jobs)
+            @coordinators.revoke(topic, partition)
+
+            # There may be a case where we have lost partition of which data we have never
+            # processed (if it was assigned and revoked really fast), thus we may not have it
+            # here. In cases like this, we do not run a revocation job
+            @executors.find_all(topic, partition).each do |executor|
+              jobs << @jobs_builder.revoked(executor)
+            end
+
+            # We need to remove all the executors of a given topic partition that we have lost, so
+            # next time we pick up it's work, new executors kick in. This may be needed especially
+            # for LRJ where we could end up with a race condition
+            # This revocation needs to happen after the jobs are scheduled, otherwise they would
+            # be scheduled with new executors instead of old
+            @executors.revoke(topic, partition)
          end
        end

@@ -183,6 +204,17 @@ module Karafka
        )
      end

+      # Revoked partition needs to be resumed if we were processing them earlier. This will do
+      # nothing to things that we are planning to process. Without this, things we get
+      # re-assigned would not be polled.
+      def resume_assigned_partitions
+        @client.rebalance_manager.assigned_partitions.each do |topic, partitions|
+          partitions.each do |partition|
+            @client.resume(topic, partition)
+          end
+        end
+      end
+
      # Takes the messages per topic partition and enqueues processing jobs in threads using
      # given scheduler.
      def build_and_schedule_consumption_jobs
@@ -191,11 +223,17 @@ module Karafka
        jobs = []

        @messages_buffer.each do |topic, partition, messages|
-
+          coordinator = @coordinators.find_or_create(topic, partition)
+
+          # Start work coordination for this topic partition
+          coordinator.start
+
+          # Count the job we're going to create here
+          coordinator.increment

-          executor = @executors.
+          executor = @executors.find_or_create(topic, partition, 0)

-          jobs << @jobs_builder.consume(executor, messages)
+          jobs << @jobs_builder.consume(executor, messages, coordinator)
        end

        @scheduler.schedule_consumption(@jobs_queue, jobs)
@@ -231,7 +269,7 @@ module Karafka
        @jobs_queue.wait(@subscription_group.id)
        @jobs_queue.clear(@subscription_group.id)
        @client.reset
-        @
+        @coordinators.reset
        @executors = Processing::ExecutorsBuffer.new(@client, @subscription_group)
      end
    end
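Note the ordering in the revocation flow above: revocation jobs are built with the executors that actually did the work, and only afterwards is the executors buffer purged, so that a quick re-assignment starts with fresh executors. The sketch below mimics that ordering with stand-in classes, not the Karafka ones.

# Stand-in executors buffer illustrating "schedule revoke jobs first, evict after"
class DemoExecutorsBuffer
  def initialize
    @buffer = Hash.new { |hash, topic| hash[topic] = Hash.new { |h, p| h[p] = [] } }
  end

  def find_or_create(topic, partition)
    executor = "executor-#{topic}-#{partition}-#{@buffer[topic][partition].size}"
    @buffer[topic][partition] << executor
    executor
  end

  # All executors that ever worked on a given topic partition
  def find_all(topic, partition)
    @buffer[topic][partition]
  end

  # Evict them so future work on this partition gets brand new executors
  def revoke(topic, partition)
    @buffer[topic].delete(partition)
  end
end

buffer = DemoExecutorsBuffer.new
buffer.find_or_create('events', 0)

jobs = []

# 1. Build revocation jobs with the executors that actually ran the work
buffer.find_all('events', 0).each { |executor| jobs << [:revoked, executor] }

# 2. Only afterwards evict, so a quick re-assignment gets fresh executors
buffer.revoke('events', 0)

p jobs # => [[:revoked, "executor-events-0-0"]]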
data/lib/karafka/connection/pauses_manager.rb
CHANGED
@@ -25,6 +25,14 @@ module Karafka
        )
      end

+      # Revokes pause tracker for a given topic partition
+      #
+      # @param topic [String] topic name
+      # @param partition [Integer] partition number
+      def revoke(topic, partition)
+        @pauses[topic].delete(partition)
+      end
+
      # Resumes processing of partitions for which pause time has ended.
      #
      # @yieldparam [String] topic name
data/lib/karafka/connection/rebalance_manager.rb
CHANGED
@@ -18,13 +18,15 @@ module Karafka
      # Empty array for internal usage not to create new objects
      EMPTY_ARRAY = [].freeze

+      attr_reader :assigned_partitions, :revoked_partitions
+
      private_constant :EMPTY_ARRAY

      # @return [RebalanceManager]
      def initialize
        @assigned_partitions = {}
        @revoked_partitions = {}
-        @
+        @changed = false
      end

      # Resets the rebalance manager state
@@ -33,26 +35,12 @@ module Karafka
      def clear
        @assigned_partitions.clear
        @revoked_partitions.clear
-        @
-      end
-
-      # @return [Hash<String, Array<Integer>>] hash where the keys are the names of topics for
-      #   which we've lost partitions and array with ids of the partitions as the value
-      # @note We do not consider as lost topics and partitions that got revoked and assigned
-      def revoked_partitions
-        return @revoked_partitions if @revoked_partitions.empty?
-        return @lost_partitions unless @lost_partitions.empty?
-
-        @revoked_partitions.each do |topic, partitions|
-          @lost_partitions[topic] = partitions - @assigned_partitions.fetch(topic, EMPTY_ARRAY)
-        end
-
-        @lost_partitions
+        @changed = false
      end

-      # @return [Boolean]
-      def
-
+      # @return [Boolean] indicates a state change in the partitions assignment
+      def changed?
+        @changed
      end

      # Callback that kicks in inside of rdkafka, when new partitions are assigned.
@@ -62,6 +50,7 @@ module Karafka
      # @param partitions [Rdkafka::Consumer::TopicPartitionList]
      def on_partitions_assigned(_, partitions)
        @assigned_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+        @changed = true
      end

      # Callback that kicks in inside of rdkafka, when partitions are revoked.
@@ -71,6 +60,18 @@ module Karafka
      # @param partitions [Rdkafka::Consumer::TopicPartitionList]
      def on_partitions_revoked(_, partitions)
        @revoked_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
+        @changed = true
+      end
+
+      # We consider as lost only partitions that were taken away and not re-assigned back to us
+      def lost_partitions
+        lost_partitions = {}
+
+        revoked_partitions.each do |topic, partitions|
+          lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, EMPTY_ARRAY)
+        end
+
+        lost_partitions
      end
    end
  end
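The reworked manager keeps revoked_partitions and assigned_partitions as plain readers and derives lost_partitions on demand: a partition counts as lost only when it was revoked and not handed back in the same rebalance. The computation is a per-topic array difference; a standalone illustration with sample data:

# Partitions revoked during the rebalance, per topic
revoked_partitions  = { 'events' => [0, 1, 2] }
# Partitions assigned back to this process in the same rebalance
assigned_partitions = { 'events' => [1, 2], 'logs' => [3] }

lost_partitions = {}

revoked_partitions.each do |topic, partitions|
  lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, [])
end

p lost_partitions # => {"events"=>[0]}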