waterdrop 2.8.15 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -2
- data/.rubocop.yml +48 -0
- data/.ruby-version +1 -1
- data/.yard-lint.yml +172 -72
- data/CHANGELOG.md +19 -0
- data/Gemfile +9 -9
- data/Gemfile.lint +14 -0
- data/Gemfile.lint.lock +108 -0
- data/Gemfile.lock +52 -76
- data/README.md +1 -1
- data/Rakefile +14 -2
- data/bin/integrations +31 -30
- data/bin/verify_topics_naming +8 -8
- data/config/locales/errors.yml +13 -0
- data/docker-compose.oauth.yml +56 -0
- data/docker-compose.yml +1 -1
- data/lib/waterdrop/clients/dummy.rb +9 -0
- data/lib/waterdrop/clients/rdkafka.rb +19 -3
- data/lib/waterdrop/config.rb +50 -6
- data/lib/waterdrop/connection_pool.rb +13 -11
- data/lib/waterdrop/contracts/config.rb +33 -6
- data/lib/waterdrop/contracts/message.rb +2 -2
- data/lib/waterdrop/contracts/poller_config.rb +26 -0
- data/lib/waterdrop/contracts/transactional_offset.rb +2 -2
- data/lib/waterdrop/contracts/variant.rb +18 -18
- data/lib/waterdrop/errors.rb +3 -0
- data/lib/waterdrop/instrumentation/callbacks/delivery.rb +8 -8
- data/lib/waterdrop/instrumentation/callbacks/error.rb +5 -5
- data/lib/waterdrop/instrumentation/callbacks/oauthbearer_token_refresh.rb +4 -4
- data/lib/waterdrop/instrumentation/callbacks/statistics.rb +10 -8
- data/lib/waterdrop/instrumentation/idle_disconnector_listener.rb +4 -4
- data/lib/waterdrop/instrumentation/logger_listener.rb +10 -10
- data/lib/waterdrop/instrumentation/notifications.rb +3 -0
- data/lib/waterdrop/instrumentation/vendors/datadog/metrics_listener.rb +19 -19
- data/lib/waterdrop/polling/config.rb +52 -0
- data/lib/waterdrop/polling/latch.rb +49 -0
- data/lib/waterdrop/polling/poller.rb +415 -0
- data/lib/waterdrop/polling/queue_pipe.rb +63 -0
- data/lib/waterdrop/polling/state.rb +151 -0
- data/lib/waterdrop/polling.rb +22 -0
- data/lib/waterdrop/producer/async.rb +6 -6
- data/lib/waterdrop/producer/buffer.rb +8 -8
- data/lib/waterdrop/producer/idempotence.rb +3 -3
- data/lib/waterdrop/producer/sync.rb +15 -8
- data/lib/waterdrop/producer/testing.rb +1 -1
- data/lib/waterdrop/producer/transactions.rb +6 -6
- data/lib/waterdrop/producer/variant.rb +2 -2
- data/lib/waterdrop/producer.rb +113 -30
- data/lib/waterdrop/version.rb +1 -1
- data/lib/waterdrop.rb +15 -10
- data/package-lock.json +331 -0
- data/package.json +9 -0
- data/renovate.json +26 -7
- data/waterdrop.gemspec +23 -23
- metadata +19 -17
- data/.coditsu/ci.yml +0 -3
- data/.github/CODEOWNERS +0 -3
- data/.github/FUNDING.yml +0 -1
- data/.github/ISSUE_TEMPLATE/bug_report.md +0 -43
- data/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
- data/.github/workflows/ci.yml +0 -143
- data/.github/workflows/push.yml +0 -35
- data/.github/workflows/trigger-wiki-refresh.yml +0 -30
- data/.github/workflows/verify-action-pins.yml +0 -16
- data/.rspec +0 -2
- data/log/.gitkeep +0 -0
data/lib/waterdrop/instrumentation/vendors/datadog/metrics_listener.rb

@@ -20,7 +20,7 @@ module WaterDrop
           RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)
 
           # Namespace under which the DD metrics should be published
-          setting :namespace, default: 'waterdrop'
+          setting :namespace, default: "waterdrop"
 
           # Datadog client that we should use to publish the metrics
           setting :client
@@ -35,19 +35,19 @@ module WaterDrop
           # Note, that the once with `_d` come from WaterDrop, not rdkafka or Kafka
           setting :rd_kafka_metrics, default: [
             # Client metrics
-            RdKafkaMetric.new(:count, :root, 'calls', 'tx_d'),
-            RdKafkaMetric.new(:histogram, :root, 'queue.size', 'msg_cnt'),
+            RdKafkaMetric.new(:count, :root, "calls", "tx_d"),
+            RdKafkaMetric.new(:histogram, :root, "queue.size", "msg_cnt"),
 
             # Broker metrics
-            RdKafkaMetric.new(:count, :brokers, 'deliver.attempts', 'txretries_d'),
-            RdKafkaMetric.new(:count, :brokers, 'deliver.errors', 'txerrs_d'),
-            RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
-            RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.avg', %w[outbuf_latency avg]),
-            RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p95', %w[outbuf_latency p95]),
-            RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p99', %w[outbuf_latency p99]),
-            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
-            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
-            RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
+            RdKafkaMetric.new(:count, :brokers, "deliver.attempts", "txretries_d"),
+            RdKafkaMetric.new(:count, :brokers, "deliver.errors", "txerrs_d"),
+            RdKafkaMetric.new(:count, :brokers, "receive.errors", "rxerrs_d"),
+            RdKafkaMetric.new(:gauge, :brokers, "queue.latency.avg", %w[outbuf_latency avg]),
+            RdKafkaMetric.new(:gauge, :brokers, "queue.latency.p95", %w[outbuf_latency p95]),
+            RdKafkaMetric.new(:gauge, :brokers, "queue.latency.p99", %w[outbuf_latency p99]),
+            RdKafkaMetric.new(:gauge, :brokers, "network.latency.avg", %w[rtt avg]),
+            RdKafkaMetric.new(:gauge, :brokers, "network.latency.p95", %w[rtt p95]),
+            RdKafkaMetric.new(:gauge, :brokers, "network.latency.p99", %w[rtt p99])
           ].freeze
 
           configure
@@ -78,13 +78,13 @@ module WaterDrop
         #
         # @param _event [Karafka::Core::Monitoring::Event]
         def on_error_occurred(_event)
-          count('error_occurred', 1, tags: default_tags)
+          count("error_occurred", 1, tags: default_tags)
         end
 
         # Increases acknowledged messages counter
         # @param _event [Karafka::Core::Monitoring::Event]
         def on_message_acknowledged(_event)
-          increment('acknowledged', tags: default_tags)
+          increment("acknowledged", tags: default_tags)
         end
 
         %i[
@@ -216,26 +216,26 @@ module WaterDrop
               tags: default_tags
             )
           when :brokers
-            statistics.fetch('brokers').each_value do |broker_statistics|
+            statistics.fetch("brokers").each_value do |broker_statistics|
               # Skip bootstrap nodes
               # Bootstrap nodes have nodeid -1, other nodes have positive
               # node ids
-              next if broker_statistics['nodeid'] == -1
+              next if broker_statistics["nodeid"] == -1
 
               public_send(
                 metric.type,
                 metric.name,
                 broker_statistics.dig(*metric.key_location),
-                tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
+                tags: default_tags + ["broker:#{broker_statistics["nodename"]}"]
               )
             end
           when :topics
-            statistics.fetch('topics').each_value do |topic_statistics|
+            statistics.fetch("topics").each_value do |topic_statistics|
               public_send(
                 metric.type,
                 metric.name,
                 topic_statistics.dig(*metric.key_location),
-                tags: default_tags + ["topic:#{topic_statistics['topic']}"]
+                tags: default_tags + ["topic:#{topic_statistics["topic"]}"]
               )
             end
           else
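The hunks above only change quote style, but they also expose the listener's whole configuration surface (namespace, client, rd_kafka_metrics). A minimal usage sketch, assuming the dogstatsd-ruby gem provides Datadog::Statsd and that `producer` is an existing WaterDrop producer:

require "waterdrop"
require "datadog/statsd"

# Build the listener; the block yields its configurable node, so the settings
# from the diff above (namespace, client, rd_kafka_metrics) can be overridden here
listener = WaterDrop::Instrumentation::Vendors::Datadog::MetricsListener.new do |config|
  config.client = Datadog::Statsd.new("localhost", 8125)
  config.namespace = "waterdrop" # the default shown above
end

# Attach it to the producer's monitor so rdkafka statistics get reported
producer.monitor.subscribe(listener)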
data/lib/waterdrop/polling/config.rb (new file)

@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+# WaterDrop main module
+module WaterDrop
+  # Namespace for FD-based polling components
+  module Polling
+    # Configuration for the global FD poller singleton
+    # These settings apply to all producers using FD polling mode
+    #
+    # @example Configure before creating any producers
+    #   WaterDrop::Polling::Config.setup do |config|
+    #     config.thread_priority = -1
+    #     config.poll_timeout = 500
+    #   end
+    class Config
+      extend ::Karafka::Core::Configurable
+
+      # Ruby thread priority for the poller thread
+      # Valid range: -3 to 3 (Ruby's thread priority range)
+      # Higher values = higher priority
+      setting :thread_priority, default: 0
+
+      # IO.select timeout in milliseconds
+      # Controls how often periodic polling happens when no FD events occur
+      # Lower values = more responsive OAuth/stats callbacks but higher CPU
+      setting :poll_timeout, default: 1_000
+
+      # Initial backoff delay in milliseconds after a polling error
+      setting :backoff_min, default: 100
+
+      # Maximum backoff delay in milliseconds after repeated errors
+      # Backoff doubles on each consecutive error up to this limit
+      setting :backoff_max, default: 30_000
+
+      class << self
+        # Configures the poller settings
+        # @yield [config] Configuration block
+        # @yieldparam config [Karafka::Core::Configurable::Node] config node
+        def setup
+          configure do |config|
+            yield(config)
+          end
+
+          Contracts::PollerConfig.new.validate!(
+            self.config.to_h,
+            Errors::ConfigurationInvalidError
+          )
+        end
+      end
+    end
+  end
+end
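The backoff_min/backoff_max pair drives the exponential backoff used by Poller#polling_loop (shown later in this diff). A standalone sketch, not part of the gem, of the retry pacing produced by the defaults above:

# Mirrors the backoff arithmetic from the polling loop: start at backoff_min,
# double on each consecutive error, cap at backoff_max
backoff_min = 100    # ms, default above
backoff_max = 30_000 # ms, default above

backoff_ms = 0
10.times do
  backoff_ms = backoff_ms.zero? ? backoff_min : [backoff_ms * 2, backoff_max].min
  print "#{backoff_ms}ms "
end
# 100ms 200ms 400ms 800ms 1600ms 3200ms 6400ms 12800ms 25600ms 30000ms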
data/lib/waterdrop/polling/latch.rb (new file)

@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+
+module WaterDrop
+  module Polling
+    # A thread-safe latch for synchronizing producer close operations
+    #
+    # When a producer is closed, two threads are involved:
+    # 1. The caller thread (user code calling producer.close)
+    # 2. The poller thread (background thread running IO.select)
+    #
+    # The close sequence:
+    # 1. Caller calls producer.close -> unregister_from_poller -> Poller#unregister
+    # 2. Poller#unregister signals via control pipe and calls state.wait_for_close (blocks on latch)
+    # 3. Poller thread receives control signal, drains queue, calls state.close
+    # 4. state.close releases the latch via release!
+    # 5. Caller's wait_for_close returns, unregister completes
+    #
+    # This ensures the producer is fully drained and removed from the poller
+    # before returning control to the caller, preventing race conditions.
+    class Latch
+      def initialize
+        @mutex = Mutex.new
+        @cv = ConditionVariable.new
+        @released = false
+      end
+
+      # Releases the latch and wakes any waiting threads
+      def release!
+        @mutex.synchronize do
+          @released = true
+          @cv.broadcast
+        end
+      end
+
+      # Waits until the latch is released
+      # Returns immediately if already released
+      def wait
+        @mutex.synchronize do
+          @cv.wait(@mutex) until @released
+        end
+      end
+
+      # @return [Boolean] whether the latch has been released
+      def released?
+        @released
+      end
+    end
+  end
+end
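A standalone sketch of the handshake described in the class comment: the caller blocks in wait while a second thread (standing in for the poller) finishes cleanup and releases the latch:

latch = WaterDrop::Polling::Latch.new

# Stand-in for the poller thread: finish draining, then release
poller = Thread.new do
  # ... drain the producer's queue here ...
  latch.release!
end

latch.wait      # blocks until release! runs; returns immediately if already released
poller.join
latch.released? # => true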
data/lib/waterdrop/polling/poller.rb (new file)

@@ -0,0 +1,415 @@
+# frozen_string_literal: true
+
+module WaterDrop
+  # Namespace for FD-based polling components
+  # Contains the global Poller singleton and State class for managing producer polling
+  module Polling
+    # Global poller singleton that manages a single polling thread for all FD-mode producers
+    # This replaces librdkafka's native background polling threads with a single Ruby thread
+    # that uses IO.select for efficient multiplexing
+    #
+    # Spawning one thread per producer is acceptable for 1-2 producers but in case of a system
+    # with several (transactional for example) the cost becomes bigger and bigger.
+    #
+    # This implementation handles things by being event-driven instead of GVL releasing blocking.
+    #
+    # @note Newly registered producers may experience up to 1 second delay before their first
+    #   poll cycle, as the poller thread only rebuilds its IO list when IO.select times out.
+    #   This is acceptable because producers are expected to be long-lived and the initial
+    #   connection overhead to Kafka typically exceeds this delay anyway.
+    class Poller
+      include Singleton
+      include ::Karafka::Core::Helpers::Time
+
+      # Make new public so users can create dedicated poller instances for isolation
+      # The singleton instance remains available via Poller.instance for the default behavior
+      public_class_method :new
+
+      # Mutex for thread-safe ID generation - initialized at class load time
+      # to avoid race conditions with lazy initialization
+      ID_MUTEX = Mutex.new
+
+      # Counter for generating unique poller IDs
+      @id_counter = 0
+
+      class << self
+        # Generates incremental IDs for poller instances (starting from 0)
+        # @return [Integer] next poller ID
+        def next_id
+          ID_MUTEX.synchronize do
+            id = @id_counter
+            @id_counter += 1
+            id
+          end
+        end
+      end
+
+      # @return [Integer] unique identifier for this poller instance
+      attr_reader :id
+
+      def initialize
+        @id = self.class.next_id
+        @mutex = Mutex.new
+        @producers = {}
+        @thread = nil
+        @shutdown = false
+        @pid = Process.pid
+
+        # Cached collections - rebuilt only when producers change
+        @cached_ios = []
+        @cached_io_to_state = {}
+        @cached_states = []
+        @cached_result = nil
+        @ios_dirty = true
+      end
+
+      # Checks if the current thread is the poller thread
+      # Used to detect when close is called from within a callback to avoid deadlock
+      # @return [Boolean] true if current thread is the poller thread
+      def in_poller_thread?
+        Thread.current == @thread
+      end
+
+      # Checks if the poller thread is alive
+      # @return [Boolean] true if the poller thread is running
+      def alive?
+        @thread&.alive? || false
+      end
+
+      # Returns the number of registered producers
+      # @return [Integer] number of producers
+      def count
+        @mutex.synchronize { @producers.size }
+      end
+
+      # Shuts down the poller and resets state
+      # @note This is primarily for testing to reset singleton state between tests
+      def shutdown!
+        @mutex.synchronize { @shutdown = true }
+
+        thread = @thread
+        if thread&.alive?
+          thread.join(5)
+          thread.kill if thread.alive?
+        end
+
+        @mutex.synchronize do
+          @producers.each_value { |state| state.close unless state.closed? }
+          @producers.clear
+          @thread = nil
+          @shutdown = false
+          @ios_dirty = true
+          @cached_ios = []
+          @cached_io_to_state = {}
+          @cached_states = []
+          @cached_result = nil
+          @poll_timeout_s = nil
+        end
+      end
+
+      # Registers a producer with the poller
+      # @param producer [WaterDrop::Producer] the producer instance
+      # @param client [Rdkafka::Producer] the rdkafka client
+      def register(producer, client)
+        ensure_same_process!
+
+        state = State.new(
+          producer.id,
+          client,
+          producer.monitor,
+          producer.config.polling.fd.max_time,
+          producer.config.polling.fd.periodic_poll_interval
+        )
+
+        @mutex.synchronize do
+          @producers[producer.id] = state
+          @ios_dirty = true
+          # Reset shutdown flag in case thread is exiting but hasn't yet
+          # This prevents race where new producer is closed by exiting thread
+          @shutdown = false
+          ensure_thread_running!
+        end
+
+        producer.monitor.instrument(
+          "poller.producer_registered",
+          producer_id: producer.id
+        )
+      end
+
+      # Unregisters a producer from the poller
+      # This method blocks until the producer is fully removed from the poller
+      # to prevent race conditions when disconnect/reconnect happens in quick succession
+      # This matches the threaded polling behavior which drains without timeout
+      # @param producer [WaterDrop::Producer] the producer instance
+      def unregister(producer)
+        state, thread = @mutex.synchronize { [@producers[producer.id], @thread] }
+
+        return unless state
+
+        # Signal the poller thread to handle removal
+        state.signal_close
+
+        # Wait for the state to be fully closed by the poller thread
+        # This prevents race conditions where a new registration with the same
+        # producer_id could be deleted by a pending close signal
+        # Skip waiting if called from within the poller thread itself (e.g., from a callback)
+        # to avoid deadlock - the poller thread can't wait for itself
+        # The cleanup will happen after the callback returns
+        state.wait_for_close unless Thread.current == thread
+
+        producer.monitor.instrument(
+          "poller.producer_unregistered",
+          producer_id: producer.id
+        )
+      end
+
+      private
+
+      # Ensures we're in the same process (for fork safety)
+      def ensure_same_process!
+        return if @pid == Process.pid
+
+        # Reset state after fork - parent's thread and producers are not valid in child
+        @mutex = Mutex.new
+        @producers = {}
+        @thread = nil
+        @shutdown = false
+        @pid = Process.pid
+        @cached_ios = []
+        @cached_io_to_state = {}
+        @cached_states = []
+        @cached_result = nil
+        @ios_dirty = true
+      end
+
+      # Ensures the polling thread is running
+      # Must be called within @mutex.synchronize
+      def ensure_thread_running!
+        return if @thread&.alive?
+
+        @shutdown = false
+        @thread = Thread.new { polling_loop }
+        @thread.name = "waterdrop.poller##{@id}"
+        @thread.priority = Config.config.thread_priority
+      end
+
+      # Main polling loop that runs in a dedicated thread
+      def polling_loop
+        backoff_ms = 0
+
+        loop do
+          break if @shutdown
+
+          # Apply backoff from previous error
+          if backoff_ms > 0
+            sleep(backoff_ms / 1_000.0)
+            backoff_ms = 0
+          end
+
+          # Collect readable IOs (queue FDs)
+          readable_ios, io_to_state = collect_readable_ios
+
+          # Exit when no producers registered
+          # New registrations will start a fresh thread via ensure_thread_running!
+          break if readable_ios.empty?
+
+          poll_with_select(readable_ios, io_to_state)
+        rescue => e
+          # Report error and apply exponential backoff to prevent spam
+          broadcast_error("poller.polling_loop", e)
+          backoff_ms =
+            if backoff_ms.zero?
+              Config.config.backoff_min
+            else
+              [backoff_ms * 2, Config.config.backoff_max].min
+            end
+        end
+      ensure
+        # Clear thread reference first so new registrations will start a fresh thread
+        # This prevents race where register sees old thread as alive during cleanup
+        @mutex.synchronize { @thread = nil }
+
+        # When the poller thread exits (error or clean shutdown), close all remaining states
+        # This releases any latches that might be waiting in unregister calls
+        close_all_states
+      end
+
+      # Broadcasts an error to all registered producers' monitors
+      # @param type [String] error type identifier
+      # @param error [Exception] the error to report
+      def broadcast_error(type, error)
+        @cached_states.each do |state|
+          state.monitor.instrument(
+            "error.occurred",
+            type: type,
+            error: error,
+            producer_id: state.producer_id
+          )
+        end
+      end
+
+      # Collects all IOs to monitor and builds a mapping from IO to State
+      # Uses cached arrays when possible to avoid allocations in the hot path
+      # @return [Array<Array<IO>, Hash{IO => State}, Array<State>>] tuple of ios, io-to-state map, states
+      def collect_readable_ios
+        # Fast path: return cached result if not dirty (no mutex needed)
+        # Safe because @cached_result is frozen and assigned atomically
+        return @cached_result unless @ios_dirty
+
+        @mutex.synchronize do
+          @cached_ios = []
+          @cached_io_to_state = {}
+          @cached_states = []
+
+          @producers.each_value do |state|
+            io = state.io
+            @cached_ios << io
+            @cached_io_to_state[io] = state
+            @cached_states << state
+          end
+
+          @cached_result = [@cached_ios, @cached_io_to_state, @cached_states].freeze
+          @ios_dirty = false
+        end
+
+        @cached_result
+      end
+
+      # Poll producers using IO.select for efficient multiplexing
+      # @param readable_ios [Array<IO>] IOs to monitor
+      # @param io_to_state [Hash{IO => State}] mapping from IO to state
+      def poll_with_select(readable_ios, io_to_state)
+        begin
+          ready = IO.select(readable_ios, nil, nil, poll_timeout_s)
+        rescue IOError, Errno::EBADF
+          # An IO was closed - mark dirty to rebuild on next iteration
+          @ios_dirty = true
+          return
+        end
+
+        if ready.nil?
+          # Timeout: poll ALL producers to ensure OAuth/stats fire
+          poll_all_producers
+        else
+          # FDs ready: handle close signals and poll active producers
+          any_polled = false
+
+          ready[0].each do |io|
+            state = io_to_state[io]
+            next unless state
+
+            # Drain the pipe first (clears librdkafka signals + our signals)
+            state.drain
+
+            # Check if this producer is closing (flag set before signal)
+            if state.closing?
+              handle_close_signal(state)
+            else
+              poll_producer(state)
+              # Check if callback signaled close while we were polling
+              # (e.g., user code closed producer from within delivery callback)
+              if state.closing?
+                handle_close_signal(state)
+              else
+                any_polled = true
+              end
+            end
+          end
+
+          # Check for stale producers when actively polling
+          # Skip when single producer (most common case) - no other producers to become stale
+          # (ensures OAuth/stats fire for idle producers when others are busy)
+          poll_stale_producers if any_polled && @cached_states.size > 1
+        end
+      end
+
+      # Polls all registered producers
+      # Called when IO.select times out to ensure periodic polling happens
+      # This ensures OAuth token refresh and statistics callbacks fire for all producers
+      def poll_all_producers
+        @cached_states.each { |state| poll_producer(state) }
+      end
+
+      # Polls producers that haven't been polled recently
+      # Called when processing continue signals to prevent starvation of idle producers
+      # when one producer is very busy
+      # Each State internally throttles the check to avoid excessive overhead
+      def poll_stale_producers
+        @cached_states.each do |state|
+          poll_producer(state) if state.needs_periodic_poll?
+        end
+      end
+
+      # Drains the producer's event queue by polling until empty or time quanta exceeded
+      # @param state [State] the producer state
+      def poll_producer(state)
+        # state.poll returns:
+        # - true when queue is empty (fully drained)
+        # - false when timeout hit (more events may remain)
+        drained = state.poll
+        state.mark_polled!
+
+        # Hit time limit but still have events - signal to continue polling
+        state.signal_continue unless drained
+      rescue Rdkafka::ClosedProducerError
+        # Producer was closed, will be cleaned up
+      end
+
+      # Handles a close signal from a producer
+      # @param state [State] the producer state
+      def handle_close_signal(state)
+        # Drain remaining events before closing
+        # This matches rdkafka's native polling thread behavior: keep polling until outq_len is zero
+        drain_producer_queue(state)
+
+        # Remove producer from registry and clean up
+        # If this was the last producer, signal shutdown to stop the thread immediately
+        @mutex.synchronize do
+          @producers.delete(state.producer_id)
+          @ios_dirty = true
+
+          # Stop thread immediately when last producer unregisters to prevent resource leakage
+          @shutdown = true if @producers.empty?
+        end
+
+        state.close
+      end
+
+      # Closes all remaining producer states
+      # Called when the poller thread exits to release any pending latches
+      # This prevents deadlocks if producers are waiting in unregister
+      def close_all_states
+        states = @mutex.synchronize do
+          to_close = @producers.values.dup
+          @producers.clear
+          @ios_dirty = true
+          to_close
+        end
+
+        states.each do |state|
+          state.close unless state.closed?
+        rescue
+          # Ignore errors during cleanup
+        end
+      end
+
+      # Drains the producer's event queue completely before closing
+      # Matches rdkafka's native polling thread behavior: keep polling until queue is empty
+      # @param state [State] the producer state
+      def drain_producer_queue(state)
+        loop do
+          break if state.queue_empty?
+
+          state.poll
+        end
+      rescue Rdkafka::ClosedProducerError
+        # Producer was already closed, nothing more to drain
+      end
+
+      # @return [Float] poll_timeout converted to seconds (cached)
+      def poll_timeout_s
+        @poll_timeout_s ||= Config.config.poll_timeout / 1_000.0
+      end
+    end
+  end
+end
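As the comments around public_class_method :new note, the class doubles as a process-wide singleton and a constructor for dedicated pollers. A short sketch of the two access paths; how a producer gets bound to a dedicated instance is not part of this file, so only construction and introspection are shown:

shared   = WaterDrop::Polling::Poller.instance # default process-wide poller
isolated = WaterDrop::Polling::Poller.new      # dedicated instance for isolation

# IDs are handed out sequentially and thread-safely via next_id/ID_MUTEX
shared.id == isolated.id # => false

isolated.alive? # => false; the thread only spawns once a producer registers
isolated.count  # => 0 registered producers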
data/lib/waterdrop/polling/queue_pipe.rb (new file)

@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module WaterDrop
+  module Polling
+    # A pipe connected to librdkafka's queue event notification system
+    # When events (delivery reports, statistics) arrive, librdkafka writes to the pipe
+    # allowing IO.select to wake up immediately
+    #
+    # This pipe is also used by WaterDrop to signal:
+    # - Continue: when poll hits time limit but more events remain
+    # - Close: when producer is being closed (combined with @closing flag in State)
+    #
+    # Reusing the same pipe reduces file descriptors and IO.select monitoring overhead
+    class QueuePipe
+      # @return [IO] the readable end of the pipe for use with IO.select
+      attr_reader :reader
+
+      # Creates a new queue pipe and connects it to the client's event queue
+      # @param client [Rdkafka::Producer] the rdkafka client
+      # @raise [StandardError] if enable_queue_io_events fails
+      def initialize(client)
+        @reader, @writer = IO.pipe
+
+        # Tell librdkafka to write to our pipe when events arrive on the main queue
+        client.enable_queue_io_events(@writer.fileno)
+      end
+
+      # Signals by writing a byte to the pipe
+      # Used to wake IO.select for continue/close signals
+      # Thread-safe and non-blocking; silently ignores errors
+      def signal
+        @writer.write_nonblock("W", exception: false)
+      rescue IOError, Errno::EBADF
+        # Pipe closed
+      end
+
+      # Drains all pending bytes from the pipe
+      # Called after IO.select returns to clear the notification
+      # Uses a single large read to drain in one syscall (pipe buffers are typically 64KB)
+      def drain
+        @reader.read_nonblock(1_048_576, exception: false)
+      rescue IOError, Errno::EBADF
+        # Pipe closed during drain
+      end
+
+      # Closes both ends of the pipe
+      def close
+        close_io(@reader)
+        close_io(@writer)
+      end
+
+      private
+
+      # Safely closes an IO object
+      # @param io [IO] the IO to close
+      def close_io(io)
+        io.close
+      rescue IOError, Errno::EBADF
+        # Already closed, ignore
+      end
+    end
+  end
+end