waterdrop 2.8.15 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -2
  3. data/.rubocop.yml +48 -0
  4. data/.ruby-version +1 -1
  5. data/.yard-lint.yml +172 -72
  6. data/CHANGELOG.md +19 -0
  7. data/Gemfile +9 -9
  8. data/Gemfile.lint +14 -0
  9. data/Gemfile.lint.lock +108 -0
  10. data/Gemfile.lock +52 -76
  11. data/README.md +1 -1
  12. data/Rakefile +14 -2
  13. data/bin/integrations +31 -30
  14. data/bin/verify_topics_naming +8 -8
  15. data/config/locales/errors.yml +13 -0
  16. data/docker-compose.oauth.yml +56 -0
  17. data/docker-compose.yml +1 -1
  18. data/lib/waterdrop/clients/dummy.rb +9 -0
  19. data/lib/waterdrop/clients/rdkafka.rb +19 -3
  20. data/lib/waterdrop/config.rb +50 -6
  21. data/lib/waterdrop/connection_pool.rb +13 -11
  22. data/lib/waterdrop/contracts/config.rb +33 -6
  23. data/lib/waterdrop/contracts/message.rb +2 -2
  24. data/lib/waterdrop/contracts/poller_config.rb +26 -0
  25. data/lib/waterdrop/contracts/transactional_offset.rb +2 -2
  26. data/lib/waterdrop/contracts/variant.rb +18 -18
  27. data/lib/waterdrop/errors.rb +3 -0
  28. data/lib/waterdrop/instrumentation/callbacks/delivery.rb +8 -8
  29. data/lib/waterdrop/instrumentation/callbacks/error.rb +5 -5
  30. data/lib/waterdrop/instrumentation/callbacks/oauthbearer_token_refresh.rb +4 -4
  31. data/lib/waterdrop/instrumentation/callbacks/statistics.rb +10 -8
  32. data/lib/waterdrop/instrumentation/idle_disconnector_listener.rb +4 -4
  33. data/lib/waterdrop/instrumentation/logger_listener.rb +10 -10
  34. data/lib/waterdrop/instrumentation/notifications.rb +3 -0
  35. data/lib/waterdrop/instrumentation/vendors/datadog/metrics_listener.rb +19 -19
  36. data/lib/waterdrop/polling/config.rb +52 -0
  37. data/lib/waterdrop/polling/latch.rb +49 -0
  38. data/lib/waterdrop/polling/poller.rb +415 -0
  39. data/lib/waterdrop/polling/queue_pipe.rb +63 -0
  40. data/lib/waterdrop/polling/state.rb +151 -0
  41. data/lib/waterdrop/polling.rb +22 -0
  42. data/lib/waterdrop/producer/async.rb +6 -6
  43. data/lib/waterdrop/producer/buffer.rb +8 -8
  44. data/lib/waterdrop/producer/idempotence.rb +3 -3
  45. data/lib/waterdrop/producer/sync.rb +15 -8
  46. data/lib/waterdrop/producer/testing.rb +1 -1
  47. data/lib/waterdrop/producer/transactions.rb +6 -6
  48. data/lib/waterdrop/producer/variant.rb +2 -2
  49. data/lib/waterdrop/producer.rb +113 -30
  50. data/lib/waterdrop/version.rb +1 -1
  51. data/lib/waterdrop.rb +15 -10
  52. data/package-lock.json +331 -0
  53. data/package.json +9 -0
  54. data/renovate.json +26 -7
  55. data/waterdrop.gemspec +23 -23
  56. metadata +19 -17
  57. data/.coditsu/ci.yml +0 -3
  58. data/.github/CODEOWNERS +0 -3
  59. data/.github/FUNDING.yml +0 -1
  60. data/.github/ISSUE_TEMPLATE/bug_report.md +0 -43
  61. data/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
  62. data/.github/workflows/ci.yml +0 -143
  63. data/.github/workflows/push.yml +0 -35
  64. data/.github/workflows/trigger-wiki-refresh.yml +0 -30
  65. data/.github/workflows/verify-action-pins.yml +0 -16
  66. data/.rspec +0 -2
  67. data/log/.gitkeep +0 -0
data/lib/waterdrop/instrumentation/vendors/datadog/metrics_listener.rb

@@ -20,7 +20,7 @@ module WaterDrop
  RdKafkaMetric = Struct.new(:type, :scope, :name, :key_location)
 
  # Namespace under which the DD metrics should be published
- setting :namespace, default: 'waterdrop'
+ setting :namespace, default: "waterdrop"
 
  # Datadog client that we should use to publish the metrics
  setting :client
@@ -35,19 +35,19 @@ module WaterDrop
  # Note, that the once with `_d` come from WaterDrop, not rdkafka or Kafka
  setting :rd_kafka_metrics, default: [
  # Client metrics
- RdKafkaMetric.new(:count, :root, 'calls', 'tx_d'),
- RdKafkaMetric.new(:histogram, :root, 'queue.size', 'msg_cnt'),
+ RdKafkaMetric.new(:count, :root, "calls", "tx_d"),
+ RdKafkaMetric.new(:histogram, :root, "queue.size", "msg_cnt"),
 
  # Broker metrics
- RdKafkaMetric.new(:count, :brokers, 'deliver.attempts', 'txretries_d'),
- RdKafkaMetric.new(:count, :brokers, 'deliver.errors', 'txerrs_d'),
- RdKafkaMetric.new(:count, :brokers, 'receive.errors', 'rxerrs_d'),
- RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.avg', %w[outbuf_latency avg]),
- RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p95', %w[outbuf_latency p95]),
- RdKafkaMetric.new(:gauge, :brokers, 'queue.latency.p99', %w[outbuf_latency p99]),
- RdKafkaMetric.new(:gauge, :brokers, 'network.latency.avg', %w[rtt avg]),
- RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p95', %w[rtt p95]),
- RdKafkaMetric.new(:gauge, :brokers, 'network.latency.p99', %w[rtt p99])
+ RdKafkaMetric.new(:count, :brokers, "deliver.attempts", "txretries_d"),
+ RdKafkaMetric.new(:count, :brokers, "deliver.errors", "txerrs_d"),
+ RdKafkaMetric.new(:count, :brokers, "receive.errors", "rxerrs_d"),
+ RdKafkaMetric.new(:gauge, :brokers, "queue.latency.avg", %w[outbuf_latency avg]),
+ RdKafkaMetric.new(:gauge, :brokers, "queue.latency.p95", %w[outbuf_latency p95]),
+ RdKafkaMetric.new(:gauge, :brokers, "queue.latency.p99", %w[outbuf_latency p99]),
+ RdKafkaMetric.new(:gauge, :brokers, "network.latency.avg", %w[rtt avg]),
+ RdKafkaMetric.new(:gauge, :brokers, "network.latency.p95", %w[rtt p95]),
+ RdKafkaMetric.new(:gauge, :brokers, "network.latency.p99", %w[rtt p99])
  ].freeze
 
  configure
@@ -78,13 +78,13 @@ module WaterDrop
  #
  # @param _event [Karafka::Core::Monitoring::Event]
  def on_error_occurred(_event)
- count('error_occurred', 1, tags: default_tags)
+ count("error_occurred", 1, tags: default_tags)
  end
 
  # Increases acknowledged messages counter
  # @param _event [Karafka::Core::Monitoring::Event]
  def on_message_acknowledged(_event)
- increment('acknowledged', tags: default_tags)
+ increment("acknowledged", tags: default_tags)
  end
 
  %i[
@@ -216,26 +216,26 @@ module WaterDrop
  tags: default_tags
  )
  when :brokers
- statistics.fetch('brokers').each_value do |broker_statistics|
+ statistics.fetch("brokers").each_value do |broker_statistics|
  # Skip bootstrap nodes
  # Bootstrap nodes have nodeid -1, other nodes have positive
  # node ids
- next if broker_statistics['nodeid'] == -1
+ next if broker_statistics["nodeid"] == -1
 
  public_send(
  metric.type,
  metric.name,
  broker_statistics.dig(*metric.key_location),
- tags: default_tags + ["broker:#{broker_statistics['nodename']}"]
+ tags: default_tags + ["broker:#{broker_statistics["nodename"]}"]
  )
  end
  when :topics
- statistics.fetch('topics').each_value do |topic_statistics|
+ statistics.fetch("topics").each_value do |topic_statistics|
  public_send(
  metric.type,
  metric.name,
  topic_statistics.dig(*metric.key_location),
- tags: default_tags + ["topic:#{topic_statistics['topic']}"]
+ tags: default_tags + ["topic:#{topic_statistics["topic"]}"]
  )
  end
  else
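
The metrics listener hunks above are a mechanical switch from single- to double-quoted string literals; behavior is unchanged. For orientation only, a hedged sketch of wiring this listener up, using the settings visible in this diff (:namespace, :client, and the default_tags accessor referenced by the counters). The Datadog::Statsd host/port, the tag values, and the monitor.subscribe call reflect WaterDrop's usual instrumentation pattern and are assumptions, not part of this diff:

    require "datadog/statsd"

    # Illustrative configuration; `producer` is assumed to be an existing WaterDrop::Producer
    listener = WaterDrop::Instrumentation::Vendors::Datadog::MetricsListener.new do |config|
      config.client = Datadog::Statsd.new("localhost", 8125) # illustrative agent address
      config.namespace = "waterdrop"                          # default shown in the hunk above
      config.default_tags = ["environment:production"]        # example tags only
    end

    # Assumes the standard WaterDrop monitor subscription API
    producer.monitor.subscribe(listener)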
data/lib/waterdrop/polling/config.rb (new file)

@@ -0,0 +1,52 @@
+ # frozen_string_literal: true
+
+ # WaterDrop main module
+ module WaterDrop
+   # Namespace for FD-based polling components
+   module Polling
+     # Configuration for the global FD poller singleton
+     # These settings apply to all producers using FD polling mode
+     #
+     # @example Configure before creating any producers
+     #   WaterDrop::Polling::Config.setup do |config|
+     #     config.thread_priority = -1
+     #     config.poll_timeout = 500
+     #   end
+     class Config
+       extend ::Karafka::Core::Configurable
+
+       # Ruby thread priority for the poller thread
+       # Valid range: -3 to 3 (Ruby's thread priority range)
+       # Higher values = higher priority
+       setting :thread_priority, default: 0
+
+       # IO.select timeout in milliseconds
+       # Controls how often periodic polling happens when no FD events occur
+       # Lower values = more responsive OAuth/stats callbacks but higher CPU
+       setting :poll_timeout, default: 1_000
+
+       # Initial backoff delay in milliseconds after a polling error
+       setting :backoff_min, default: 100
+
+       # Maximum backoff delay in milliseconds after repeated errors
+       # Backoff doubles on each consecutive error up to this limit
+       setting :backoff_max, default: 30_000
+
+       class << self
+         # Configures the poller settings
+         # @yield [config] Configuration block
+         # @yieldparam config [Karafka::Core::Configurable::Node] config node
+         def setup
+           configure do |config|
+             yield(config)
+           end
+
+           Contracts::PollerConfig.new.validate!(
+             self.config.to_h,
+             Errors::ConfigurationInvalidError
+           )
+         end
+       end
+     end
+   end
+ end
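
These settings are process-global and validated by Contracts::PollerConfig, so they should be set before the first FD-mode producer starts the poller thread. A minimal sketch using the setting names from the class above; the values are illustrative, not recommendations:

    # Illustrative tuning of the global FD poller (values are examples only)
    WaterDrop::Polling::Config.setup do |config|
      config.thread_priority = -1   # run the poller slightly below regular app threads
      config.poll_timeout = 250     # force a periodic poll every 250 ms even without FD events
      config.backoff_min = 100      # first delay after a polling error
      config.backoff_max = 10_000   # cap for the doubling error backoff
    end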
data/lib/waterdrop/polling/latch.rb (new file)

@@ -0,0 +1,49 @@
+ # frozen_string_literal: true
+
+ module WaterDrop
+   module Polling
+     # A thread-safe latch for synchronizing producer close operations
+     #
+     # When a producer is closed, two threads are involved:
+     # 1. The caller thread (user code calling producer.close)
+     # 2. The poller thread (background thread running IO.select)
+     #
+     # The close sequence:
+     # 1. Caller calls producer.close -> unregister_from_poller -> Poller#unregister
+     # 2. Poller#unregister signals via control pipe and calls state.wait_for_close (blocks on latch)
+     # 3. Poller thread receives control signal, drains queue, calls state.close
+     # 4. state.close releases the latch via release!
+     # 5. Caller's wait_for_close returns, unregister completes
+     #
+     # This ensures the producer is fully drained and removed from the poller
+     # before returning control to the caller, preventing race conditions.
+     class Latch
+       def initialize
+         @mutex = Mutex.new
+         @cv = ConditionVariable.new
+         @released = false
+       end
+
+       # Releases the latch and wakes any waiting threads
+       def release!
+         @mutex.synchronize do
+           @released = true
+           @cv.broadcast
+         end
+       end
+
+       # Waits until the latch is released
+       # Returns immediately if already released
+       def wait
+         @mutex.synchronize do
+           @cv.wait(@mutex) until @released
+         end
+       end
+
+       # @return [Boolean] whether the latch has been released
+       def released?
+         @released
+       end
+     end
+   end
+ end
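
Outside of the producer close path, the latch is a plain two-thread rendezvous. A minimal sketch of the handoff described in the comments above; the threads here are illustrative stand-ins, not WaterDrop internals:

    latch = WaterDrop::Polling::Latch.new

    # Stand-in for the poller thread: finish draining, then release the latch
    worker = Thread.new do
      sleep(0.1) # pretend to drain the event queue
      latch.release!
    end

    latch.wait        # stand-in for the caller blocking in wait_for_close
    latch.released?   # => true once release! has run
    worker.join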
data/lib/waterdrop/polling/poller.rb (new file)

@@ -0,0 +1,415 @@
+ # frozen_string_literal: true
+
+ module WaterDrop
+   # Namespace for FD-based polling components
+   # Contains the global Poller singleton and State class for managing producer polling
+   module Polling
+     # Global poller singleton that manages a single polling thread for all FD-mode producers
+     # This replaces librdkafka's native background polling threads with a single Ruby thread
+     # that uses IO.select for efficient multiplexing
+     #
+     # Spawning one thread per producer is acceptable for 1-2 producers but in case of a system
+     # with several (transactional for example) the cost becomes bigger and bigger.
+     #
+     # This implementation handles things by being event-driven instead of GVL releasing blocking.
+     #
+     # @note Newly registered producers may experience up to 1 second delay before their first
+     #   poll cycle, as the poller thread only rebuilds its IO list when IO.select times out.
+     #   This is acceptable because producers are expected to be long-lived and the initial
+     #   connection overhead to Kafka typically exceeds this delay anyway.
+     class Poller
+       include Singleton
+       include ::Karafka::Core::Helpers::Time
+
+       # Make new public so users can create dedicated poller instances for isolation
+       # The singleton instance remains available via Poller.instance for the default behavior
+       public_class_method :new
+
+       # Mutex for thread-safe ID generation - initialized at class load time
+       # to avoid race conditions with lazy initialization
+       ID_MUTEX = Mutex.new
+
+       # Counter for generating unique poller IDs
+       @id_counter = 0
+
+       class << self
+         # Generates incremental IDs for poller instances (starting from 0)
+         # @return [Integer] next poller ID
+         def next_id
+           ID_MUTEX.synchronize do
+             id = @id_counter
+             @id_counter += 1
+             id
+           end
+         end
+       end
+
+       # @return [Integer] unique identifier for this poller instance
+       attr_reader :id
+
+       def initialize
+         @id = self.class.next_id
+         @mutex = Mutex.new
+         @producers = {}
+         @thread = nil
+         @shutdown = false
+         @pid = Process.pid
+
+         # Cached collections - rebuilt only when producers change
+         @cached_ios = []
+         @cached_io_to_state = {}
+         @cached_states = []
+         @cached_result = nil
+         @ios_dirty = true
+       end
+
+       # Checks if the current thread is the poller thread
+       # Used to detect when close is called from within a callback to avoid deadlock
+       # @return [Boolean] true if current thread is the poller thread
+       def in_poller_thread?
+         Thread.current == @thread
+       end
+
+       # Checks if the poller thread is alive
+       # @return [Boolean] true if the poller thread is running
+       def alive?
+         @thread&.alive? || false
+       end
+
+       # Returns the number of registered producers
+       # @return [Integer] number of producers
+       def count
+         @mutex.synchronize { @producers.size }
+       end
+
+       # Shuts down the poller and resets state
+       # @note This is primarily for testing to reset singleton state between tests
+       def shutdown!
+         @mutex.synchronize { @shutdown = true }
+
+         thread = @thread
+         if thread&.alive?
+           thread.join(5)
+           thread.kill if thread.alive?
+         end
+
+         @mutex.synchronize do
+           @producers.each_value { |state| state.close unless state.closed? }
+           @producers.clear
+           @thread = nil
+           @shutdown = false
+           @ios_dirty = true
+           @cached_ios = []
+           @cached_io_to_state = {}
+           @cached_states = []
+           @cached_result = nil
+           @poll_timeout_s = nil
+         end
+       end
+
+       # Registers a producer with the poller
+       # @param producer [WaterDrop::Producer] the producer instance
+       # @param client [Rdkafka::Producer] the rdkafka client
+       def register(producer, client)
+         ensure_same_process!
+
+         state = State.new(
+           producer.id,
+           client,
+           producer.monitor,
+           producer.config.polling.fd.max_time,
+           producer.config.polling.fd.periodic_poll_interval
+         )
+
+         @mutex.synchronize do
+           @producers[producer.id] = state
+           @ios_dirty = true
+           # Reset shutdown flag in case thread is exiting but hasn't yet
+           # This prevents race where new producer is closed by exiting thread
+           @shutdown = false
+           ensure_thread_running!
+         end
+
+         producer.monitor.instrument(
+           "poller.producer_registered",
+           producer_id: producer.id
+         )
+       end
+
+       # Unregisters a producer from the poller
+       # This method blocks until the producer is fully removed from the poller
+       # to prevent race conditions when disconnect/reconnect happens in quick succession
+       # This matches the threaded polling behavior which drains without timeout
+       # @param producer [WaterDrop::Producer] the producer instance
+       def unregister(producer)
+         state, thread = @mutex.synchronize { [@producers[producer.id], @thread] }
+
+         return unless state
+
+         # Signal the poller thread to handle removal
+         state.signal_close
+
+         # Wait for the state to be fully closed by the poller thread
+         # This prevents race conditions where a new registration with the same
+         # producer_id could be deleted by a pending close signal
+         # Skip waiting if called from within the poller thread itself (e.g., from a callback)
+         # to avoid deadlock - the poller thread can't wait for itself
+         # The cleanup will happen after the callback returns
+         state.wait_for_close unless Thread.current == thread
+
+         producer.monitor.instrument(
+           "poller.producer_unregistered",
+           producer_id: producer.id
+         )
+       end
+
+       private
+
+       # Ensures we're in the same process (for fork safety)
+       def ensure_same_process!
+         return if @pid == Process.pid
+
+         # Reset state after fork - parent's thread and producers are not valid in child
+         @mutex = Mutex.new
+         @producers = {}
+         @thread = nil
+         @shutdown = false
+         @pid = Process.pid
+         @cached_ios = []
+         @cached_io_to_state = {}
+         @cached_states = []
+         @cached_result = nil
+         @ios_dirty = true
+       end
+
+       # Ensures the polling thread is running
+       # Must be called within @mutex.synchronize
+       def ensure_thread_running!
+         return if @thread&.alive?
+
+         @shutdown = false
+         @thread = Thread.new { polling_loop }
+         @thread.name = "waterdrop.poller##{@id}"
+         @thread.priority = Config.config.thread_priority
+       end
+
+       # Main polling loop that runs in a dedicated thread
+       def polling_loop
+         backoff_ms = 0
+
+         loop do
+           break if @shutdown
+
+           # Apply backoff from previous error
+           if backoff_ms > 0
+             sleep(backoff_ms / 1_000.0)
+             backoff_ms = 0
+           end
+
+           # Collect readable IOs (queue FDs)
+           readable_ios, io_to_state = collect_readable_ios
+
+           # Exit when no producers registered
+           # New registrations will start a fresh thread via ensure_thread_running!
+           break if readable_ios.empty?
+
+           poll_with_select(readable_ios, io_to_state)
+         rescue => e
+           # Report error and apply exponential backoff to prevent spam
+           broadcast_error("poller.polling_loop", e)
+           backoff_ms =
+             if backoff_ms.zero?
+               Config.config.backoff_min
+             else
+               [backoff_ms * 2, Config.config.backoff_max].min
+             end
+         end
+       ensure
+         # Clear thread reference first so new registrations will start a fresh thread
+         # This prevents race where register sees old thread as alive during cleanup
+         @mutex.synchronize { @thread = nil }
+
+         # When the poller thread exits (error or clean shutdown), close all remaining states
+         # This releases any latches that might be waiting in unregister calls
+         close_all_states
+       end
+
+       # Broadcasts an error to all registered producers' monitors
+       # @param type [String] error type identifier
+       # @param error [Exception] the error to report
+       def broadcast_error(type, error)
+         @cached_states.each do |state|
+           state.monitor.instrument(
+             "error.occurred",
+             type: type,
+             error: error,
+             producer_id: state.producer_id
+           )
+         end
+       end
+
+       # Collects all IOs to monitor and builds a mapping from IO to State
+       # Uses cached arrays when possible to avoid allocations in the hot path
+       # @return [Array<Array<IO>, Hash{IO => State}, Array<State>>] tuple of ios, io-to-state map, states
+       def collect_readable_ios
+         # Fast path: return cached result if not dirty (no mutex needed)
+         # Safe because @cached_result is frozen and assigned atomically
+         return @cached_result unless @ios_dirty
+
+         @mutex.synchronize do
+           @cached_ios = []
+           @cached_io_to_state = {}
+           @cached_states = []
+
+           @producers.each_value do |state|
+             io = state.io
+             @cached_ios << io
+             @cached_io_to_state[io] = state
+             @cached_states << state
+           end
+
+           @cached_result = [@cached_ios, @cached_io_to_state, @cached_states].freeze
+           @ios_dirty = false
+         end
+
+         @cached_result
+       end
+
+       # Poll producers using IO.select for efficient multiplexing
+       # @param readable_ios [Array<IO>] IOs to monitor
+       # @param io_to_state [Hash{IO => State}] mapping from IO to state
+       def poll_with_select(readable_ios, io_to_state)
+         begin
+           ready = IO.select(readable_ios, nil, nil, poll_timeout_s)
+         rescue IOError, Errno::EBADF
+           # An IO was closed - mark dirty to rebuild on next iteration
+           @ios_dirty = true
+           return
+         end
+
+         if ready.nil?
+           # Timeout: poll ALL producers to ensure OAuth/stats fire
+           poll_all_producers
+         else
+           # FDs ready: handle close signals and poll active producers
+           any_polled = false
+
+           ready[0].each do |io|
+             state = io_to_state[io]
+             next unless state
+
+             # Drain the pipe first (clears librdkafka signals + our signals)
+             state.drain
+
+             # Check if this producer is closing (flag set before signal)
+             if state.closing?
+               handle_close_signal(state)
+             else
+               poll_producer(state)
+               # Check if callback signaled close while we were polling
+               # (e.g., user code closed producer from within delivery callback)
+               if state.closing?
+                 handle_close_signal(state)
+               else
+                 any_polled = true
+               end
+             end
+           end
+
+           # Check for stale producers when actively polling
+           # Skip when single producer (most common case) - no other producers to become stale
+           # (ensures OAuth/stats fire for idle producers when others are busy)
+           poll_stale_producers if any_polled && @cached_states.size > 1
+         end
+       end
+
+       # Polls all registered producers
+       # Called when IO.select times out to ensure periodic polling happens
+       # This ensures OAuth token refresh and statistics callbacks fire for all producers
+       def poll_all_producers
+         @cached_states.each { |state| poll_producer(state) }
+       end
+
+       # Polls producers that haven't been polled recently
+       # Called when processing continue signals to prevent starvation of idle producers
+       # when one producer is very busy
+       # Each State internally throttles the check to avoid excessive overhead
+       def poll_stale_producers
+         @cached_states.each do |state|
+           poll_producer(state) if state.needs_periodic_poll?
+         end
+       end
+
+       # Drains the producer's event queue by polling until empty or time quanta exceeded
+       # @param state [State] the producer state
+       def poll_producer(state)
+         # state.poll returns:
+         # - true when queue is empty (fully drained)
+         # - false when timeout hit (more events may remain)
+         drained = state.poll
+         state.mark_polled!
+
+         # Hit time limit but still have events - signal to continue polling
+         state.signal_continue unless drained
+       rescue Rdkafka::ClosedProducerError
+         # Producer was closed, will be cleaned up
+       end
+
+       # Handles a close signal from a producer
+       # @param state [State] the producer state
+       def handle_close_signal(state)
+         # Drain remaining events before closing
+         # This matches rdkafka's native polling thread behavior: keep polling until outq_len is zero
+         drain_producer_queue(state)
+
+         # Remove producer from registry and clean up
+         # If this was the last producer, signal shutdown to stop the thread immediately
+         @mutex.synchronize do
+           @producers.delete(state.producer_id)
+           @ios_dirty = true
+
+           # Stop thread immediately when last producer unregisters to prevent resource leakage
+           @shutdown = true if @producers.empty?
+         end
+
+         state.close
+       end
+
+       # Closes all remaining producer states
+       # Called when the poller thread exits to release any pending latches
+       # This prevents deadlocks if producers are waiting in unregister
+       def close_all_states
+         states = @mutex.synchronize do
+           to_close = @producers.values.dup
+           @producers.clear
+           @ios_dirty = true
+           to_close
+         end
+
+         states.each do |state|
+           state.close unless state.closed?
+         rescue
+           # Ignore errors during cleanup
+         end
+       end
+
+       # Drains the producer's event queue completely before closing
+       # Matches rdkafka's native polling thread behavior: keep polling until queue is empty
+       # @param state [State] the producer state
+       def drain_producer_queue(state)
+         loop do
+           break if state.queue_empty?
+
+           state.poll
+         end
+       rescue Rdkafka::ClosedProducerError
+         # Producer was already closed, nothing more to drain
+       end
+
+       # @return [Float] poll_timeout converted to seconds (cached)
+       def poll_timeout_s
+         @poll_timeout_s ||= Config.config.poll_timeout / 1_000.0
+       end
+     end
+   end
+ end
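
Most applications never touch the poller directly; producers register and unregister through it when FD polling mode is enabled. A hedged sketch of the public surface shown above, mainly useful in tests or when isolating workloads on a dedicated poller:

    # The process-wide singleton used by default
    shared = WaterDrop::Polling::Poller.instance

    # `new` is intentionally public (see public_class_method :new above) so a
    # dedicated poller with its own thread can be created for isolation
    isolated = WaterDrop::Polling::Poller.new

    shared.alive?   # => false until a producer registers and the thread starts
    shared.count    # => number of currently registered producers
    isolated.id     # => unique incremental id, e.g. 1

    # Primarily for tests: stops the polling thread and closes any remaining states
    isolated.shutdown!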
data/lib/waterdrop/polling/queue_pipe.rb (new file)

@@ -0,0 +1,63 @@
+ # frozen_string_literal: true
+
+ module WaterDrop
+   module Polling
+     # A pipe connected to librdkafka's queue event notification system
+     # When events (delivery reports, statistics) arrive, librdkafka writes to the pipe
+     # allowing IO.select to wake up immediately
+     #
+     # This pipe is also used by WaterDrop to signal:
+     # - Continue: when poll hits time limit but more events remain
+     # - Close: when producer is being closed (combined with @closing flag in State)
+     #
+     # Reusing the same pipe reduces file descriptors and IO.select monitoring overhead
+     class QueuePipe
+       # @return [IO] the readable end of the pipe for use with IO.select
+       attr_reader :reader
+
+       # Creates a new queue pipe and connects it to the client's event queue
+       # @param client [Rdkafka::Producer] the rdkafka client
+       # @raise [StandardError] if enable_queue_io_events fails
+       def initialize(client)
+         @reader, @writer = IO.pipe
+
+         # Tell librdkafka to write to our pipe when events arrive on the main queue
+         client.enable_queue_io_events(@writer.fileno)
+       end
+
+       # Signals by writing a byte to the pipe
+       # Used to wake IO.select for continue/close signals
+       # Thread-safe and non-blocking; silently ignores errors
+       def signal
+         @writer.write_nonblock("W", exception: false)
+       rescue IOError, Errno::EBADF
+         # Pipe closed
+       end
+
+       # Drains all pending bytes from the pipe
+       # Called after IO.select returns to clear the notification
+       # Uses a single large read to drain in one syscall (pipe buffers are typically 64KB)
+       def drain
+         @reader.read_nonblock(1_048_576, exception: false)
+       rescue IOError, Errno::EBADF
+         # Pipe closed during drain
+       end
+
+       # Closes both ends of the pipe
+       def close
+         close_io(@reader)
+         close_io(@writer)
+       end
+
+       private
+
+       # Safely closes an IO object
+       # @param io [IO] the IO to close
+       def close_io(io)
+         io.close
+       rescue IOError, Errno::EBADF
+         # Already closed, ignore
+       end
+     end
+   end
+ end
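
The wake-up pattern QueuePipe builds on can be illustrated without Kafka at all. The sketch below uses plain Ruby pipes to show the same signal → select → drain cycle; it does not touch librdkafka or enable_queue_io_events:

    reader, writer = IO.pipe

    # Stand-in for librdkafka (or QueuePipe#signal) writing a byte when events arrive
    signaler = Thread.new do
      sleep(0.1)
      writer.write_nonblock("W", exception: false)
    end

    # Stand-in for the poller: block until the byte arrives or the timeout elapses
    ready = IO.select([reader], nil, nil, 1.0)

    # Drain the pipe in one read, as QueuePipe#drain does, so select can block again
    reader.read_nonblock(1_048_576, exception: false) if ready

    signaler.join
    reader.close
    writer.close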