protobuf-nats 0.13.0.pre4 → 0.13.0.pre5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0a264426e2c04180f86c1006cd19441e51c36bd5e736cc55252c898e620c2428
4
- data.tar.gz: 861ed68ab3c036afb2ce8fcfff3248f699b7cb4989df945c9113e4a7c3652223
3
+ metadata.gz: bac03c214ceddc05b8ad2ac60e1db25f4179156a78a437f6338f9b1c3404dc8c
4
+ data.tar.gz: d26d42acfd6b9804093a368ebb08ea41df22896fa67f1d2e79f767c6f9aa7f00
5
5
  SHA512:
6
- metadata.gz: 255fa1e7942741ee80f9e0aebbef0f2eed943b7be84ee95d7517dddb9fbfe692bd4dcad7c70a2d05aae953988fd203b8b6bb6b5c7ee295f084f1801f30e019b1
7
- data.tar.gz: 18aa2c4a2a166f769ae080d4cac177de891a26fcd77251d007b2e382927bf468bf7541a86c1a84b0c7f26217bef7104618dcd04faf5698046fff81894ba66f8e
6
+ metadata.gz: b8992b0220a24d05c0c1f0e67f816ea06865817a8ba0d18dcab0ba2ac36a4b561d69a144e97a7ccf3362bd33f6c4ac9b5458777aec85d2846720f71645f6952f
7
+ data.tar.gz: ef9d92af39b7a59c6f7c14a47ccaada1807fcca53e3f54cbf044ff264d73b698aeaad31d51e07ad731ac865aacfb901ef498a7d63cc678d91f615ab2859e7662
@@ -0,0 +1,219 @@
1
+ # Concurrency microbenchmarks for the protobuf-nats hot paths.
2
+ #
3
+ # Drives the REAL ResponseMuxer / Client subscription cache / ThreadPool so the
4
+ # same file measures both the baseline and the patched implementations, on both
5
+ # CRuby and JRuby. No NATS server required (NATS is faked for the muxer bench).
6
+ #
7
+ # Usage:
8
+ # bundle exec ruby -Ilib bench/concurrency_bench.rb
9
+ # BENCH_THREADS=1,4,8,16 BENCH_DURATION=4 BENCH_WARMUP=2 ruby -Ilib bench/concurrency_bench.rb
10
+ #
11
+ # Each cell runs for BENCH_DURATION seconds (after BENCH_WARMUP seconds of
12
+ # untimed warmup) and reports aggregate ops/sec across all threads plus an
13
+ # error count.
14
+
15
+ require "bundler/setup"
16
+ require "protobuf/nats"
17
+ require "nats/client" # for NATS::Msg / NATS::Subscription
18
+
19
+ ::Protobuf::Logging.logger = ::Logger.new(nil)
20
+
21
# Benchmark knobs, all overridable from the environment.

# Seconds of measured run time per table cell.
DURATION = Float(ENV.fetch("BENCH_DURATION", "4"))
# Seconds of untimed warmup before each measured window.
WARMUP = Float(ENV.fetch("BENCH_WARMUP", "2"))
# Worker-thread counts to sweep, one table row per entry. Parsed strictly:
# String#to_i turns unparseable input into 0, which would silently produce a
# meaningless 0-thread row, so a typo in BENCH_THREADS fails fast instead.
THREADS = ENV.fetch("BENCH_THREADS", "1,4,8,16").split(",").map do |entry|
  count = Integer(entry.strip, 10)
  raise ArgumentError, "BENCH_THREADS entries must be >= 1 (got #{count})" unless count.positive?
  count
end.freeze
# Fixed 64-byte request body.
PAYLOAD = ("x" * 64).freeze
25
+
26
# Current reading of the monotonic clock (Process::CLOCK_MONOTONIC).
def mono
  clock_id = ::Process::CLOCK_MONOTONIC
  ::Process.clock_gettime(clock_id)
end
29
+
30
# Drive the given block from `threads` worker threads and tally its results.
#
# The block must return a two-element [ops, errors] pair. Workers wait on an
# atomic "started" flag, then call the block in a loop until the "finished"
# flag is set. A pair is only added to a worker's totals while the
# "measuring" flag is set, so the warmup period is not counted.
#
# @param threads [Integer] number of worker threads to start
# @param seconds [Numeric] length of the measured window, in seconds
# @param warmup [Numeric] length of the untimed warmup, in seconds
# @return [Hash] { ops_per_sec:, errors: }
def run_cell(threads, seconds, warmup)
  started    = ::Concurrent::AtomicBoolean.new(false)
  measuring  = ::Concurrent::AtomicBoolean.new(false)
  finished   = ::Concurrent::AtomicBoolean.new(false)
  per_thread = ::Array.new(threads)

  pool = ::Array.new(threads) do |slot|
    ::Thread.new do
      op_total = 0
      err_total = 0
      sleep 0.0005 until started.true?
      until finished.true?
        done, failed = yield
        next unless measuring.true?

        op_total += done
        err_total += failed
      end
      # Each worker writes only its own slot.
      per_thread[slot] = [op_total, err_total]
    end
  end

  started.make_true
  sleep warmup
  window_start = mono
  measuring.make_true
  sleep seconds
  finished.make_true
  # The window is closed before joining, so time spent waiting for workers to
  # finish their last call is not part of the divisor.
  window = mono - window_start
  pool.each(&:join)

  reported = per_thread.compact
  { ops_per_sec: reported.sum(&:first) / window, errors: reported.sum(&:last) }
end
69
+
70
# Print one result table to standard output.
#
# @param title [String] heading printed above the table
# @param rows [Array] pairs of [thread_count, { ops_per_sec:, errors: }]
def print_table(title, rows)
  # A blank line, the title, then the two header lines.
  puts "", title, " threads | ops/sec | errors", " --------+----------------+--------"
  rows.each do |thread_count, cell|
    throughput, failures = cell.values_at(:ops_per_sec, :errors)
    printf(" %7d | %14.0f | %7d\n", thread_count, throughput, failures)
  end
end
79
+
80
+ # --------------------------------------------------------------------------
81
+ # Fake NATS connection for the muxer round-trip benchmark.
82
+ # new_inbox + subscribe("<prefix>.*") feed the muxer; publish echoes a reply
83
+ # onto the muxer's response subscription queue, simulating the server response.
84
+ # --------------------------------------------------------------------------
85
# In-process stand-in for a NATS connection. It implements only new_inbox,
# subscribe, publish and flush. A publish that carries a reply subject is
# echoed straight onto the queue of the most recent subscription.
class BenchNats
  def initialize
    @resp_queue = nil
    @inbox = 0
  end

  # @return [String] "_INBOX.bench.<n>", with n counting up from 1
  def new_inbox
    "_INBOX.bench.#{@inbox += 1}"
  end

  # Build a subscription backed by a bounded queue (8192 entries) and remember
  # that queue as the echo target for #publish.
  #
  # @return [NATS::Subscription]
  def subscribe(_subject, *_args)
    ::NATS::Subscription.new.tap do |subscription|
      subscription.pending_queue = ::SizedQueue.new(8192)
      @resp_queue = subscription.pending_queue # muxer's response subscription
    end
  end

  # Echo `data` back as a message addressed to `reply_to`. Does nothing when
  # there is no reply subject or nothing has subscribed yet.
  def publish(_subject, data, reply_to = nil)
    return if !reply_to || !@resp_queue

    echoed = ::NATS::Msg.new(:subject => reply_to, :data => data)
    @resp_queue.push(echoed)
  end

  # No-op: there is nothing to flush.
  def flush(*); end
end
110
+
111
+ # --------------------------------------------------------------------------
112
+ # A. ResponseMuxer round-trip (exercises #1 map lock, #2 dispatcher, #4, #6)
113
+ # --------------------------------------------------------------------------
114
# Benchmark A: one ResponseMuxer round trip per iteration
# (new_request -> publish -> next_message -> cleanup), run against a fresh
# BenchNats connection and a fresh muxer for every entry in THREADS.
#
# An iteration counts as an op only when next_message returns a message. A
# nil reply or a raised error counts as an error, so a run where replies stop
# arriving shows up in the errors column rather than as low throughput.
def bench_muxer
  rows = THREADS.map do |thread_count|
    ::Protobuf::Nats.client_nats_connection = BenchNats.new
    muxer = ::Protobuf::Nats::ResponseMuxer.new
    muxer.start

    begin
      result = run_cell(thread_count, DURATION, WARMUP) do
        ops = 0
        errs = 0
        begin
          req = muxer.new_request
          req.publish("rpc.bench", PAYLOAD)
          if req.next_message(5)
            ops += 1
          else
            errs += 1 # no reply: a failed round trip, not a skipped one
          end
        rescue => _e
          errs += 1
        ensure
          # req is nil when new_request itself raised.
          req.cleanup if req
        end
        [ops, errs]
      end
    ensure
      # Stop the muxer even when the cell raises, so later cells start clean.
      muxer.stop
    end

    [thread_count, result]
  end
  print_table("A. ResponseMuxer round-trip (new_request -> publish -> next_message -> cleanup)", rows)
end
141
+
142
+ # --------------------------------------------------------------------------
143
+ # B. Subscription-key cache (exercises #3: nested-Hash ||= vs Concurrent::Map).
144
+ # Drives the real Client#cached_subscription_key. Roughly one iteration in
145
+ # combos.size clears the shared class cache so the threads race to refill it
146
+ # (the cold-start write race that triggers ConcurrentModificationError on JRuby + plain Hash).
147
+ # --------------------------------------------------------------------------
148
# Benchmark B: hammer Client#cached_subscription_key from every thread count
# in THREADS. Each iteration looks up all 100 (service class, method)
# combinations and occasionally clears the shared cache first.
def bench_subscription_cache
  # Named dummy service classes (BenchSvc0..BenchSvc19), defined once on Object.
  service_classes = (0...20).map do |index|
    const_name = "BenchSvc#{index}"
    ::Object.const_set(const_name, Class.new) unless ::Object.const_defined?(const_name)
    ::Object.const_get(const_name)
  end
  rpc_methods = [:create, :read, :update, :delete, :list]
  key_pairs = service_classes.product(rpc_methods)

  shared_cache = ::Protobuf::Nats::Client.subscription_key_cache

  rows = THREADS.map do |thread_count|
    clear_lock = ::Mutex.new
    cell = run_cell(thread_count, DURATION, WARMUP) do
      hits = 0
      failures = 0
      # Clear occasionally to keep the write path hot (cold-fill race).
      clear_lock.synchronize { shared_cache.clear } if rand(key_pairs.size) == 0
      key_pairs.each do |service, rpc|
        begin
          # allocate skips Client#initialize; only @options is populated.
          stub = ::Protobuf::Nats::Client.allocate
          stub.instance_variable_set(:@options, { :service => service, :method => rpc })
          stub.cached_subscription_key
          hits += 1
        rescue => _e
          failures += 1
        end
      end
      [hits, failures]
    end
    [thread_count, cell]
  end
  print_table("B. Subscription-key cache fill+read race (Client#cached_subscription_key)", rows)
end
183
+
184
+ # --------------------------------------------------------------------------
185
+ # C. ThreadPool throughput (exercises #5: mutex counter vs AtomicFixnum)
186
+ # --------------------------------------------------------------------------
187
# Benchmark C: push no-op jobs into a ThreadPool from every thread count in
# THREADS. A rejected push (pool full) is reported in the errors column.
# Pool size comes from BENCH_POOL_WORKERS (default 8).
def bench_thread_pool
  pool_workers = ENV.fetch("BENCH_POOL_WORKERS", "8").to_i
  no_op = -> {}
  rows = THREADS.map do |thread_count|
    pool = ::Protobuf::Nats::ThreadPool.new(pool_workers, :max_queue => 100_000)
    cell = run_cell(thread_count, DURATION, WARMUP) do
      # [ops, errors]: a refused push is backpressure, tracked separately.
      pool.push(&no_op) ? [1, 0] : [0, 1]
    end
    pool.shutdown
    [thread_count, cell]
  end
  print_table("C. ThreadPool push throughput (errors column = pool-full/backpressure)", rows)
end
207
+
208
# Banner describing the run, then the three benchmarks in order.
rule = "=" * 72
puts rule
puts "protobuf-nats concurrency bench"
puts "engine=#{RUBY_ENGINE} #{RUBY_VERSION} duration=#{DURATION}s warmup=#{WARMUP}s threads=#{THREADS.inspect}"
puts "processor_count=#{::Concurrent.processor_count}"
puts rule

bench_muxer
bench_subscription_cache
bench_thread_pool

puts "", "done."
@@ -0,0 +1,78 @@
1
+ # End-to-end throughput benchmark against a real nats-server.
2
+ #
3
+ # Starts a real protobuf-nats Server (warehouse service) in a background thread
4
+ # and drives it with N client threads for a fixed duration, reporting req/sec.
5
+ # Exercises the full patched stack: client ResponseMuxer + dispatchers, server
6
+ # SuperSubscriptionManager + ThreadPool.
7
+ #
8
+ # Requires a nats-server already running on 127.0.0.1:4222.
9
+ #
10
+ # Usage:
11
+ # E2E_DURATION=10 E2E_THREADS=16 bundle exec ruby -Ilib bench/e2e_bench.rb
12
+
13
# Select the NATS client/server implementations before the app is loaded.
ENV["PB_CLIENT_TYPE"] = "protobuf/nats/client"
ENV["PB_SERVER_TYPE"] = "protobuf/nats/runner"
# Skip the server's slow-start ramp so the benchmark starts promptly.
# These two are defaults only: values already present in the environment win.
{
  "PB_NATS_SERVER_SUBSCRIPTIONS_PER_RPC_ENDPOINT" => "1",
  "PB_NATS_SERVER_SLOW_START_DELAY" => "0"
}.each { |name, default| ENV[name] ||= default }

require "./examples/warehouse/app"
require "concurrent"

::Protobuf::Logging.logger = ::Logger.new(nil)

DURATION = Float(ENV.fetch("E2E_DURATION", "10"))
WARMUP = Float(ENV.fetch("E2E_WARMUP", "3"))
THREADS = Integer(ENV.fetch("E2E_THREADS", "16"))
SERVER_THREADS = Integer(ENV.fetch("E2E_SERVER_THREADS", "50"))

# Current reading of the monotonic clock.
def mono
  ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
end

rule = "=" * 72
puts rule
puts "protobuf-nats e2e bench engine=#{RUBY_ENGINE} #{RUBY_VERSION}"
puts "client_threads=#{THREADS} server_threads=#{SERVER_THREADS} duration=#{DURATION}s warmup=#{WARMUP}s"
puts "dispatchers=#{ENV['PB_NATS_RESPONSE_MUXER_DISPATCHERS'] || '(auto)'}"
puts rule

server = ::Protobuf::Nats::Server.new(:threads => SERVER_THREADS)
server_thread = ::Thread.new { server.run }

# Give the server time to connect + subscribe.
sleep 2

completed = ::Concurrent::AtomicFixnum.new(0)
failed    = ::Concurrent::AtomicFixnum.new(0)
measuring = ::Concurrent::AtomicBoolean.new(false)
finished  = ::Concurrent::AtomicBoolean.new(false)

# Each client thread issues create calls back to back until told to stop.
# Results are only counted once the measured window has opened.
clients = ::Array.new(THREADS) do
  ::Thread.new do
    until finished.true?
      begin
        shipment = ::Warehouse::Shipment.new(:guid => ::SecureRandom.uuid, :sleep_time_ms => 0)
        ::Warehouse::ShipmentService.client.create(shipment)
        completed.increment if measuring.true?
      rescue => _e
        failed.increment if measuring.true?
      end
    end
  end
end

sleep WARMUP
window_start = mono
measuring.make_true
sleep DURATION
finished.make_true
# Close the window before joining so the wait for in-flight calls is excluded.
window = mono - window_start
clients.each(&:join)

requests_per_sec = completed.value / window
puts
printf("req/sec = %.0f (completed=%d errors=%d in %.2fs)\n", requests_per_sec, completed.value, failed.value, window)

server.stop
server_thread.join(5)
puts "done."
@@ -1,5 +1,6 @@
1
1
  require 'securerandom'
2
2
  require "connection_pool"
3
+ require "concurrent"
3
4
  require "protobuf/nats"
4
5
  require "protobuf/rpc/connectors/base"
5
6
  require "monitor"
@@ -13,7 +14,14 @@ module Protobuf
13
14
 
14
15
  RESPONSE_MUXER = ::Protobuf::Nats::ResponseMuxer.new
15
16
 
16
- @subscription_key_cache = {}
17
+ # On JRuby (true parallelism) concurrent writes to a plain nested Hash can
18
+ # raise ConcurrentModificationError / corrupt the map, so the cache must be
19
+ # a Concurrent::Map. On CRuby the GVL makes plain-Hash reads/writes atomic
20
+ # (a racing `||=` at worst recomputes an identical value), and a plain Hash
21
+ # is meaningfully faster than Concurrent::Map, so we keep the Hash there.
22
+ CONCURRENT_SUBSCRIPTION_CACHE = (::RUBY_ENGINE == "jruby")
23
+
24
+ @subscription_key_cache = CONCURRENT_SUBSCRIPTION_CACHE ? ::Concurrent::Map.new : {}
17
25
  @subscription_pool_lock = ::Mutex.new
18
26
 
19
27
  # Structure to hold subscription and inbox to use within pool
@@ -208,9 +216,15 @@ module Protobuf
208
216
  klass = @options[:service]
209
217
  method_name = @options[:method]
210
218
 
211
- method_name_cache = self.class.subscription_key_cache[klass] ||= {}
212
- method_name_cache[method_name] ||= begin
213
- ::Protobuf::Nats.subscription_key(klass, method_name)
219
+ cache = self.class.subscription_key_cache
220
+ if CONCURRENT_SUBSCRIPTION_CACHE
221
+ method_name_cache = cache.compute_if_absent(klass) { ::Concurrent::Map.new }
222
+ method_name_cache.compute_if_absent(method_name) do
223
+ ::Protobuf::Nats.subscription_key(klass, method_name)
224
+ end
225
+ else
226
+ method_name_cache = cache[klass] ||= {}
227
+ method_name_cache[method_name] ||= ::Protobuf::Nats.subscription_key(klass, method_name)
214
228
  end
215
229
  end
216
230
 
@@ -3,7 +3,6 @@ require "connection_pool"
3
3
  require "protobuf/nats"
4
4
  require "protobuf/rpc/connectors/base"
5
5
  require "monitor"
6
- require "uuid7"
7
6
  require "protobuf/nats/uuidv7_helper"
8
7
  require "concurrent"
9
8
  require "concurrent/collection/timeout_queue"
@@ -16,11 +15,13 @@ module Protobuf
16
15
  TOKEN_TTL_SECONDS = 600 # 10 minutes
17
16
 
18
17
  def initialize
19
- # Per-token response queues for lock-free message delivery
20
- # Each token gets its own Queue, eliminating lock contention between different tokens
21
- @resp_map = Hash.new { |h,k| h[k] = { } }
18
+ # Per-token response queues for lock-free message delivery. @resp_map is a
19
+ # Concurrent::Map so request threads and dispatcher threads can insert,
20
+ # look up, and delete tokens without serializing on a single mutex (on
21
+ # JRuby it is backed by java.util.concurrent.ConcurrentHashMap). Each
22
+ # value is a Hash { queue:, created_at: }.
23
+ @resp_map = ::Concurrent::Map.new
22
24
  @resp_handlers = []
23
- @map_lock = ::Mutex.new # Lightweight lock only for map structure changes
24
25
  @cleanup_thread = nil
25
26
  @shutdown = false
26
27
  @cleanup_mutex = ::Mutex.new
@@ -32,18 +33,38 @@ module Protobuf
32
33
  ::Protobuf::Logging.logger
33
34
  end
34
35
 
35
- def cleanup(token)
36
- @map_lock.synchronize do
37
- # Close the queue to wake any waiting threads
38
- queue = @resp_map.dig(token, :queue)
39
- queue&.close
40
- @resp_map.delete(token)
36
# Timestamp source for token TTL accounting: a reading of the monotonic clock
# rather than Time.now, so no Time object is built per request and wall-clock
# adjustments do not affect token ages.
def monotonic_now
  clock_id = ::Process::CLOCK_MONOTONIC
  ::Process.clock_gettime(clock_id)
end
41
+
42
# How many dispatcher threads should drain the response subscription.
#
# Resolution order, computed once and memoized in @dispatcher_count:
#   1. PB_NATS_RESPONSE_MUXER_DISPATCHERS, when set (read with to_i, floor 1)
#   2. on JRuby, Concurrent.processor_count (floor 1)
#   3. otherwise 1
#
# @return [Integer] always >= 1
def dispatcher_count
  @dispatcher_count ||=
    if ::ENV.key?("PB_NATS_RESPONSE_MUXER_DISPATCHERS")
      requested = ::ENV["PB_NATS_RESPONSE_MUXER_DISPATCHERS"].to_i
      requested < 1 ? 1 : requested
    elsif ::RUBY_ENGINE == "jruby"
      cores = ::Concurrent.processor_count
      cores < 1 ? 1 : cores
    else
      1
    end
end
43
57
 
58
# Forget `token` and close its response queue, if it has one. The entry is
# removed from @resp_map first and the queue closed afterwards. An unknown
# token is a no-op.
def cleanup(token)
  removed = @resp_map.delete(token)
  return unless removed

  pending = removed[:queue]
  pending.close unless pending.nil?
end
63
+
44
64
  def next_message(token, timeout)
45
- # Get the queue for this token with minimal locking
46
- queue = @map_lock.synchronize { @resp_map.dig(token, :queue) }
65
+ # Lock-free get of the per-token queue.
66
+ entry = @resp_map[token]
67
+ queue = entry && entry[:queue]
47
68
 
48
69
  unless queue
49
70
  logger.warn "Token #{token} not found or already cleaned up during next_message"
@@ -83,22 +104,16 @@ module Protobuf
83
104
  end
84
105
  end
85
106
 
86
- def new_uuidv7
87
- UUID7.generate
88
- end
89
-
90
107
  def new_request
91
108
  # Use UUIDv7 so we can figure out what time a message was originally created in-memory.
92
- token = new_uuidv7 # nats.new_inbox with nuid is not threadsafe.
93
-
94
- @map_lock.synchronize do
95
- # Create a dedicated queue for this token
96
- # TimeoutQueue provides native timeout support for efficient blocking
97
- @resp_map[token] = {
98
- queue: Concurrent::Collection::TimeoutQueue.new,
99
- created_at: Time.now
100
- }
101
- end
109
+ token = UUIDv7Helper.generate # nats.new_inbox with nuid is not threadsafe.
110
+
111
+ # Create a dedicated queue for this token. Concurrent::Map#[]= is atomic,
112
+ # so no surrounding lock is required.
113
+ @resp_map[token] = {
114
+ queue: ::Concurrent::Collection::TimeoutQueue.new,
115
+ created_at: monotonic_now
116
+ }
102
117
 
103
118
  ResponseMuxerRequest.new(self, token)
104
119
  end
@@ -193,117 +208,12 @@ module Protobuf
193
208
  # Start the cleanup thread
194
209
  start_cleanup_thread
195
210
 
211
+ # Top up the dispatcher pool to dispatcher_count. Prunes dead threads
212
+ # first so self-healing restarts converge to the target count instead of
213
+ # multiplying threads.
196
214
  LOCK.synchronize do
197
215
  @resp_handlers.select!(&:alive?)
198
- @resp_handlers << Thread.new do
199
- # Unique thread name for debugging
200
- Thread.current.name = "response-muxer-#{Thread.current.object_id}"
201
- begin
202
- # Reset crash count on successful start
203
- @crash_count = 0
204
-
205
- loop do
206
- begin
207
- # --- Start of per-message block ---
208
- msg = @resp_sub.pending_queue.pop
209
-
210
- # ACK means the message has been picked up and put into the waiting thread_pool
211
- next if msg.nil?
212
-
213
- # Decrease pending size since consumed already
214
- # NOTE: This is outside the lock since it's just updating metrics
215
- @resp_sub.pending_size -= msg.data.size if @resp_sub
216
-
217
- # Validate message subject before processing
218
- unless msg.subject.is_a?(String) && msg.subject.include?('.')
219
- ::ActiveSupport::Notifications.instrument "client.invalid_message.protobuf-nats", 1
220
-
221
- logger.warn "Received message with invalid subject: #{msg.subject}. Dropping."
222
- next
223
- end
224
-
225
- # example(random data):
226
- # _INBOX.{random_data}.{random_data_msg_id}
227
- token = msg.subject.split('.').last
228
-
229
- logger.debug { "token: #{token}, resp_map.keys:#{@resp_map.keys}" } if logger.debug?
230
-
231
- # Get the queue for this token with minimal locking
232
- queue = @map_lock.synchronize do
233
- unless @resp_map.key?(token)
234
- # Try to decode the UUIDv7 timestamp to calculate message age
235
- delay_seconds = UUIDv7Helper.age_in_seconds(token)
236
-
237
- ::ActiveSupport::Notifications.instrument "client.unexpected_message.protobuf-nats", delay_seconds || 1
238
-
239
- if delay_seconds
240
- logger.warn "Received unexpected message (#{delay_seconds.round(3)}s old). MSG.subject=#{msg.subject}. RESP_SUBJ.subject=#{@resp_sub.subject rescue 'unknown'}. Dropping unexpected message."
241
- else
242
- logger.warn "Received unexpected message. MSG.subject=#{msg.subject}. RESP_SUBJ.subject=#{@resp_sub.subject rescue 'unknown'}. Dropping unexpected message."
243
- end
244
- nil
245
- else
246
- @resp_map[token][:queue]
247
- end
248
- end
249
-
250
- # Skip if token wasn't found
251
- next unless queue
252
-
253
- # Push message onto the queue - this is lock-free and thread-safe
254
- # The Queue implementation handles all synchronization internally
255
- begin
256
- # Check queue size to prevent memory bloat
257
- if queue.size >= MAX_RESPONSES_PER_TOKEN
258
- logger.warn "Token #{token} has #{queue.size} queued responses. Possible duplicate messages or slow consumer. Dropping message."
259
- next
260
- end
261
-
262
- queue.push(msg)
263
- rescue ThreadError => e
264
- # Queue was closed (cleanup happened) - this is fine, just drop the message
265
- logger.debug "Queue closed for token #{token}, dropping message"
266
- end
267
-
268
- # --- End of per-message block ---
269
- rescue => per_message_error
270
- # ThreadError is fatal, it means the queue is closed and the loop cannot continue.
271
- raise if per_message_error.is_a?(::ThreadError)
272
-
273
- # Log the error for the specific message, but DON'T kill the thread.
274
- logger.error("ResponseMuxer failed to process a message. Error: #{per_message_error.message}")
275
- ::Protobuf::Nats.notify_error_callbacks(per_message_error)
276
- end
277
- end
278
- rescue => fatal_error
279
- # This block is now only for truly fatal errors that kill the loop itself.
280
- logger.error("ResponseMuxer thread crashed fatally. Error: #{fatal_error.message}")
281
- ::Protobuf::Nats.notify_error_callbacks(fatal_error)
282
-
283
- # --- Self-healing logic ---
284
- @crash_count = (@crash_count || 0) + 1
285
- # Exponential backoff, e.g., 1, 4, 9, 16s... capped at 60s.
286
- sleep_duration = [(@crash_count**2), 60].min
287
- logger.warn("Waiting #{sleep_duration}s before attempting to restart ResponseMuxer.")
288
- sleep sleep_duration
289
- # --- End of self-healing logic ---
290
-
291
- # After sleeping, reset the state and try to start again.
292
- LOCK.synchronize do
293
- if @resp_sub
294
- begin
295
- @resp_sub.unsubscribe
296
- rescue => e
297
- logger.warn "Failed to unsubscribe old response muxer subscription during self-healing: #{e.message}"
298
- ensure
299
- @resp_sub = nil
300
- end
301
- end
302
- @started = false
303
- end
304
- start
305
- end
306
- end
216
+ @resp_handlers << spawn_dispatcher while @resp_handlers.size < dispatcher_count
307
217
  end
308
218
  end
309
219
 
@@ -313,25 +223,29 @@ module Protobuf
313
223
 
314
224
  # Periodic cleanup of stale tokens
315
225
  def cleanup_stale_tokens
316
- cutoff = Time.now - TOKEN_TTL_SECONDS
317
-
318
- @map_lock.synchronize do
319
- stale_count = 0
320
- @resp_map.delete_if do |token, data|
321
- if data[:created_at] && data[:created_at] < cutoff
322
- stale_count += 1
323
- logger.warn "Cleaning up stale token #{token} created at #{data[:created_at]}"
324
- # Close the queue to wake any waiting threads
325
- data[:queue]&.close
326
- true
327
- else
328
- false
329
- end
330
- end
226
+ cutoff = monotonic_now - TOKEN_TTL_SECONDS
227
+
228
+ # Collect stale tokens first, then delete. Concurrent::Map iteration does
229
+ # not hold a global lock, so request threads are never blocked across this
230
+ # O(n) scan (unlike the previous single-mutex implementation).
231
+ stale_tokens = []
232
+ @resp_map.each_pair do |token, data|
233
+ created_at = data[:created_at]
234
+ stale_tokens << token if created_at && created_at < cutoff
235
+ end
331
236
 
332
- if stale_count > 0
333
- ::ActiveSupport::Notifications.instrument "response_muxer.stale_tokens_cleaned.protobuf-nats", stale_count
334
- end
237
+ stale_count = 0
238
+ stale_tokens.each do |token|
239
+ data = @resp_map.delete(token)
240
+ next unless data
241
+ stale_count += 1
242
+ logger.warn "Cleaning up stale token #{token} created at #{data[:created_at]}"
243
+ # Close the queue to wake any waiting threads
244
+ data[:queue]&.close
245
+ end
246
+
247
+ if stale_count > 0
248
+ ::ActiveSupport::Notifications.instrument "response_muxer.stale_tokens_cleaned.protobuf-nats", stale_count
335
249
  end
336
250
  end
337
251
 
@@ -360,6 +274,124 @@ module Protobuf
360
274
  !!@started
361
275
  end
362
276
 
277
# Spawn one dispatcher thread that runs run_dispatch_loop until it fails.
#
# When the loop raises, the thread logs and reports the error, backs off,
# tears down the current response subscription, marks the muxer as not
# started, and calls `start` so the pool is rebuilt.
#
# The failing thread removes itself from @resp_handlers before calling
# `start`. It is still alive at that point, so pruning with `alive?` would
# keep it, the pool would look full, and no replacement would be spawned.
#
# NOTE(review): @crash_count is reset to 0 by every dispatcher as it starts,
# so the backoff below restarts from 1s after each respawn. Confirm whether
# escalation across restarts is intended.
#
# @return [Thread] the new dispatcher thread
def spawn_dispatcher
  Thread.new do
    # Unique thread name for debugging
    Thread.current.name = "response-muxer-#{Thread.current.object_id}"
    begin
      # Reset crash count on successful start
      @crash_count = 0
      run_dispatch_loop
    rescue => fatal_error
      # Only errors that escape run_dispatch_loop reach here.
      logger.error("ResponseMuxer thread crashed fatally. Error: #{fatal_error.message}")
      ::Protobuf::Nats.notify_error_callbacks(fatal_error)

      # --- Self-healing logic ---
      @crash_count = (@crash_count || 0) + 1
      # Quadratic backoff: 1, 4, 9, 16s... capped at 60s.
      sleep_duration = [(@crash_count**2), 60].min
      logger.warn("Waiting #{sleep_duration}s before attempting to restart ResponseMuxer.")
      sleep sleep_duration
      # --- End of self-healing logic ---

      # After sleeping, reset the state and try to start again.
      LOCK.synchronize do
        # This thread is about to exit; drop it so `start` spawns a replacement.
        @resp_handlers.delete(::Thread.current)

        if @resp_sub
          begin
            @resp_sub.unsubscribe
          rescue => e
            logger.warn "Failed to unsubscribe old response muxer subscription during self-healing: #{e.message}"
          ensure
            @resp_sub = nil
          end
        end
        @started = false
      end
      start
    end
  end
end
319
+
320
# Pop messages off the response subscription's pending queue and hand each
# one to dispatch_message, forever.
#
# A failure while handling one message is logged and reported but does not
# end the loop. A ThreadError is re-raised so the caller can treat it as
# fatal.
#
# NOTE(review): Queue#pop on a closed, empty queue returns nil rather than
# raising. Confirm that a ThreadError is what actually signals a closed
# pending queue here; otherwise nil pops would make this loop spin.
def run_dispatch_loop
  loop do
    begin
      message = @resp_sub.pending_queue.pop

      # A nil pop carries no message; go round again.
      unless message.nil?
        # Advisory metric only; with several dispatchers this read-modify-write
        # on the subscription's counter can lose updates.
        @resp_sub.pending_size -= message.data.size if @resp_sub

        dispatch_message(message)
      end
    rescue ::ThreadError
      # Fatal for this loop: let the caller handle it.
      raise
    rescue => failure
      # Log the error for the specific message, but DON'T kill the thread.
      logger.error("ResponseMuxer failed to process a message. Error: #{failure.message}")
      ::Protobuf::Nats.notify_error_callbacks(failure)
    end
  end
end
346
+
347
# Route one response message to the queue of the request that is waiting for it.
#
# The token is the last dot-separated segment of the message subject. The
# message is dropped (with a log line) when:
#   * the subject is not a String containing a '.',
#   * no queue is registered for the token,
#   * the token's queue already holds MAX_RESPONSES_PER_TOKEN messages, or
#   * the queue was closed between the lookup and the push.
#
# @param msg [#subject] the received message
# @return [void]
def dispatch_message(msg)
  # Validate message subject before processing
  unless msg.subject.is_a?(String) && msg.subject.include?('.')
    ::ActiveSupport::Notifications.instrument "client.invalid_message.protobuf-nats", 1

    logger.warn "Received message with invalid subject: #{msg.subject}. Dropping."
    return
  end

  # example(random data):
  # _INBOX.{random_data}.{random_data_msg_id}
  token = msg.subject.split('.').last

  logger.debug { "token: #{token}, resp_map.keys:#{@resp_map.keys}" } if logger.debug?

  # Look up the per-token queue.
  entry = @resp_map[token]
  queue = entry && entry[:queue]

  unless queue
    # Try to decode the UUIDv7 timestamp to calculate message age
    delay_seconds = UUIDv7Helper.age_in_seconds(token)

    ::ActiveSupport::Notifications.instrument "client.unexpected_message.protobuf-nats", delay_seconds || 1

    if delay_seconds
      logger.warn "Received unexpected message (#{delay_seconds.round(3)}s old). MSG.subject=#{msg.subject}. RESP_SUBJ.subject=#{@resp_sub.subject rescue 'unknown'}. Dropping unexpected message."
    else
      logger.warn "Received unexpected message. MSG.subject=#{msg.subject}. RESP_SUBJ.subject=#{@resp_sub.subject rescue 'unknown'}. Dropping unexpected message."
    end
    return
  end

  begin
    # Check queue size to prevent memory bloat
    if queue.size >= MAX_RESPONSES_PER_TOKEN
      logger.warn "Token #{token} has #{queue.size} queued responses. Possible duplicate messages or slow consumer. Dropping message."
      return
    end

    queue.push(msg)
  rescue ::ClosedQueueError, ::ThreadError
    # Queue was closed (cleanup happened) - this is fine, just drop the message.
    # A closed Queue raises ClosedQueueError on push, which is not a
    # ThreadError, so it must be named here or this benign race would be
    # reported as a message-processing error by the caller.
    logger.debug "Queue closed for token #{token}, dropping message"
  end
end
394
+
363
395
  def start_cleanup_thread
364
396
  # Only start if not already running
365
397
  return if @cleanup_thread&.alive?
@@ -1,10 +1,15 @@
1
+ require "concurrent"
2
+
1
3
  module Protobuf
2
4
  module Nats
3
5
  class ThreadPool
4
6
 
5
7
  def initialize(size, opts = {})
6
8
  @queue = ::Queue.new
7
- @active_work = 0
9
+ # Lock-free counter of in-flight work. Replaces a mutex-guarded integer so
10
+ # that N workers running in true parallel (JRuby) don't serialize on every
11
+ # task completion.
12
+ @active_work = ::Concurrent::AtomicFixnum.new(0)
8
13
 
9
14
  # Callbacks
10
15
  @error_cb = lambda do |error|
@@ -14,14 +19,14 @@ module Protobuf
14
19
  end
15
20
 
16
21
  # Synchronization
17
- @mutex = ::Mutex.new
22
+ @mutex = ::Mutex.new # guards the @workers array only
18
23
  @cb_mutex = ::Mutex.new
19
24
 
20
25
  # Let's get this party started
21
26
  queue_size = opts[:max_queue].to_i || 0
22
27
  @max_size = size + queue_size
23
28
  @max_workers = size
24
- @shutting_down = false
29
+ @shutting_down = ::Concurrent::AtomicBoolean.new(false)
25
30
  @workers = []
26
31
  supervise_workers
27
32
  end
@@ -32,7 +37,7 @@ module Protobuf
32
37
 
33
38
  # Thread-safe access to check if the pool is full.
34
39
  def full?
35
- @mutex.synchronize { @active_work >= @max_size }
40
+ @active_work.value >= @max_size
36
41
  end
37
42
 
38
43
  def max_size
@@ -41,33 +46,33 @@ module Protobuf
41
46
 
42
47
  # This method is now thread-safe.
43
48
  def push(&work_cb)
44
- @mutex.synchronize do
45
- # Re-check conditions inside the lock to guarantee safety.
46
- return false if @active_work >= @max_size
47
- return false if @shutting_down
48
-
49
- @queue << [:work, work_cb]
50
- @active_work += 1
49
+ return false if @shutting_down.true?
50
+
51
+ # Optimistically claim a slot; back off if we exceeded the cap. This admits
52
+ # work only while active_work < max_size, matching the original guard, but
53
+ # without holding a mutex across the enqueue.
54
+ if @active_work.increment > @max_size
55
+ @active_work.decrement
56
+ return false
51
57
  end
52
58
 
53
- # Supervise outside the lock to avoid holding it during thread creation.
59
+ @queue << [:work, work_cb]
60
+
61
+ # Supervise outside any lock-held section to avoid holding it during thread creation.
54
62
  supervise_workers
55
63
  true
56
64
  end
57
65
 
58
66
  # This method is now thread-safe.
59
67
  def shutdown
60
- @mutex.synchronize do
61
- return if @shutting_down # Prevent sending stop messages multiple times
62
- @shutting_down = true
63
- end
68
+ # CAS ensures the poison pills are pushed exactly once.
69
+ return unless @shutting_down.make_true
64
70
 
65
- # Pushing poison pills can happen outside the lock.
66
71
  @max_workers.times { @queue << [:stop, nil] }
67
72
  end
68
73
 
69
74
  def kill
70
- @shutting_down = true
75
+ @shutting_down.make_true
71
76
  @workers.map(&:kill)
72
77
  end
73
78
 
@@ -88,7 +93,7 @@ module Protobuf
88
93
 
89
94
  # Thread-safe access to the current active work size.
90
95
  def size
91
- @mutex.synchronize { @active_work }
96
+ @active_work.value
92
97
  end
93
98
 
94
99
  private
@@ -98,7 +103,7 @@ module Protobuf
98
103
  end
99
104
 
100
105
  def prune_dead_workers
101
- # This must be called inside a mutex block.
106
+ # This must be called inside @mutex.
102
107
  @workers = @workers.select(&:alive?)
103
108
  end
104
109
 
@@ -126,7 +131,7 @@ module Protobuf
126
131
  rescue => error
127
132
  @cb_mutex.synchronize { @error_cb.call(error) }
128
133
  ensure
129
- @mutex.synchronize { @active_work -= 1 }
134
+ @active_work.decrement
130
135
  end
131
136
  end
132
137
  end
@@ -1,6 +1,29 @@
1
1
  module Protobuf
2
2
  module Nats
3
3
  class UUIDv7Helper
4
# Generate a UUIDv7-shaped string without a CSPRNG.
#
# The first 48 bits are the wall-clock time in milliseconds; the remaining
# 74 bits come from a non-cryptographic Random kept in a thread variable, one
# generator per thread. The version nibble is 7 and the variant bits are 10.
#
# The generator is stored with thread_variable_get/set rather than
# Thread#[], because Thread#[] is fiber-local: with it, every new Fiber
# would build and seed its own Random instead of reusing the thread's.
#
# These values are predictable and must not be used where an unguessable
# identifier is required.
#
# @return [String] a UUIDv7 string (e.g. "01234567-89ab-7def-8123-456789abcdef")
def self.generate
  unix_ts_ms = ::Process.clock_gettime(::Process::CLOCK_REALTIME, :millisecond) & 0xffffffffffff

  thread = ::Thread.current
  rng = thread.thread_variable_get(:pb_nats_uuid_rng)
  unless rng
    rng = ::Random.new
    thread.thread_variable_set(:pb_nats_uuid_rng, rng)
  end

  format(
    "%08x-%04x-%04x-%04x-%04x%08x",
    (unix_ts_ms >> 16) & 0xffffffff, # 32 high bits of the ms timestamp
    unix_ts_ms & 0xffff,             # 16 low bits of the ms timestamp
    (0x7000 | rng.rand(0x1000)),     # version 7 + 12 random bits
    (0x8000 | rng.rand(0x4000)),     # variant bits 10 + 14 random bits
    rng.rand(0x10000),               # 16 random bits
    rng.rand(0x100000000)            # 32 random bits
  )
end
26
+
4
27
  # Extract the Unix timestamp (in seconds) from a UUIDv7 string
5
28
  # Returns nil if the UUID cannot be parsed
6
29
  #
@@ -1,5 +1,5 @@
1
1
  module Protobuf
2
2
  module Nats
3
- VERSION = "0.13.0.pre4"
3
+ VERSION = "0.13.0.pre5"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protobuf-nats
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.0.pre4
4
+ version: 0.13.0.pre5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brandon Dewitt
@@ -200,7 +200,9 @@ files:
200
200
  - Rakefile
201
201
  - bench/bench.md
202
202
  - bench/client.rb
203
+ - bench/concurrency_bench.rb
203
204
  - bench/console.rb
205
+ - bench/e2e_bench.rb
204
206
  - bench/real_client.rb
205
207
  - bench/real_client.sh
206
208
  - bench/real_client_threaded.rb