quonfig 0.0.15 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,300 +2,611 @@
2
2
 
3
3
  require 'base64'
4
4
  require 'json'
5
+ require 'net/http'
6
+ require 'uri'
5
7
 
6
8
  module Quonfig
9
+ # Event delivered to on_envelope. +id+ mirrors the SSE +id:+ field and is
10
+ # consumed by callers that want the server cursor (tests + last-event-id
11
+ # resume). +data+ is the raw +data:+ payload string. +envelope+ is the
12
+ # parsed Quonfig::ConfigEnvelope.
13
+ StreamEvent = Struct.new(:envelope, :id, :data)
14
+
15
+ # SSE client for real-time config delivery from api-delivery-sse.
16
+ #
17
+ # Owns its reconnect loop end-to-end. sdk-go, sdk-python, and sdk-node all
18
+ # reached the same conclusion: the wire format we consume (plain JSON
19
+ # envelopes in single-line +data:+ frames, no named events, no retry
20
+ # directives) is simple enough that an SDK-owned loop is clearer than a
21
+ # library wrapper, and the operator-facing reconnect counter becomes
22
+ # trivially correct because there is exactly one place that increments it
23
+ # (qfg-35sm; replaces the ld-eventsource integration from qfg-ie49 +
24
+ # qfg-cf52, which required log-line scraping and a raise-proof logger
25
+ # wrapper to observe reconnects through the upstream library).
7
26
  class SSEConfigClient
8
- # ld-eventsource auto-reconnects on a clean socket EOF (server FIN)
9
- # *internally* — it never calls +on_error+ for that case, only for
10
- # ECONNREFUSED-style failures (qfg-ie49; see chaos scenario 09). The one
11
- # signal it emits for any reconnect is an info-level
12
- # "Will retry connection after ..." line, logged once per reconnect attempt
13
- # and never on the first connect. Wrapping the logger we hand to
14
- # SSE::Client lets the SDK observe those internal reconnects without
15
- # touching the data path. This is the only reconnect hook ld-eventsource
16
- # >= 2.0 exposes.
17
- class ReconnectCountingLogger
18
- RECONNECT_SIGNAL = 'Will retry connection after'
19
-
20
- LEVELS = %i[trace debug info warn error fatal].freeze
21
-
22
- def initialize(wrapped, &on_reconnect)
23
- @wrapped = wrapped
24
- @on_reconnect = on_reconnect
25
- end
26
-
27
- # Crash-safe by construction: ld-eventsource calls this logger from
28
- # inside its bare-Thread +run_stream+ loop, and several of those call
29
- # sites (+connect+, +log_and_dispatch_error+, query-param building) are
30
- # NOT wrapped in a rescue. Any exception that escapes a logger call kills
31
- # the worker thread with +@stopped+ still false, so +closed?+ never flips
32
- # true and the SDK's @retry_thread never reconnects — the SSE stream is
33
- # silently wedged forever (qfg-cf52, the chaos scenario 05 flake). Every
34
- # step here is therefore independently guarded: a throwing message block,
35
- # a throwing on_reconnect callback, or a throwing wrapped logger can
36
- # never propagate out of this method.
37
- LEVELS.each do |level|
38
- define_method(level) do |message = nil, &block|
39
- begin
40
- message = block.call if message.nil? && block
41
- rescue StandardError
42
- message = nil
43
- end
44
-
45
- if level == :info && message.to_s.include?(RECONNECT_SIGNAL)
46
- begin
47
- @on_reconnect.call
48
- rescue StandardError
49
- nil
50
- end
51
- end
52
-
53
- begin
54
- @wrapped.public_send(level, message) if @wrapped.respond_to?(level)
55
- rescue StandardError
56
- nil
57
- end
58
- end
59
- end
60
-
61
- def level
62
- @wrapped&.level
63
- end
64
-
65
- def level=(new_level)
66
- @wrapped.level = new_level if @wrapped.respond_to?(:level=)
67
- end
68
- end
69
-
70
27
  class Options
71
- attr_reader :sse_read_timeout, :seconds_between_new_connection,
72
- :sse_default_reconnect_time, :sleep_delay_for_new_connection_check,
73
- :errors_to_close_connection, :sse_reconnect_reset_interval
28
+ attr_reader :sse_read_timeout, :sse_connect_timeout,
29
+ :sse_initial_reconnect_delay, :sse_max_reconnect_delay
74
30
 
75
31
  # sse_read_timeout: 90s = 3x the 30s server heartbeat. A silent socket
76
- # stall trips the read deadline within one missed-heartbeat window
77
- # rather than the previous 5-minute idle. See plan
78
- # `project/plans/sdk-hardening-and-verification.md` Layer 1.
32
+ # stall trips within one missed-heartbeat window rather than the OS
33
+ # TCP idle (often hours).
79
34
  #
80
- # sse_reconnect_reset_interval: 1s (ld-eventsource default is 60s). The
81
- # ld-eventsource backoff only resets to the base interval once a
82
- # connection has stayed up this long; until then each reconnect doubles
83
- # the delay (1s, 2s, 4s, 8s...). With the 60s default, a flapping
84
- # connection (chaos scenario 09 proxy killed every 6s) backs off so
85
- # fast the SDK is mid-sleep when the next kill lands and never observes
86
- # it. Resetting after 1s of healthy connection mirrors sdk-python, which
87
- # resets its backoff on every successful connect (sdk-python/quonfig/
88
- # sse.py). A *sustained* outage still backs off exponentially: no
89
- # connection succeeds, so `mark_success` is never called and the reset
90
- # never triggers (qfg-ie49).
35
+ # sse_initial_reconnect_delay / sse_max_reconnect_delay: backoff bounds.
36
+ # Each failed reconnect doubles the delay (jittered to 50-100% of it) up to the
37
+ # max. A successful event delivery resets the delay to the initial
38
+ # value — matches sdk-python's policy. A clean server-initiated FIN is
39
+ # treated as "not a failure for backoff purposes" because LBs recycling
40
+ # connections is normal; the reconnect counter still increments.
91
41
  def initialize(sse_read_timeout: 90,
92
- seconds_between_new_connection: 5,
93
- sleep_delay_for_new_connection_check: 1,
94
- sse_default_reconnect_time: SSE::Client::DEFAULT_RECONNECT_TIME,
95
- sse_reconnect_reset_interval: 1,
96
- errors_to_close_connection: [HTTP::ConnectionError])
42
+ sse_connect_timeout: 10,
43
+ sse_initial_reconnect_delay: 1.0,
44
+ sse_max_reconnect_delay: 30.0)
97
45
  @sse_read_timeout = sse_read_timeout
98
- @seconds_between_new_connection = seconds_between_new_connection
99
- @sse_default_reconnect_time = sse_default_reconnect_time
100
- @sse_reconnect_reset_interval = sse_reconnect_reset_interval
101
- @sleep_delay_for_new_connection_check = sleep_delay_for_new_connection_check
102
- @errors_to_close_connection = errors_to_close_connection
46
+ @sse_connect_timeout = sse_connect_timeout
47
+ @sse_initial_reconnect_delay = sse_initial_reconnect_delay.to_f
48
+ @sse_max_reconnect_delay = sse_max_reconnect_delay.to_f
103
49
  end
104
50
  end
105
51
 
106
52
  LOG = Quonfig::InternalLogger.new(self)
107
53
 
54
+ # qfg-i5xv: HTTP status codes the SDK classifies as terminal — these will
55
+ # not heal by retrying (bad key, revoked permission, missing endpoint).
56
+ # Anything else (5xx, 429, network errors) stays on the transient path.
57
+ TERMINAL_HTTP_CODES = [401, 403, 404].freeze
58
+
108
59
  # +on_error+: optional callable invoked on every SSE error edge. Parent
109
60
  # Quonfig::Client wires this to drive @sse_state -> :error so that
110
- # +connection_state+ reflects the disconnect (qfg-47c2.27). Without it
111
- # the SDK's public health primitive would lie about its own state during
112
- # a mid-run socket drop.
61
+ # +connection_state+ reflects the disconnect (qfg-47c2.27).
113
62
  def initialize(prefab_options, config_loader, options = nil, logger = nil, on_error: nil)
114
63
  @prefab_options = prefab_options
115
64
  @options = options || Options.new
116
65
  @config_loader = config_loader
117
- @connected = false
118
66
  @logger = logger || LOG
119
67
  @on_error = on_error
68
+
69
+ @stopped = Concurrent::AtomicBoolean.new(false)
120
70
  @restart_total = 0
121
71
  @restart_mutex = Mutex.new
72
+
73
+ @on_envelope_error_total = 0
74
+ @on_envelope_error_mutex = Mutex.new
75
+
76
+ @conn_mutex = Mutex.new
77
+ @active_http = nil
78
+
79
+ @source_index = -1
80
+ @last_event_id = nil
122
81
  end
123
82
 
124
- # qfg-ll6r / qfg-ie49: Layer 1 (SSE) restart counter counts every
125
- # *reconnect*, from two sources:
126
- # 1. ld-eventsource's own internal reconnect (clean FIN, read timeout,
127
- # transient errors it doesn't surface) observed via the
128
- # ReconnectCountingLogger "Will retry connection after" signal.
129
- # 2. SDK-driven reconnects in @retry_thread, after a closing error
130
- # (HTTP::ConnectionError) made us close the SSE::Client outright.
131
- # These two are mutually exclusive per disconnect, so there is no
132
- # double-count. on_error is deliberately NOT a source — ld-eventsource
133
- # reconnects internally after most non-closing errors, so counting the
134
- # error edge AND the reconnect would double up (qfg-ie49).
135
- #
136
- # The chaos harness pulls this via Client#worker_restart_total(layer: '1')
137
- # so kill-storm scenarios (e.g. scenario 09 — proxy killed 5x in 30s) can
138
- # assert restart_total >= 5 even when the kills produce clean FINs that
139
- # never reach on_error.
83
+ # Layer 1 (SSE) reconnect counter. Bumped exactly once per reconnect
84
+ # attempt — never per error edge, never per envelope. Read by
85
+ # Quonfig::Client#worker_restart_total(layer: '1') and asserted by chaos
86
+ # scenario 09 (>= 5 after 5 proxy flaps in 30s).
140
87
  def restart_total
141
88
  @restart_mutex.synchronize { @restart_total }
142
89
  end
143
90
 
144
- # Bump the Layer 1 reconnect counter. Called from the ld-eventsource
145
- # worker thread (via ReconnectCountingLogger) and from @retry_thread.
146
- def count_restart!
147
- @restart_mutex.synchronize { @restart_total += 1 }
91
+ # qfg-m3lk: count of user-supplied on_envelope callback invocations that
92
+ # raised. Surfaced for operator visibility — a non-zero value here with
93
+ # restart_total stable means a caller-side listener bug, not a transport
94
+ # problem. (Pre-fix, those raises propagated into run_loop's rescue and
95
+ # masqueraded as transport errors, causing reconnect storms.)
96
+ def on_envelope_error_total
97
+ @on_envelope_error_mutex.synchronize { @on_envelope_error_total }
148
98
  end
149
99
 
150
- def close
151
- @retry_thread&.kill
152
- @client&.close
100
+ def start(&on_envelope)
101
+ return if @prefab_options.sse_api_urls.nil? || @prefab_options.sse_api_urls.empty?
102
+
103
+ @worker = Thread.new { run_loop(&on_envelope) }
153
104
  end
154
105
 
155
- def start(&load_configs)
156
- if @prefab_options.sse_api_urls.empty?
157
- @logger.debug 'No SSE api_urls configured'
158
- return
106
+ # Shut down. Interrupts the in-flight stream by closing the underlying
107
+ # socket from this thread — the worker thread observes the resulting
108
+ # IOError, sees @stopped == true, and exits cleanly.
109
+ def close
110
+ @stopped.make_true
111
+ @conn_mutex.synchronize do
112
+ begin
113
+ @active_http&.finish
114
+ rescue StandardError
115
+ # already closed / never started — idempotent
116
+ end
117
+ @active_http = nil
159
118
  end
119
+ @worker&.join(2)
120
+ @worker = nil
121
+ end
122
+
123
+ # Public so tests can assert the headers shape. Body of the request is
124
+ # always empty; this is the full set api-delivery-sse sees.
125
+ def headers
126
+ auth = "1:#{@prefab_options.sdk_key}"
127
+ auth_string = Base64.strict_encode64(auth)
128
+ h = {
129
+ 'Authorization' => "Basic #{auth_string}",
130
+ 'Accept' => 'text/event-stream',
131
+ 'Cache-Control' => 'no-cache',
132
+ 'X-Quonfig-SDK-Version' => "ruby-#{Quonfig::VERSION}"
133
+ }
134
+ cursor = current_cursor
135
+ h['Last-Event-Id'] = cursor if cursor
136
+ h
137
+ end
160
138
 
161
- @client = connect(&load_configs)
139
+ # Compute a Last-Event-ID for the next request. Three sources, in
140
+ # priority order:
141
+ # 1. @last_event_id -- set by the most recent event we processed
142
+ # 2. config_loader.version -- string ETag from last HTTP fetch
143
+ # 3. config_loader.highwater_mark -- legacy numeric cursor
144
+ # Returns nil if no prior state exists.
145
+ def current_cursor
146
+ return @last_event_id if @last_event_id && !@last_event_id.empty?
162
147
 
163
- closed_count = 0
148
+ if @config_loader.respond_to?(:version)
149
+ v = @config_loader.version
150
+ return v if v.is_a?(String) && !v.empty?
151
+ end
164
152
 
165
- @retry_thread = Thread.new do
166
- loop do
167
- sleep @options.sleep_delay_for_new_connection_check
153
+ if @config_loader.respond_to?(:highwater_mark)
154
+ hw = @config_loader.highwater_mark
155
+ return hw.to_s if hw.is_a?(Numeric) && hw.positive?
156
+ return hw if hw.is_a?(String) && !hw.empty?
157
+ end
168
158
 
169
- next unless @client.closed?
159
+ nil
160
+ end
170
161
 
171
- closed_count += @options.sleep_delay_for_new_connection_check
162
+ private
172
163
 
173
- next unless closed_count > @options.seconds_between_new_connection
164
+ # Long-lived reconnect loop. One iteration = one connect attempt. Bumps
165
+ # restart_total *before* every retry — so the counter answers "how many
166
+ # times have we reconnected after a drop" rather than "how many connect
167
+ # attempts have occurred." The first attempt is not a restart.
168
+ #
169
+ # qfg-tj18: the body is wrapped in
170
+ # +Thread.handle_interrupt(SSEReadDeadlineExceeded => :on_blocking)+ so a
171
+ # watchdog raise that's already been queued (the watchdog's mutex covers
172
+ # the *decision* to fire but cannot un-queue a delivered raise) lands
173
+ # only at a blocking-IO checkpoint. Inside stream_once we explicitly
174
+ # re-enable +:immediate+ around the +read_body+ block where we *do*
175
+ # want the raise to wake the read. A per-iteration paranoid rescue
176
+ # catches any late-landing raise that escapes the inner +rescue
177
+ # StandardError+ (e.g. lands inside +interruptible_sleep+ between
178
+ # iterations) so the worker thread never silently dies.
179
+ def run_loop(&on_envelope)
180
+ Thread.handle_interrupt(SSEReadDeadlineExceeded => :on_blocking) do
181
+ delay = @options.sse_initial_reconnect_delay
182
+ first_attempt = true
183
+
184
+ until @stopped.value
185
+ begin
186
+ unless first_attempt
187
+ increment_restart!
188
+ interruptible_sleep(jittered(delay))
189
+ break if @stopped.value
190
+ end
191
+ first_attempt = false
174
192
 
175
- closed_count = 0
176
- @logger.debug 'Reconnecting SSE client'
177
- # SDK-driven reconnect: a closing error (HTTP::ConnectionError)
178
- # closed the previous SSE::Client, so ld-eventsource's own
179
- # reconnect loop has exited and won't emit the "Will retry" signal.
180
- # Count it here instead (qfg-ie49).
181
- count_restart!
182
- @client = connect(&load_configs)
193
+ connected_at_least_once = false
194
+ begin
195
+ stream_once do |event|
196
+ connected_at_least_once = true
197
+ # Persist the most recent id so the next reconnect resumes
198
+ # from there via Last-Event-Id. Updated *before* the user
199
+ # callback runs so a raising listener still advances the
200
+ # cursor — the event was delivered to us, the bug is on the
201
+ # caller side.
202
+ @last_event_id = event.id if event.id
203
+ # qfg-m3lk: callback exceptions are isolated. A buggy
204
+ # listener must not look like a transport error and trigger
205
+ # a reconnect.
206
+ invoke_on_envelope_safely(on_envelope, event)
207
+ # A connection healthy enough to deliver a real envelope
208
+ # earns a reset of the backoff. Sustained outages never
209
+ # reach this branch (no event ever delivered) so the
210
+ # exponential growth still holds.
211
+ delay = @options.sse_initial_reconnect_delay
212
+ end
213
+ rescue StandardError => e
214
+ handle_error(e) unless @stopped.value
215
+ end
216
+
217
+ # Backoff only grows on failed connect attempts. A server-
218
+ # initiated clean FIN after a healthy session (normal LB
219
+ # recycling) reuses the same delay — punishing it would make
220
+ # us look broken under benign rolling restarts. Matches
221
+ # sdk-go's `connectedOK` distinction.
222
+ delay = [delay * 2, @options.sse_max_reconnect_delay].min unless connected_at_least_once
223
+ rescue SSEReadDeadlineExceeded => e
224
+ # Paranoid backstop (qfg-tj18). A watchdog raise that landed
225
+ # outside +stream_once+ — typically in +interruptible_sleep+
226
+ # — must not kill the worker thread. We log loudly and let the
227
+ # +until+ loop carry on.
228
+ @logger.error "SSE watchdog late-raise contained: #{e.inspect}; resuming loop"
229
+ end
183
230
  end
184
231
  end
232
+ ensure
233
+ register_active(nil)
185
234
  end
186
235
 
187
- def connect(&load_configs)
188
- url = "#{source}/api/v2/sse/config"
236
+ # Opens one SSE request and yields each parsed event until the stream
237
+ # ends (clean FIN, error, or stop). Raises on transport errors so the
238
+ # caller can apply backoff. Clean FIN returns without raising.
239
+ #
240
+ # A watchdog thread interrupts this thread (Thread#raise) if no bytes arrive within
241
+ # +sse_read_timeout+. Net::HTTP#read_timeout is NOT reliable for the
242
+ # streaming +read_body do |chunk|+ form — the underlying BufferedIO
243
+ # reads bypass it in practice (a silent server stall blocks indefinitely
244
+ # against a configured deadline). sdk-go and sdk-node hit the same
245
+ # gotcha and solve it the same way: per-chunk reset, async close on
246
+ # expiry (chaos scenario 02 — sse_silent_stall).
247
+ def stream_once(&block)
248
+ url = "#{current_url}/api/v2/sse/config"
189
249
  cursor = current_cursor
190
250
  @logger.debug "SSE Streaming Connect to #{url} start_at #{cursor.inspect}"
191
251
 
192
- # Wrap the ld-eventsource logger so internal reconnects (clean FIN,
193
- # read-timeout, transient errors) bump restart_total — they never reach
194
- # on_error (qfg-ie49).
195
- sse_logger = ReconnectCountingLogger.new(
196
- Quonfig::InternalLogger.new(SSE::Client)
197
- ) { count_restart! }
198
-
199
- SSE::Client.new(url,
200
- headers: headers,
201
- read_timeout: @options.sse_read_timeout,
202
- reconnect_time: @options.sse_default_reconnect_time,
203
- reconnect_reset_interval: @options.sse_reconnect_reset_interval,
204
- last_event_id: cursor,
205
- logger: sse_logger) do |client|
206
- client.on_event do |event|
207
- if event.data.nil? || event.data.empty?
208
- @logger.error "SSE Streaming Error: Received empty data for url #{url}"
209
- client.close
210
- next
252
+ uri = URI(url)
253
+ http = Net::HTTP.new(uri.host, uri.port)
254
+ http.use_ssl = (uri.scheme == 'https')
255
+ http.open_timeout = @options.sse_connect_timeout
256
+ # Keep Net::HTTP's read_timeout as a backstop for the header read
257
+ # (where it does apply reliably). The watchdog covers the body path.
258
+ http.read_timeout = @options.sse_read_timeout
259
+
260
+ req = Net::HTTP::Get.new(uri.request_uri, headers)
261
+
262
+ http.start
263
+ register_active(http)
264
+
265
+ watchdog = ReadDeadlineWatchdog.new(
266
+ worker: Thread.current, deadline_s: @options.sse_read_timeout,
267
+ stopped: @stopped, logger: @logger
268
+ )
269
+ watchdog.start
270
+
271
+ begin
272
+ http.request(req) do |resp|
273
+ code = resp.code.to_i
274
+ if TERMINAL_HTTP_CODES.include?(code)
275
+ # qfg-i5xv: 401/403/404 will not heal by retrying — bad key,
276
+ # revoked permission, or wrong endpoint. Mark stopped *before*
277
+ # invoking on_error so the loop's terminal-error branch is
278
+ # already locked in if the parent callback inspects state, and
279
+ # so the inner rescue's `handle_error(e) unless @stopped.value`
280
+ # guard suppresses a second on_error edge.
281
+ err = SSEHTTPTerminalError.new(code)
282
+ @logger.error "SSE Streaming Terminal Error: HTTP #{code} for url #{url}; will not retry"
283
+ @stopped.make_true
284
+ invoke_on_error(err)
285
+ raise err
211
286
  end
212
-
213
- begin
214
- parsed = JSON.parse(event.data)
215
- rescue JSON::ParserError => e
216
- @logger.error "SSE Streaming Error: Failed to parse JSON for url #{url}: #{e.message}"
217
- client.close
218
- next
287
+ if code != 200
288
+ err = SSEHTTPStatusError.new(code)
289
+ @logger.error "SSE Streaming Error: HTTP #{code} for url #{url}"
290
+ invoke_on_error(err)
291
+ raise err
219
292
  end
220
293
 
221
- envelope = Quonfig::ConfigEnvelope.new(
222
- configs: parsed['configs'] || [],
223
- meta: parsed['meta'] || {}
224
- )
225
- load_configs.call(envelope, event, :sse)
294
+ parser = EventParser.new
295
+ # qfg-tj18: run_loop wraps the body in +:on_blocking+ which
296
+ # *would* still deliver during read_body (read_body is a
297
+ # blocking IO call), but be explicit: we want the watchdog raise
298
+ # to land here without ambiguity.
299
+ Thread.handle_interrupt(SSEReadDeadlineExceeded => :immediate) do
300
+ resp.read_body do |chunk|
301
+ watchdog.reset!
302
+ break if @stopped.value
303
+
304
+ parser.feed(chunk, &block)
305
+ end
306
+ end
307
+ # read_body returned without raising — a server-initiated FIN, or
308
+ # the block broke out because @stopped was set. Unless stopped,
309
+ # the outer loop will reconnect and bump restart_total on the
310
+ # next iteration.
311
+ @logger.debug "SSE stream ended for url #{url}"
312
+ end
313
+ ensure
314
+ watchdog.stop
315
+ register_active(nil)
316
+ begin
317
+ http.finish if http.started?
318
+ rescue StandardError
319
+ # already closed
226
320
  end
321
+ end
322
+ end
227
323
 
228
- client.on_error do |error|
229
- # SSL "unexpected eof" is expected when SSE sessions timeout normally
230
- if error.is_a?(OpenSSL::SSL::SSLError) && error.message.include?('unexpected eof')
231
- @logger.debug "SSE Streaming: Connection closed (expected timeout) for url #{url}"
232
- else
233
- @logger.error "SSE Streaming Error: #{error.inspect} for url #{url}"
234
- end
324
+ # Track the active connection so close() can interrupt a blocked
325
+ # read_body from another thread. Guarded by @conn_mutex.
326
+ def register_active(http)
327
+ @conn_mutex.synchronize { @active_http = http }
328
+ end
235
329
 
236
- # qfg-ie49: restart_total is NOT bumped here. ld-eventsource
237
- # auto-reconnects after most non-closing errors, and that reconnect
238
- # is already counted via ReconnectCountingLogger; bumping here too
239
- # would double-count. For closing errors (HTTP::ConnectionError) the
240
- # reconnect is counted in @retry_thread instead. on_error's job is
241
- # purely to notify the parent client of the disconnect edge.
242
-
243
- # Notify the parent client BEFORE deciding whether to close — every
244
- # error edge is a disconnect signal as far as @sse_state goes, even
245
- # if we let the underlying SSE library handle reconnect itself.
246
- # qfg-47c2.27
247
- if @on_error
248
- begin
249
- @on_error.call(error)
250
- rescue StandardError => e
251
- @logger.error "SSE on_error callback raised: #{e.inspect}"
252
- end
253
- end
330
+ def increment_restart!
331
+ @restart_mutex.synchronize { @restart_total += 1 }
332
+ end
254
333
 
255
- if @options.errors_to_close_connection.any? { |klass| error.is_a?(klass) }
256
- @logger.debug "Closing SSE connection for url #{url}"
257
- client.close
258
- end
259
- end
334
+ def handle_error(error)
335
+ @logger.error "SSE Streaming Error: #{error.inspect}"
336
+ invoke_on_error(error)
337
+ end
338
+
339
+ # qfg-m3lk: rescue StandardError (NOT Exception) so SystemExit /
340
+ # Interrupt / SignalException still escape — Ctrl-C inside a customer
341
+ # callback must still kill the process. StandardError is the right
342
+ # boundary for "the caller's listener has a bug".
343
+ def invoke_on_envelope_safely(on_envelope, event)
344
+ on_envelope.call(event.envelope, event, :sse)
345
+ rescue StandardError => e
346
+ @on_envelope_error_mutex.synchronize { @on_envelope_error_total += 1 }
347
+ bt = (e.backtrace || []).first(5).join("\n ")
348
+ @logger.error "SSE on_envelope callback raised: #{e.class}: #{e.message}\n #{bt}"
349
+ end
350
+
351
+ def invoke_on_error(error)
352
+ return unless @on_error
353
+
354
+ begin
355
+ @on_error.call(error)
356
+ rescue StandardError => e
357
+ @logger.error "SSE on_error callback raised: #{e.inspect}"
260
358
  end
261
359
  end
262
360
 
263
- def headers
264
- auth = "1:#{@prefab_options.sdk_key}"
265
- auth_string = Base64.strict_encode64(auth)
266
- {
267
- 'Authorization' => "Basic #{auth_string}",
268
- 'Accept' => 'text/event-stream',
269
- 'X-Quonfig-SDK-Version' => "ruby-#{Quonfig::VERSION}"
270
- }
361
+ # Jitter: picks a random value in [delay/2, delay) — caps thundering-herd amplitude after a partition heal.
362
+ # Identical shape to ld-eventsource's Backoff#next_interval (and
363
+ # sdk-go's runLoop jitter) so we don't surprise operators familiar with
364
+ # those.
365
+ def jittered(delay)
366
+ (delay / 2) + rand(delay / 2.0)
271
367
  end
272
368
 
273
- def source
274
- @source_index = @source_index.nil? ? 0 : @source_index + 1
369
+ # Sleep with interrupt: chunks the sleep so close() during a long
370
+ # backoff doesn't block shutdown for tens of seconds.
371
+ def interruptible_sleep(seconds)
372
+ deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + seconds
373
+ until @stopped.value
374
+ remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
375
+ break if remaining <= 0
275
376
 
276
- @source_index = 0 if @source_index >= @prefab_options.sse_api_urls.size
377
+ sleep([remaining, 0.1].min)
378
+ end
379
+ end
277
380
 
278
- @prefab_options.sse_api_urls[@source_index]
381
+ # Rotate through configured SSE URLs. The same rotation rule the
382
+ # previous implementation used, preserved so multi-region failover
383
+ # behavior is unchanged.
384
+ def current_url
385
+ urls = @prefab_options.sse_api_urls
386
+ @source_index = (@source_index + 1) % urls.size
387
+ urls[@source_index]
279
388
  end
280
389
 
281
- # Compute a Last-Event-ID to resume the stream from. Three sources, in
282
- # priority order:
283
- # 1. config_loader.version -- string ETag from last HTTP fetch (new path)
284
- # 2. config_loader.highwater_mark -- legacy numeric cursor
285
- # 3. nil -- no prior state; stream from HEAD
286
- def current_cursor
287
- if @config_loader.respond_to?(:version)
288
- v = @config_loader.version
289
- return v if v.is_a?(String) && !v.empty?
390
+ # Internal: HTTP-status sentinel error for non-200 SSE responses. Surfaces
391
+ # the status code through #message so parent on_error callbacks can log
392
+ # meaningfully without depending on ld-eventsource's error hierarchy.
393
+ class SSEHTTPStatusError < StandardError
394
+ attr_reader :status_code
395
+
396
+ def initialize(status_code)
397
+ @status_code = status_code
398
+ super("HTTP #{status_code}")
290
399
  end
400
+ end
291
401
 
292
- if @config_loader.respond_to?(:highwater_mark)
293
- hw = @config_loader.highwater_mark
294
- return hw.to_s if hw.is_a?(Numeric) && hw.positive?
295
- return hw if hw.is_a?(String) && !hw.empty?
402
+ # qfg-i5xv: terminal HTTP failures the SDK will not retry. 401 = bad key,
403
+ # 403 = revoked workspace permission, 404 = wrong endpoint / missing
404
+ # workspace. A subclass of SSEHTTPStatusError so existing on_error
405
+ # callbacks that only check `is_a?(SSEHTTPStatusError)` keep working,
406
+ # while customers that want to distinguish (alerting, OpenFeature
407
+ # provider error events) can dispatch on the subclass.
408
+ class SSEHTTPTerminalError < SSEHTTPStatusError; end
409
+
410
+ # Raised by the watchdog into the worker thread when the per-chunk
411
+ # read deadline elapses. Caught by run_loop's rescue, indistinguishable
412
+ # from any other transport error for backoff/restart purposes.
413
+ class SSEReadDeadlineExceeded < StandardError; end
414
+
415
+ # Background watchdog that interrupts the worker thread if no chunk
416
+ # arrives within +deadline_s+ seconds. Uses Thread#raise — the only
417
+ # reliable cross-platform way to unblock a Ruby thread blocked in
418
+ # +Net::HTTP+'s body-read on macOS. (Closing or shutting down the
419
+ # underlying socket from another thread does NOT wake the reader on
420
+ # macOS; the kernel discards future reads but the in-flight syscall
421
+ # stays blocked until something else trips. sdk-go and sdk-node solve
422
+ # the equivalent problem with context cancellation / AbortController,
423
+ # which Ruby lacks at the IO layer.) Thread#raise is essentially what
424
+ # +Timeout.timeout+ does internally; using it directly avoids
425
+ # Timeout.timeout's sketchy reputation around ensure blocks.
426
+ class ReadDeadlineWatchdog
427
+ POLL_INTERVAL = 0.25
428
+
429
+ def initialize(worker:, deadline_s:, stopped:, logger:)
430
+ @worker = worker
431
+ @deadline_s = deadline_s
432
+ @stopped = stopped
433
+ @logger = logger
434
+ @active = true
435
+ # Mutex covers @active AND the decision to fire Thread#raise. stop()
436
+ # holds the mutex when flipping @active false, so a +stop+ that
437
+ # arrives mid-deadline-check cannot lose the race against the
438
+ # watchdog's @worker.raise call (which would inject a spurious
439
+ # SSEReadDeadlineExceeded into the worker thread right after a
440
+ # clean read_body return).
441
+ @mutex = Mutex.new
442
+ @last_read_at = Concurrent::AtomicReference.new(Process.clock_gettime(Process::CLOCK_MONOTONIC))
296
443
  end
297
444
 
298
- nil
445
+ def start
446
+ @thread = Thread.new { watch }
447
+ end
448
+
449
+ def reset!
450
+ @last_read_at.set(Process.clock_gettime(Process::CLOCK_MONOTONIC))
451
+ end
452
+
453
+ def stop
454
+ @mutex.synchronize { @active = false }
455
+ @thread&.join(1)
456
+ @thread = nil
457
+ end
458
+
459
+ private
460
+
461
+ def watch
462
+ loop do
463
+ sleep POLL_INTERVAL
464
+ break unless @mutex.synchronize { @active } && !@stopped.value
465
+
466
+ idle = Process.clock_gettime(Process::CLOCK_MONOTONIC) - @last_read_at.value
467
+ next if idle < @deadline_s
468
+
469
+ fired = @mutex.synchronize do
470
+ next false unless @active && !@stopped.value
471
+
472
+ @logger.debug "SSE read deadline exceeded (#{idle.round(1)}s idle >= #{@deadline_s}s); interrupting worker"
473
+ @worker.raise(SSEReadDeadlineExceeded.new("SSE read deadline #{@deadline_s}s exceeded"))
474
+ true
475
+ end
476
+ break if fired
477
+ end
478
+ rescue StandardError => e
479
+ # Watchdog must never crash the SDK. Worst case we silently fall
480
+ # back to Net::HTTP's own (unreliable) read_timeout.
481
+ @logger.debug "SSE watchdog error: #{e.inspect}"
482
+ end
483
+ end
484
+
485
+ # Streaming SSE parser. Accepts byte chunks (any encoding), yields one
486
+ # Quonfig::StreamEvent per complete event. Tolerates:
487
+ # - chunks that split a UTF-8 multi-byte character (buffer in 8-bit,
488
+ # transcode whole lines)
489
+ # - chunks that split a line mid-way
490
+ # - any of CR / LF / CRLF as line terminators
491
+ # - +data:+, +data: + (optional space per SSE spec)
492
+ # - +:comment+ lines (keepalives — ignored)
493
+ # - multi-line +data:+ (concatenated with +\n+, per spec)
494
+ # Ignores +event:+ and +retry:+ — api-delivery does not emit them and the
495
+ # Quonfig wire contract does not honor reconnect-time directives.
496
+ # Malformed +data:+ JSON is logged and skipped; one bad event does not
497
+ # tear down the stream.
498
+ class EventParser
499
+ def initialize(logger: nil)
500
+ @logger = logger
501
+ @reader = LineReader.new
502
+ @data = +''
503
+ @have_data = false
504
+ @id = nil
505
+ end
506
+
507
+ def feed(chunk)
508
+ @reader.feed(chunk) do |line|
509
+ if line.empty?
510
+ event = flush
511
+ yield event if event
512
+ elsif line.start_with?(':')
513
+ # comment / keepalive — ignore
514
+ else
515
+ process_field(line)
516
+ end
517
+ end
518
+ end
519
+
520
+ private
521
+
522
+ def process_field(line)
523
+ idx = line.index(':')
524
+ return unless idx
525
+
526
+ name = line[0...idx]
527
+ rest = line[(idx + 1)..]
528
+ rest = rest[1..] if rest.start_with?(' ')
529
+
530
+ case name
531
+ when 'data'
532
+ if @have_data
533
+ @data << "\n" << rest
534
+ else
535
+ @data = rest
536
+ @have_data = true
537
+ end
538
+ when 'id'
539
+ @id = rest unless rest.include?("\x00")
540
+ # event: / retry: are intentionally ignored
541
+ end
542
+ end
543
+
544
+ def flush
545
+ return nil unless @have_data
546
+
547
+ data = @data
548
+ id = @id
549
+ @data = +''
550
+ @have_data = false
551
+ # NB: @id persists across events — the SSE spec says last-event-id
552
+ # is sticky until overwritten. Matches ld-eventsource.
553
+
554
+ begin
555
+ parsed = JSON.parse(data)
556
+ rescue JSON::ParserError => e
557
+ (@logger || LOG).error "SSE Streaming Error: malformed JSON: #{e.message}"
558
+ return nil
559
+ end
560
+
561
+ envelope = Quonfig::ConfigEnvelope.new(
562
+ configs: parsed['configs'] || [],
563
+ meta: parsed['meta'] || {}
564
+ )
565
+ StreamEvent.new(envelope, id, data)
566
+ end
567
+ end
568
+
569
+ # Byte-level line reader. Accepts arbitrary chunks, yields one UTF-8
570
+ # line per call to the block. Terminator-stripped (CR / LF / CRLF
571
+ # supported). Modeled on ld-eventsource's BufferedLineReader — same
572
+ # invariants: split bytes-not-chars while scanning, force-encode to
573
+ # UTF-8 only once a complete line is sliced out, so a multi-byte
574
+ # character spanning two chunks does not raise Encoding::CompatibilityError.
575
+ class LineReader
576
+ def initialize
577
+ @buffer = +''.b
578
+ @last_was_cr = false
579
+ end
580
+
581
+ def feed(chunk)
582
+ @buffer << chunk.b
583
+ loop do
584
+ idx = @buffer.index(/[\r\n]/)
585
+ break if idx.nil?
586
+
587
+ ch = @buffer[idx]
588
+ if idx.zero? && ch == "\n" && @last_was_cr
589
+ # Dangling LF of a CRLF pair split across chunks — consume and skip.
590
+ @last_was_cr = false
591
+ @buffer.slice!(0, 1)
592
+ next
593
+ end
594
+
595
+ line = @buffer[0, idx].force_encoding('UTF-8')
596
+ consume = idx + 1
597
+ @last_was_cr = false
598
+ if ch == "\r"
599
+ if consume == @buffer.bytesize
600
+ # CR at end of buffer — could be CRLF split across feeds.
601
+ @last_was_cr = true
602
+ elsif @buffer[consume] == "\n"
603
+ consume += 1
604
+ end
605
+ end
606
+ @buffer.slice!(0, consume)
607
+ yield line
608
+ end
609
+ end
299
610
  end
300
611
  end
301
612
  end