quonfig 0.0.15 → 0.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +143 -12
- data/lib/quonfig/client.rb +230 -22
- data/lib/quonfig/datadir_watcher.rb +113 -0
- data/lib/quonfig/options.rb +25 -2
- data/lib/quonfig/sse_config_client.rb +536 -225
- data/lib/quonfig/version.rb +1 -1
- data/lib/quonfig.rb +3 -1
- data/quonfig.gemspec +4 -1
- metadata +8 -7
|
@@ -2,300 +2,611 @@
|
|
|
2
2
|
|
|
3
3
|
require 'base64'
|
|
4
4
|
require 'json'
|
|
5
|
+
require 'net/http'
|
|
6
|
+
require 'uri'
|
|
5
7
|
|
|
6
8
|
module Quonfig
|
|
9
|
+
# Event delivered to on_envelope. +id+ mirrors the SSE +id:+ field and is
|
|
10
|
+
# consumed by callers that want the server cursor (tests + last-event-id
|
|
11
|
+
# resume). +data+ is the raw +data:+ payload string. +envelope+ is the
|
|
12
|
+
# parsed Quonfig::ConfigEnvelope.
|
|
13
|
+
StreamEvent = Struct.new(:envelope, :id, :data)
|
|
14
|
+
|
|
15
|
+
# SSE client for real-time config delivery from api-delivery-sse.
|
|
16
|
+
#
|
|
17
|
+
# Owns its reconnect loop end-to-end. sdk-go, sdk-python, and sdk-node all
|
|
18
|
+
# reached the same conclusion: the wire format we consume (plain JSON
|
|
19
|
+
# envelopes in single-line +data:+ frames, no named events, no retry
|
|
20
|
+
# directives) is simple enough that an SDK-owned loop is clearer than a
|
|
21
|
+
# library wrapper, and the operator-facing reconnect counter becomes
|
|
22
|
+
# trivially correct because there is exactly one place that increments it
|
|
23
|
+
# (qfg-35sm; replaces the ld-eventsource integration from qfg-ie49 +
|
|
24
|
+
# qfg-cf52, which required log-line scraping and a raise-proof logger
|
|
25
|
+
# wrapper to observe reconnects through the upstream library).
|
|
7
26
|
class SSEConfigClient
|
|
8
|
-
# ld-eventsource auto-reconnects on a clean socket EOF (server FIN)
|
|
9
|
-
# *internally* — it never calls +on_error+ for that case, only for
|
|
10
|
-
# ECONNREFUSED-style failures (qfg-ie49; see chaos scenario 09). The one
|
|
11
|
-
# signal it emits for any reconnect is an info-level
|
|
12
|
-
# "Will retry connection after ..." line, logged once per reconnect attempt
|
|
13
|
-
# and never on the first connect. Wrapping the logger we hand to
|
|
14
|
-
# SSE::Client lets the SDK observe those internal reconnects without
|
|
15
|
-
# touching the data path. This is the only reconnect hook ld-eventsource
|
|
16
|
-
# >= 2.0 exposes.
|
|
17
|
-
class ReconnectCountingLogger
|
|
18
|
-
RECONNECT_SIGNAL = 'Will retry connection after'
|
|
19
|
-
|
|
20
|
-
LEVELS = %i[trace debug info warn error fatal].freeze
|
|
21
|
-
|
|
22
|
-
def initialize(wrapped, &on_reconnect)
|
|
23
|
-
@wrapped = wrapped
|
|
24
|
-
@on_reconnect = on_reconnect
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
# Crash-safe by construction: ld-eventsource calls this logger from
|
|
28
|
-
# inside its bare-Thread +run_stream+ loop, and several of those call
|
|
29
|
-
# sites (+connect+, +log_and_dispatch_error+, query-param building) are
|
|
30
|
-
# NOT wrapped in a rescue. Any exception that escapes a logger call kills
|
|
31
|
-
# the worker thread with +@stopped+ still false, so +closed?+ never flips
|
|
32
|
-
# true and the SDK's @retry_thread never reconnects — the SSE stream is
|
|
33
|
-
# silently wedged forever (qfg-cf52, the chaos scenario 05 flake). Every
|
|
34
|
-
# step here is therefore independently guarded: a throwing message block,
|
|
35
|
-
# a throwing on_reconnect callback, or a throwing wrapped logger can
|
|
36
|
-
# never propagate out of this method.
|
|
37
|
-
LEVELS.each do |level|
|
|
38
|
-
define_method(level) do |message = nil, &block|
|
|
39
|
-
begin
|
|
40
|
-
message = block.call if message.nil? && block
|
|
41
|
-
rescue StandardError
|
|
42
|
-
message = nil
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
if level == :info && message.to_s.include?(RECONNECT_SIGNAL)
|
|
46
|
-
begin
|
|
47
|
-
@on_reconnect.call
|
|
48
|
-
rescue StandardError
|
|
49
|
-
nil
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
begin
|
|
54
|
-
@wrapped.public_send(level, message) if @wrapped.respond_to?(level)
|
|
55
|
-
rescue StandardError
|
|
56
|
-
nil
|
|
57
|
-
end
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
def level
|
|
62
|
-
@wrapped&.level
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
def level=(new_level)
|
|
66
|
-
@wrapped.level = new_level if @wrapped.respond_to?(:level=)
|
|
67
|
-
end
|
|
68
|
-
end
|
|
69
|
-
|
|
70
27
|
class Options
|
|
71
|
-
attr_reader :sse_read_timeout, :
|
|
72
|
-
:
|
|
73
|
-
:errors_to_close_connection, :sse_reconnect_reset_interval
|
|
28
|
+
attr_reader :sse_read_timeout, :sse_connect_timeout,
|
|
29
|
+
:sse_initial_reconnect_delay, :sse_max_reconnect_delay
|
|
74
30
|
|
|
75
31
|
# sse_read_timeout: 90s = 3x the 30s server heartbeat. A silent socket
|
|
76
|
-
# stall trips
|
|
77
|
-
#
|
|
78
|
-
# `project/plans/sdk-hardening-and-verification.md` Layer 1.
|
|
32
|
+
# stall trips within one missed-heartbeat window rather than the OS
|
|
33
|
+
# TCP idle (often hours).
|
|
79
34
|
#
|
|
80
|
-
#
|
|
81
|
-
#
|
|
82
|
-
#
|
|
83
|
-
#
|
|
84
|
-
#
|
|
85
|
-
#
|
|
86
|
-
# it. Resetting after 1s of healthy connection mirrors sdk-python, which
|
|
87
|
-
# resets its backoff on every successful connect (sdk-python/quonfig/
|
|
88
|
-
# sse.py). A *sustained* outage still backs off exponentially: no
|
|
89
|
-
# connection succeeds, so `mark_success` is never called and the reset
|
|
90
|
-
# never triggers (qfg-ie49).
|
|
35
|
+
# sse_initial_reconnect_delay / sse_max_reconnect_delay: backoff bounds.
|
|
36
|
+
# Each failed reconnect doubles the delay (with +/-50% jitter) up to the
|
|
37
|
+
# max. A successful event delivery resets the delay to the initial
|
|
38
|
+
# value — matches sdk-python's policy. A clean server-initiated FIN is
|
|
39
|
+
# treated as "not a failure for backoff purposes" because LBs recycling
|
|
40
|
+
# connections is normal; the reconnect counter still increments.
|
|
91
41
|
def initialize(sse_read_timeout: 90,
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
sse_reconnect_reset_interval: 1,
|
|
96
|
-
errors_to_close_connection: [HTTP::ConnectionError])
|
|
42
|
+
sse_connect_timeout: 10,
|
|
43
|
+
sse_initial_reconnect_delay: 1.0,
|
|
44
|
+
sse_max_reconnect_delay: 30.0)
|
|
97
45
|
@sse_read_timeout = sse_read_timeout
|
|
98
|
-
@
|
|
99
|
-
@
|
|
100
|
-
@
|
|
101
|
-
@sleep_delay_for_new_connection_check = sleep_delay_for_new_connection_check
|
|
102
|
-
@errors_to_close_connection = errors_to_close_connection
|
|
46
|
+
@sse_connect_timeout = sse_connect_timeout
|
|
47
|
+
@sse_initial_reconnect_delay = sse_initial_reconnect_delay.to_f
|
|
48
|
+
@sse_max_reconnect_delay = sse_max_reconnect_delay.to_f
|
|
103
49
|
end
|
|
104
50
|
end
|
|
105
51
|
|
|
106
52
|
LOG = Quonfig::InternalLogger.new(self)
|
|
107
53
|
|
|
54
|
+
# qfg-i5xv: HTTP status codes the SDK classifies as terminal — these will
|
|
55
|
+
# not heal by retrying (bad key, revoked permission, missing endpoint).
|
|
56
|
+
# Anything else (5xx, 429, network errors) stays on the transient path.
|
|
57
|
+
TERMINAL_HTTP_CODES = [401, 403, 404].freeze
|
|
58
|
+
|
|
108
59
|
# +on_error+: optional callable invoked on every SSE error edge. Parent
|
|
109
60
|
# Quonfig::Client wires this to drive @sse_state -> :error so that
|
|
110
|
-
# +connection_state+ reflects the disconnect (qfg-47c2.27).
|
|
111
|
-
# the SDK's public health primitive would lie about its own state during
|
|
112
|
-
# a mid-run socket drop.
|
|
61
|
+
# +connection_state+ reflects the disconnect (qfg-47c2.27).
|
|
113
62
|
def initialize(prefab_options, config_loader, options = nil, logger = nil, on_error: nil)
|
|
114
63
|
@prefab_options = prefab_options
|
|
115
64
|
@options = options || Options.new
|
|
116
65
|
@config_loader = config_loader
|
|
117
|
-
@connected = false
|
|
118
66
|
@logger = logger || LOG
|
|
119
67
|
@on_error = on_error
|
|
68
|
+
|
|
69
|
+
@stopped = Concurrent::AtomicBoolean.new(false)
|
|
120
70
|
@restart_total = 0
|
|
121
71
|
@restart_mutex = Mutex.new
|
|
72
|
+
|
|
73
|
+
@on_envelope_error_total = 0
|
|
74
|
+
@on_envelope_error_mutex = Mutex.new
|
|
75
|
+
|
|
76
|
+
@conn_mutex = Mutex.new
|
|
77
|
+
@active_http = nil
|
|
78
|
+
|
|
79
|
+
@source_index = -1
|
|
80
|
+
@last_event_id = nil
|
|
122
81
|
end
|
|
123
82
|
|
|
124
|
-
#
|
|
125
|
-
#
|
|
126
|
-
#
|
|
127
|
-
#
|
|
128
|
-
# ReconnectCountingLogger "Will retry connection after" signal.
|
|
129
|
-
# 2. SDK-driven reconnects in @retry_thread, after a closing error
|
|
130
|
-
# (HTTP::ConnectionError) made us close the SSE::Client outright.
|
|
131
|
-
# These two are mutually exclusive per disconnect, so there is no
|
|
132
|
-
# double-count. on_error is deliberately NOT a source — ld-eventsource
|
|
133
|
-
# reconnects internally after most non-closing errors, so counting the
|
|
134
|
-
# error edge AND the reconnect would double up (qfg-ie49).
|
|
135
|
-
#
|
|
136
|
-
# The chaos harness pulls this via Client#worker_restart_total(layer: '1')
|
|
137
|
-
# so kill-storm scenarios (e.g. scenario 09 — proxy killed 5x in 30s) can
|
|
138
|
-
# assert restart_total >= 5 even when the kills produce clean FINs that
|
|
139
|
-
# never reach on_error.
|
|
83
|
+
# Layer 1 (SSE) reconnect counter. Bumped exactly once per reconnect
|
|
84
|
+
# attempt — never per error edge, never per envelope. Read by
|
|
85
|
+
# Quonfig::Client#worker_restart_total(layer: '1') and asserted by chaos
|
|
86
|
+
# scenario 09 (>= 5 after 5 proxy flaps in 30s).
|
|
140
87
|
def restart_total
|
|
141
88
|
@restart_mutex.synchronize { @restart_total }
|
|
142
89
|
end
|
|
143
90
|
|
|
144
|
-
#
|
|
145
|
-
#
|
|
146
|
-
|
|
147
|
-
|
|
91
|
+
# qfg-m3lk: count of user-supplied on_envelope callback invocations that
|
|
92
|
+
# raised. Surfaced for operator visibility — a non-zero value here with
|
|
93
|
+
# restart_total stable means a caller-side listener bug, not a transport
|
|
94
|
+
# problem. (Pre-fix, those raises propagated into run_loop's rescue and
|
|
95
|
+
# masqueraded as transport errors, causing reconnect storms.)
|
|
96
|
+
def on_envelope_error_total
|
|
97
|
+
@on_envelope_error_mutex.synchronize { @on_envelope_error_total }
|
|
148
98
|
end
|
|
149
99
|
|
|
150
|
-
def
|
|
151
|
-
@
|
|
152
|
-
|
|
100
|
+
def start(&on_envelope)
|
|
101
|
+
return if @prefab_options.sse_api_urls.nil? || @prefab_options.sse_api_urls.empty?
|
|
102
|
+
|
|
103
|
+
@worker = Thread.new { run_loop(&on_envelope) }
|
|
153
104
|
end
|
|
154
105
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
106
|
+
# Shut down. Interrupts the in-flight stream by closing the underlying
|
|
107
|
+
# socket from this thread — the worker thread observes the resulting
|
|
108
|
+
# IOError, sees @stopped == true, and exits cleanly.
|
|
109
|
+
def close
|
|
110
|
+
@stopped.make_true
|
|
111
|
+
@conn_mutex.synchronize do
|
|
112
|
+
begin
|
|
113
|
+
@active_http&.finish
|
|
114
|
+
rescue StandardError
|
|
115
|
+
# already closed / never started — idempotent
|
|
116
|
+
end
|
|
117
|
+
@active_http = nil
|
|
159
118
|
end
|
|
119
|
+
@worker&.join(2)
|
|
120
|
+
@worker = nil
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Public so tests can assert the headers shape. Body of the request is
|
|
124
|
+
# always empty; this is the full set api-delivery-sse sees.
|
|
125
|
+
def headers
|
|
126
|
+
auth = "1:#{@prefab_options.sdk_key}"
|
|
127
|
+
auth_string = Base64.strict_encode64(auth)
|
|
128
|
+
h = {
|
|
129
|
+
'Authorization' => "Basic #{auth_string}",
|
|
130
|
+
'Accept' => 'text/event-stream',
|
|
131
|
+
'Cache-Control' => 'no-cache',
|
|
132
|
+
'X-Quonfig-SDK-Version' => "ruby-#{Quonfig::VERSION}"
|
|
133
|
+
}
|
|
134
|
+
cursor = current_cursor
|
|
135
|
+
h['Last-Event-Id'] = cursor if cursor
|
|
136
|
+
h
|
|
137
|
+
end
|
|
160
138
|
|
|
161
|
-
|
|
139
|
+
# Compute a Last-Event-ID for the next request. Three sources, in
|
|
140
|
+
# priority order:
|
|
141
|
+
# 1. @last_event_id -- set by the most recent event we processed
|
|
142
|
+
# 2. config_loader.version -- string ETag from last HTTP fetch
|
|
143
|
+
# 3. config_loader.highwater_mark -- legacy numeric cursor
|
|
144
|
+
# Returns nil if no prior state exists.
|
|
145
|
+
def current_cursor
|
|
146
|
+
return @last_event_id if @last_event_id && !@last_event_id.empty?
|
|
162
147
|
|
|
163
|
-
|
|
148
|
+
if @config_loader.respond_to?(:version)
|
|
149
|
+
v = @config_loader.version
|
|
150
|
+
return v if v.is_a?(String) && !v.empty?
|
|
151
|
+
end
|
|
164
152
|
|
|
165
|
-
@
|
|
166
|
-
|
|
167
|
-
|
|
153
|
+
if @config_loader.respond_to?(:highwater_mark)
|
|
154
|
+
hw = @config_loader.highwater_mark
|
|
155
|
+
return hw.to_s if hw.is_a?(Numeric) && hw.positive?
|
|
156
|
+
return hw if hw.is_a?(String) && !hw.empty?
|
|
157
|
+
end
|
|
168
158
|
|
|
169
|
-
|
|
159
|
+
nil
|
|
160
|
+
end
|
|
170
161
|
|
|
171
|
-
|
|
162
|
+
private
|
|
172
163
|
|
|
173
|
-
|
|
164
|
+
# Long-lived reconnect loop. One iteration = one connect attempt. Bumps
|
|
165
|
+
# restart_total *before* every retry — so the counter answers "how many
|
|
166
|
+
# times have we reconnected after a drop" rather than "how many connect
|
|
167
|
+
# attempts have occurred." The first attempt is not a restart.
|
|
168
|
+
#
|
|
169
|
+
# qfg-tj18: the body is wrapped in
|
|
170
|
+
# +Thread.handle_interrupt(SSEReadDeadlineExceeded => :on_blocking)+ so a
|
|
171
|
+
# watchdog raise that's already been queued (the watchdog's mutex covers
|
|
172
|
+
# the *decision* to fire but cannot un-queue a delivered raise) lands
|
|
173
|
+
# only at a blocking-IO checkpoint. Inside stream_once we explicitly
|
|
174
|
+
# re-enable +:immediate+ around the +read_body+ block where we *do*
|
|
175
|
+
# want the raise to wake the read. A per-iteration paranoid rescue
|
|
176
|
+
# catches any late-landing raise that escapes the inner +rescue
|
|
177
|
+
# StandardError+ (e.g. lands inside +interruptible_sleep+ between
|
|
178
|
+
# iterations) so the worker thread never silently dies.
|
|
179
|
+
def run_loop(&on_envelope)
|
|
180
|
+
Thread.handle_interrupt(SSEReadDeadlineExceeded => :on_blocking) do
|
|
181
|
+
delay = @options.sse_initial_reconnect_delay
|
|
182
|
+
first_attempt = true
|
|
183
|
+
|
|
184
|
+
until @stopped.value
|
|
185
|
+
begin
|
|
186
|
+
unless first_attempt
|
|
187
|
+
increment_restart!
|
|
188
|
+
interruptible_sleep(jittered(delay))
|
|
189
|
+
break if @stopped.value
|
|
190
|
+
end
|
|
191
|
+
first_attempt = false
|
|
174
192
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
193
|
+
connected_at_least_once = false
|
|
194
|
+
begin
|
|
195
|
+
stream_once do |event|
|
|
196
|
+
connected_at_least_once = true
|
|
197
|
+
# Persist the most recent id so the next reconnect resumes
|
|
198
|
+
# from there via Last-Event-Id. Updated *before* the user
|
|
199
|
+
# callback runs so a raising listener still advances the
|
|
200
|
+
# cursor — the event was delivered to us, the bug is on the
|
|
201
|
+
# caller side.
|
|
202
|
+
@last_event_id = event.id if event.id
|
|
203
|
+
# qfg-m3lk: callback exceptions are isolated. A buggy
|
|
204
|
+
# listener must not look like a transport error and trigger
|
|
205
|
+
# a reconnect.
|
|
206
|
+
invoke_on_envelope_safely(on_envelope, event)
|
|
207
|
+
# A connection healthy enough to deliver a real envelope
|
|
208
|
+
# earns a reset of the backoff. Sustained outages never
|
|
209
|
+
# reach this branch (no event ever delivered) so the
|
|
210
|
+
# exponential growth still holds.
|
|
211
|
+
delay = @options.sse_initial_reconnect_delay
|
|
212
|
+
end
|
|
213
|
+
rescue StandardError => e
|
|
214
|
+
handle_error(e) unless @stopped.value
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
# Backoff only grows on failed connect attempts. A server-
|
|
218
|
+
# initiated clean FIN after a healthy session (normal LB
|
|
219
|
+
# recycling) reuses the same delay — punishing it would make
|
|
220
|
+
# us look broken under benign rolling restarts. Matches
|
|
221
|
+
# sdk-go's `connectedOK` distinction.
|
|
222
|
+
delay = [delay * 2, @options.sse_max_reconnect_delay].min unless connected_at_least_once
|
|
223
|
+
rescue SSEReadDeadlineExceeded => e
|
|
224
|
+
# Paranoid backstop (qfg-tj18). A watchdog raise that landed
|
|
225
|
+
# outside +stream_once+ — typically in +interruptible_sleep+
|
|
226
|
+
# — must not kill the worker thread. We log loudly and let the
|
|
227
|
+
# +until+ loop carry on.
|
|
228
|
+
@logger.error "SSE watchdog late-raise contained: #{e.inspect}; resuming loop"
|
|
229
|
+
end
|
|
183
230
|
end
|
|
184
231
|
end
|
|
232
|
+
ensure
|
|
233
|
+
register_active(nil)
|
|
185
234
|
end
|
|
186
235
|
|
|
187
|
-
|
|
188
|
-
|
|
236
|
+
# Opens one SSE request and yields each parsed event until the stream
|
|
237
|
+
# ends (clean FIN, error, or stop). Raises on transport errors so the
|
|
238
|
+
# caller can apply backoff. Clean FIN returns without raising.
|
|
239
|
+
#
|
|
240
|
+
# A watchdog thread interrupts the worker (via Thread#raise) if no bytes arrive within
|
|
241
|
+
# +sse_read_timeout+. Net::HTTP#read_timeout is NOT reliable for the
|
|
242
|
+
# streaming +read_body do |chunk|+ form — the underlying BufferedIO
|
|
243
|
+
# reads bypass it in practice (a silent server stall blocks indefinitely
|
|
244
|
+
# against a configured deadline). sdk-go and sdk-node hit the same
|
|
245
|
+
# gotcha and solve it the same way: per-chunk reset, async close on
|
|
246
|
+
# expiry (chaos scenario 02 — sse_silent_stall).
|
|
247
|
+
def stream_once(&block)
  # Performs exactly one connect attempt against the next rotated SSE URL
  # and yields every parsed event to +block+ until the stream ends.
  #
  # Returns normally when read_body finishes (server FIN or stop request).
  # Raises SSEHTTPTerminalError for 401/403/404 (after marking the client
  # stopped), SSEHTTPStatusError for any other non-200 status, and lets
  # transport errors / SSEReadDeadlineExceeded propagate to the caller.
  url = "#{current_url}/api/v2/sse/config"
  cursor = current_cursor
  @logger.debug "SSE Streaming Connect to #{url} start_at #{cursor.inspect}"

  uri = URI(url)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == 'https')
  http.open_timeout = @options.sse_connect_timeout
  # Keep Net::HTTP's read_timeout as a backstop for the header read
  # (where it does apply reliably). The watchdog covers the body path.
  http.read_timeout = @options.sse_read_timeout

  req = Net::HTTP::Get.new(uri.request_uri, headers)

  http.start
  # Publish the connection so close() can finish it from another thread.
  register_active(http)

  watchdog = ReadDeadlineWatchdog.new(
    worker: Thread.current, deadline_s: @options.sse_read_timeout,
    stopped: @stopped, logger: @logger
  )
  watchdog.start

  begin
    http.request(req) do |resp|
      code = resp.code.to_i
      if TERMINAL_HTTP_CODES.include?(code)
        # qfg-i5xv: 401/403/404 will not heal by retrying — bad key,
        # revoked permission, or wrong endpoint. Mark stopped *before*
        # invoking on_error so the caller's
        # `handle_error(e) unless @stopped.value` guard suppresses a
        # second on_error edge for the same failure.
        err = SSEHTTPTerminalError.new(code)
        @logger.error "SSE Streaming Terminal Error: HTTP #{code} for url #{url}; will not retry"
        @stopped.make_true
        invoke_on_error(err)
        raise err
      end
      if code != 200
        # Transient status (5xx, 429, ...). Only raise here: the caller's
        # rescue runs handle_error, which delivers the single on_error
        # edge. Invoking on_error here as well would notify the parent
        # twice for one failed attempt.
        @logger.error "SSE Streaming Error: HTTP #{code} for url #{url}"
        raise SSEHTTPStatusError.new(code)
      end

      # Hand the parser the same logger this client was given so
      # malformed-payload errors land in the injected logger.
      parser = EventParser.new(logger: @logger)
      # qfg-tj18: make the watchdog's SSEReadDeadlineExceeded deliverable
      # immediately while we are reading the body, regardless of the
      # interrupt mask the caller installed.
      Thread.handle_interrupt(SSEReadDeadlineExceeded => :immediate) do
        resp.read_body do |chunk|
          # Any bytes at all count as liveness for the read deadline.
          watchdog.reset!
          break if @stopped.value

          parser.feed(chunk, &block)
        end
      end
      # read_body returned without raising: the server ended the stream or
      # a stop was requested. (A watchdog expiry does not reach this line;
      # it surfaces as SSEReadDeadlineExceeded instead.)
      @logger.debug "SSE stream ended for url #{url}"
    end
  ensure
    watchdog.stop
    register_active(nil)
    begin
      http.finish if http.started?
    rescue StandardError
      # already closed
    end
  end
end
|
|
227
323
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
@logger.error "SSE Streaming Error: #{error.inspect} for url #{url}"
|
|
234
|
-
end
|
|
324
|
+
# Track the active connection so close() can interrupt a blocked
|
|
325
|
+
# read_body from another thread. Guarded by @conn_mutex.
|
|
326
|
+
def register_active(http)
|
|
327
|
+
@conn_mutex.synchronize { @active_http = http }
|
|
328
|
+
end
|
|
235
329
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
# would double-count. For closing errors (HTTP::ConnectionError) the
|
|
240
|
-
# reconnect is counted in @retry_thread instead. on_error's job is
|
|
241
|
-
# purely to notify the parent client of the disconnect edge.
|
|
242
|
-
|
|
243
|
-
# Notify the parent client BEFORE deciding whether to close — every
|
|
244
|
-
# error edge is a disconnect signal as far as @sse_state goes, even
|
|
245
|
-
# if we let the underlying SSE library handle reconnect itself.
|
|
246
|
-
# qfg-47c2.27
|
|
247
|
-
if @on_error
|
|
248
|
-
begin
|
|
249
|
-
@on_error.call(error)
|
|
250
|
-
rescue StandardError => e
|
|
251
|
-
@logger.error "SSE on_error callback raised: #{e.inspect}"
|
|
252
|
-
end
|
|
253
|
-
end
|
|
330
|
+
def increment_restart!
|
|
331
|
+
@restart_mutex.synchronize { @restart_total += 1 }
|
|
332
|
+
end
|
|
254
333
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
334
|
+
def handle_error(error)
|
|
335
|
+
@logger.error "SSE Streaming Error: #{error.inspect}"
|
|
336
|
+
invoke_on_error(error)
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
# qfg-m3lk: rescue StandardError (NOT Exception) so SystemExit /
|
|
340
|
+
# Interrupt / SignalException still escape — Ctrl-C inside a customer
|
|
341
|
+
# callback must still kill the process. StandardError is the right
|
|
342
|
+
# boundary for "the caller's listener has a bug".
|
|
343
|
+
def invoke_on_envelope_safely(on_envelope, event)
|
|
344
|
+
on_envelope.call(event.envelope, event, :sse)
|
|
345
|
+
rescue StandardError => e
|
|
346
|
+
@on_envelope_error_mutex.synchronize { @on_envelope_error_total += 1 }
|
|
347
|
+
bt = (e.backtrace || []).first(5).join("\n ")
|
|
348
|
+
@logger.error "SSE on_envelope callback raised: #{e.class}: #{e.message}\n #{bt}"
|
|
349
|
+
end
|
|
350
|
+
|
|
351
|
+
def invoke_on_error(error)
|
|
352
|
+
return unless @on_error
|
|
353
|
+
|
|
354
|
+
begin
|
|
355
|
+
@on_error.call(error)
|
|
356
|
+
rescue StandardError => e
|
|
357
|
+
@logger.error "SSE on_error callback raised: #{e.inspect}"
|
|
260
358
|
end
|
|
261
359
|
end
|
|
262
360
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
'X-Quonfig-SDK-Version' => "ruby-#{Quonfig::VERSION}"
|
|
270
|
-
}
|
|
361
|
+
# Jitter: half the delay plus a random amount below the other half — caps thundering-herd amplitude after a partition heal.
|
|
362
|
+
# Identical shape to ld-eventsource's Backoff#next_interval (and
|
|
363
|
+
# sdk-go's runLoop jitter) so we don't surprise operators familiar with
|
|
364
|
+
# those.
|
|
365
|
+
def jittered(delay)
  # Returns a randomized sleep duration drawn uniformly from
  # [delay / 2, delay), as a Float.
  #
  # Kernel#rand truncates a Float max to an Integer (rand(1.0) is always 0,
  # rand(0.5) falls back to a float in [0, 1)), which either removes the
  # jitter, quantizes it to whole seconds, or overshoots +delay+. Scaling
  # the argument-less rand (a Float in [0, 1)) avoids all three and also
  # yields 0.0 for a zero delay.
  half = delay / 2.0
  half + (rand * half)
end
|
|
272
368
|
|
|
273
|
-
|
|
274
|
-
|
|
369
|
+
# Sleep with interrupt: chunks the sleep so close() during a long
|
|
370
|
+
# backoff doesn't block shutdown for tens of seconds.
|
|
371
|
+
def interruptible_sleep(seconds)
  # Sleeps for up to +seconds+, waking at least every 0.1s to re-check the
  # stop flag so a shutdown request cuts a long backoff short. Returns nil.
  wake_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) + seconds
  loop do
    return if @stopped.value

    left = wake_at - Process.clock_gettime(Process::CLOCK_MONOTONIC)
    return if left <= 0

    # Never nap longer than one 0.1s slice, nor past the deadline.
    sleep(left < 0.1 ? left : 0.1)
  end
end
|
|
277
380
|
|
|
278
|
-
|
|
381
|
+
# Rotate through configured SSE URLs. The same rotation rule the
|
|
382
|
+
# previous implementation used, preserved so multi-region failover
|
|
383
|
+
# behavior is unchanged.
|
|
384
|
+
def current_url
  # Advances the round-robin position by one (wrapping at the end of the
  # configured list) and returns the SSE base URL at the new position.
  candidates = @prefab_options.sse_api_urls
  @source_index = @source_index.succ % candidates.size
  candidates[@source_index]
end
|
|
280
389
|
|
|
281
|
-
#
|
|
282
|
-
#
|
|
283
|
-
#
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
390
|
+
# Internal: HTTP-status sentinel error for non-200 SSE responses. Surfaces
|
|
391
|
+
# the status code through #message so parent on_error callbacks can log
|
|
392
|
+
# meaningfully without depending on ld-eventsource's error hierarchy.
|
|
393
|
+
class SSEHTTPStatusError < StandardError
  # The HTTP status code carried by this error, exactly as given to the
  # constructor.
  attr_reader :status_code

  # Builds the error with the message "HTTP <status_code>".
  def initialize(status_code)
    super("HTTP #{status_code}")
    @status_code = status_code
  end
end
|
|
291
401
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
402
|
+
# qfg-i5xv: terminal HTTP failures the SDK will not retry. 401 = bad key,
|
|
403
|
+
# 403 = revoked workspace permission, 404 = wrong endpoint / missing
|
|
404
|
+
# workspace. A subclass of SSEHTTPStatusError so existing on_error
|
|
405
|
+
# callbacks that only check `is_a?(SSEHTTPStatusError)` keep working,
|
|
406
|
+
# while customers that want to distinguish (alerting, OpenFeature
|
|
407
|
+
# provider error events) can dispatch on the subclass.
|
|
408
|
+
class SSEHTTPTerminalError < SSEHTTPStatusError; end
|
|
409
|
+
|
|
410
|
+
# Raised by the watchdog into the worker thread when the per-chunk
|
|
411
|
+
# read deadline elapses. Caught by run_loop's rescue, indistinguishable
|
|
412
|
+
# from any other transport error for backoff/restart purposes.
|
|
413
|
+
class SSEReadDeadlineExceeded < StandardError; end
|
|
414
|
+
|
|
415
|
+
# Background watchdog that interrupts the worker thread if no chunk
|
|
416
|
+
# arrives within +deadline_s+ seconds. Uses Thread#raise — the only
|
|
417
|
+
# reliable cross-platform way to unblock a Ruby thread blocked in
|
|
418
|
+
# +Net::HTTP+'s body-read on macOS. (Closing or shutting down the
|
|
419
|
+
# underlying socket from another thread does NOT wake the reader on
|
|
420
|
+
# macOS; the kernel discards future reads but the in-flight syscall
|
|
421
|
+
# stays blocked until something else trips. sdk-go and sdk-node solve
|
|
422
|
+
# the equivalent problem with context cancellation / AbortController,
|
|
423
|
+
# which Ruby lacks at the IO layer.) Thread#raise is essentially what
|
|
424
|
+
# +Timeout.timeout+ does internally; using it directly avoids
|
|
425
|
+
# Timeout.timeout's sketchy reputation around ensure blocks.
|
|
426
|
+
class ReadDeadlineWatchdog
|
|
427
|
+
POLL_INTERVAL = 0.25
|
|
428
|
+
|
|
429
|
+
def initialize(worker:, deadline_s:, stopped:, logger:)
|
|
430
|
+
@worker = worker
|
|
431
|
+
@deadline_s = deadline_s
|
|
432
|
+
@stopped = stopped
|
|
433
|
+
@logger = logger
|
|
434
|
+
@active = true
|
|
435
|
+
# Mutex covers @active AND the decision to fire Thread#raise. stop()
|
|
436
|
+
# holds the mutex when flipping @active false, so a +stop+ that
|
|
437
|
+
# arrives mid-deadline-check cannot lose the race against the
|
|
438
|
+
# watchdog's @worker.raise call (which would inject a spurious
|
|
439
|
+
# SSEReadDeadlineExceeded into the worker thread right after a
|
|
440
|
+
# clean read_body return).
|
|
441
|
+
@mutex = Mutex.new
|
|
442
|
+
@last_read_at = Concurrent::AtomicReference.new(Process.clock_gettime(Process::CLOCK_MONOTONIC))
|
|
296
443
|
end
|
|
297
444
|
|
|
298
|
-
|
|
445
|
+
def start
|
|
446
|
+
@thread = Thread.new { watch }
|
|
447
|
+
end
|
|
448
|
+
|
|
449
|
+
def reset!
|
|
450
|
+
@last_read_at.set(Process.clock_gettime(Process::CLOCK_MONOTONIC))
|
|
451
|
+
end
|
|
452
|
+
|
|
453
|
+
def stop
|
|
454
|
+
@mutex.synchronize { @active = false }
|
|
455
|
+
@thread&.join(1)
|
|
456
|
+
@thread = nil
|
|
457
|
+
end
|
|
458
|
+
|
|
459
|
+
private
|
|
460
|
+
|
|
461
|
+
def watch
|
|
462
|
+
loop do
|
|
463
|
+
sleep POLL_INTERVAL
|
|
464
|
+
break unless @mutex.synchronize { @active } && !@stopped.value
|
|
465
|
+
|
|
466
|
+
idle = Process.clock_gettime(Process::CLOCK_MONOTONIC) - @last_read_at.value
|
|
467
|
+
next if idle < @deadline_s
|
|
468
|
+
|
|
469
|
+
fired = @mutex.synchronize do
|
|
470
|
+
next false unless @active && !@stopped.value
|
|
471
|
+
|
|
472
|
+
@logger.debug "SSE read deadline exceeded (#{idle.round(1)}s idle >= #{@deadline_s}s); interrupting worker"
|
|
473
|
+
@worker.raise(SSEReadDeadlineExceeded.new("SSE read deadline #{@deadline_s}s exceeded"))
|
|
474
|
+
true
|
|
475
|
+
end
|
|
476
|
+
break if fired
|
|
477
|
+
end
|
|
478
|
+
rescue StandardError => e
|
|
479
|
+
# Watchdog must never crash the SDK. Worst case we silently fall
|
|
480
|
+
# back to Net::HTTP's own (unreliable) read_timeout.
|
|
481
|
+
@logger.debug "SSE watchdog error: #{e.inspect}"
|
|
482
|
+
end
|
|
483
|
+
end
|
|
484
|
+
|
|
485
|
+
# Streaming SSE parser. Accepts byte chunks (any encoding), yields one
|
|
486
|
+
# Quonfig::StreamEvent per complete event. Tolerates:
|
|
487
|
+
# - chunks that split a UTF-8 multi-byte character (buffer in 8-bit,
|
|
488
|
+
# transcode whole lines)
|
|
489
|
+
# - chunks that split a line mid-way
|
|
490
|
+
# - any of CR / LF / CRLF as line terminators
|
|
491
|
+
# - +data:+, +data: + (optional space per SSE spec)
|
|
492
|
+
# - +:comment+ lines (keepalives — ignored)
|
|
493
|
+
# - multi-line +data:+ (concatenated with +\n+, per spec)
|
|
494
|
+
# Ignores +event:+ and +retry:+ — api-delivery does not emit them and the
|
|
495
|
+
# Quonfig wire contract does not honor reconnect-time directives.
|
|
496
|
+
# Malformed +data:+ JSON is logged and skipped; one bad event does not
|
|
497
|
+
# tear down the stream.
|
|
498
|
+
class EventParser
  # logger: optional object responding to #error; when nil, errors go to LOG.
  def initialize(logger: nil)
    @logger = logger
    @reader = LineReader.new
    @data = +''
    @have_data = false
    @id = nil
  end

  # Feeds one raw chunk into the parser and yields a StreamEvent for every
  # event completed by it. A blank line dispatches the pending event, lines
  # starting with ':' are comments, everything else is a field line.
  # Events whose payload cannot be used are logged and skipped, never raised.
  def feed(chunk)
    @reader.feed(chunk) do |line|
      if line.empty?
        event = flush
        yield event if event
      elsif line.start_with?(':')
        # comment / keepalive — ignore
      else
        process_field(line)
      end
    end
  end

  private

  # Applies a single "name:value" line to the pending event. Lines without
  # a colon and fields other than data / id are ignored.
  def process_field(line)
    idx = line.index(':')
    return unless idx

    name = line[0...idx]
    rest = line[(idx + 1)..]
    # A single leading space after the colon is not part of the value.
    rest = rest[1..] if rest.start_with?(' ')

    case name
    when 'data'
      if @have_data
        # Subsequent data lines of the same event are newline-joined.
        @data << "\n" << rest
      else
        @data = rest
        @have_data = true
      end
    when 'id'
      # An id containing NUL is ignored and the previous id is kept.
      @id = rest unless rest.include?("\x00")
      # event: / retry: are intentionally ignored
    end
  end

  # Turns the buffered data into a StreamEvent and resets the data buffer.
  # Returns nil when there is no pending data, when the payload is not
  # valid JSON, or when it is valid JSON but not an object.
  def flush
    return nil unless @have_data

    data = @data
    id = @id
    @data = +''
    @have_data = false
    # NB: @id persists across events — the SSE spec says last-event-id
    # is sticky until overwritten. Matches ld-eventsource.

    begin
      parsed = JSON.parse(data)
    rescue JSON::ParserError => e
      log_error "SSE Streaming Error: malformed JSON: #{e.message}"
      return nil
    end

    # A well-formed non-object payload (null, array, number, string) would
    # blow up on the key lookups below and escape feed as if it were a
    # transport failure. Treat it like any other bad event: log and skip.
    unless parsed.is_a?(Hash)
      log_error "SSE Streaming Error: malformed JSON: expected an object, got #{parsed.class}"
      return nil
    end

    envelope = Quonfig::ConfigEnvelope.new(
      configs: parsed['configs'] || [],
      meta: parsed['meta'] || {}
    )
    StreamEvent.new(envelope, id, data)
  end

  # Routes an error line to the injected logger, falling back to LOG.
  def log_error(message)
    (@logger || LOG).error message
  end
end
|
|
568
|
+
|
|
569
|
+
# Byte-level line reader. Accepts arbitrary chunks, yields one UTF-8
|
|
570
|
+
# line per call to the block. Terminator-stripped (CR / LF / CRLF
|
|
571
|
+
# supported). Modeled on ld-eventsource's BufferedLineReader — same
|
|
572
|
+
# invariants: split bytes-not-chars while scanning, force-encode to
|
|
573
|
+
# UTF-8 only once a complete line is sliced out, so a multi-byte
|
|
574
|
+
# character spanning two chunks does not raise Encoding::CompatibilityError.
|
|
575
|
+
class LineReader
  def initialize
    # Pending bytes are held in a binary string so a multi-byte character
    # split across chunks is never inspected as (invalid) UTF-8.
    @buffer = String.new(encoding: Encoding::BINARY)
    # True when the previous line ended on a CR that was the last byte
    # available — a following LF then belongs to that same terminator.
    @last_was_cr = false
  end

  # Appends +chunk+ to the pending bytes and yields every complete line
  # (terminator removed, tagged as UTF-8) now available. CR, LF and CRLF
  # all end a line. Bytes after the last terminator stay buffered.
  def feed(chunk)
    @buffer << chunk.b
    loop do
      pos = @buffer.index(/[\r\n]/)
      break unless pos

      terminator = @buffer[pos]
      cr_pending = @last_was_cr
      @last_was_cr = false

      if cr_pending && pos.zero? && terminator == "\n"
        # Second half of a CRLF that straddled two feeds — drop it silently.
        @buffer.slice!(0, 1)
        next
      end

      width = 1
      if terminator == "\r"
        if pos + 1 == @buffer.bytesize
          # CR is the last byte we have; the LF may arrive in the next feed.
          @last_was_cr = true
        elsif @buffer[pos + 1] == "\n"
          width = 2
        end
      end

      # Remove line + terminator from the buffer before handing the line out.
      consumed = @buffer.slice!(0, pos + width)
      yield consumed[0, pos].force_encoding('UTF-8')
    end
  end
end
|
|
300
611
|
end
|
|
301
612
|
end
|