switchroom 0.14.93 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +22 -1
- package/dist/auth-broker/index.js +84 -13
- package/dist/cli/drive-write-pretool.mjs +23 -2
- package/dist/cli/switchroom.js +39 -6
- package/dist/cli/ui/index.html +16 -5
- package/package.json +1 -1
- package/telegram-plugin/auth-snapshot-format.ts +9 -0
- package/telegram-plugin/auto-fallback-fleet.ts +59 -0
- package/telegram-plugin/dist/gateway/gateway.js +225 -20
- package/telegram-plugin/gateway/auth-broker-client.ts +2 -0
- package/telegram-plugin/gateway/auth-command.ts +35 -2
- package/telegram-plugin/gateway/gateway.ts +203 -18
- package/telegram-plugin/quota-watch.ts +141 -3
- package/telegram-plugin/tests/auth-quota-util-cell.test.ts +23 -0
- package/telegram-plugin/tests/auto-fallback-fleet.test.ts +71 -0
- package/telegram-plugin/tests/quota-watch.test.ts +266 -0
|
@@ -157,7 +157,7 @@ import {
|
|
|
157
157
|
formatModelUnavailableCard,
|
|
158
158
|
resolveModelUnavailableFromOperatorEvent,
|
|
159
159
|
} from '../model-unavailable.js'
|
|
160
|
-
import { runFleetAutoFallback } from '../auto-fallback-fleet.js'
|
|
160
|
+
import { runFleetAutoFallback, renderFallbackFailureNotice, evaluateFallbackFailureNotice, type FallbackFailureNoticeState } from '../auto-fallback-fleet.js'
|
|
161
161
|
import { startRestartWatchdog } from './restart-watchdog.js'
|
|
162
162
|
import { validateStringArray } from './access-validator.js'
|
|
163
163
|
|
|
@@ -422,6 +422,9 @@ import {
|
|
|
422
422
|
saveQuotaWatchState,
|
|
423
423
|
patchQuotaWatchState,
|
|
424
424
|
emptyAccountState,
|
|
425
|
+
resolveQuotaWatchTuning,
|
|
426
|
+
buildQuotaClaimKey,
|
|
427
|
+
QUOTA_WATCH_CLAIM_WINDOW_MS,
|
|
425
428
|
} from '../quota-watch.js'
|
|
426
429
|
import { buildSnapshotsFromState, buildSnapshotsFromCachedState } from '../auth-snapshot-format.js'
|
|
427
430
|
import {
|
|
@@ -14804,6 +14807,51 @@ async function fireFleetAutoFallback(triggerAgent: string, untilMs?: number): Pr
|
|
|
14804
14807
|
)
|
|
14805
14808
|
}
|
|
14806
14809
|
|
|
14810
|
+
/**
|
|
14811
|
+
* Broadcast a fleet-fallback FAILURE notice to every authorized chat.
|
|
14812
|
+
*
|
|
14813
|
+
* Why this exists: the model-unavailable card renders "Auto-failover in
|
|
14814
|
+
* progress — see the announcement below" BEFORE the dispatcher's outcome
|
|
14815
|
+
* is known. When the dispatcher errors (broker down, listState throw,
|
|
14816
|
+
* markExhausted failure), the success announcement never lands and the
|
|
14817
|
+
* card's promise is broken — the 2026-06-06→07 incident sent 12 such
|
|
14818
|
+
* broken-promise cards while every fallback errored "set-active requires
|
|
14819
|
+
* admin". The admin-gating root cause is fixed (#2206), but ANY future
|
|
14820
|
+
* dispatcher error reproduces the broken promise. This notice closes the
|
|
14821
|
+
* loop deterministically: card promised an announcement → an
|
|
14822
|
+
* announcement ALWAYS arrives, success or failure.
|
|
14823
|
+
*
|
|
14824
|
+
* Disable with SWITCHROOM_FLEET_FALLBACK_FAILURE_NOTICE=0 (log-only,
|
|
14825
|
+
* pre-fix behaviour).
|
|
14826
|
+
*/
|
|
14827
|
+
let fallbackFailureNoticeState: FallbackFailureNoticeState = { lastSentAtMs: 0 }
|
|
14828
|
+
|
|
14829
|
+
function broadcastFleetFallbackFailure(triggerAgent: string, reason: string): void {
|
|
14830
|
+
if (process.env.SWITCHROOM_FLEET_FALLBACK_FAILURE_NOTICE === '0') return
|
|
14831
|
+
// Notice-level cooldown (30 min, per gateway). The fleetFallbackGate's
|
|
14832
|
+
// dedup window only arms on SUCCESSFUL swaps, so it bounds nothing
|
|
14833
|
+
// here — and the card-less quota_wall_detected trigger re-fires every
|
|
14834
|
+
// ~60s during a wall. Without this, a persistent broker outage would
|
|
14835
|
+
// stream failure notices for days. See evaluateFallbackFailureNotice.
|
|
14836
|
+
const verdict = evaluateFallbackFailureNotice(fallbackFailureNoticeState, Date.now())
|
|
14837
|
+
if (!verdict.send) {
|
|
14838
|
+
process.stderr.write(
|
|
14839
|
+
`telegram gateway: [fleet-fallback] failure notice suppressed (cooldown) agent=${triggerAgent}: ${reason}\n`,
|
|
14840
|
+
)
|
|
14841
|
+
return
|
|
14842
|
+
}
|
|
14843
|
+
fallbackFailureNoticeState = verdict.next
|
|
14844
|
+
const access = loadAccess()
|
|
14845
|
+
if (access.allowFrom.length === 0) return
|
|
14846
|
+
const html = renderFallbackFailureNotice(triggerAgent, reason)
|
|
14847
|
+
for (const chat_id of access.allowFrom) {
|
|
14848
|
+
void swallowingApiCall(
|
|
14849
|
+
() => bot.api.sendMessage(chat_id, html, { parse_mode: 'HTML' as const }),
|
|
14850
|
+
{ chat_id, verb: 'fleet-fallback:failure-notify' },
|
|
14851
|
+
)
|
|
14852
|
+
}
|
|
14853
|
+
}
|
|
14854
|
+
|
|
14807
14855
|
/** Returns true iff the dispatcher actually performed a swap (and the
|
|
14808
14856
|
* user-visible announcement was broadcast). False on no-op /
|
|
14809
14857
|
* error / idempotent-skip — caller uses this to decide whether to
|
|
@@ -14815,6 +14863,9 @@ async function doFireFleetAutoFallback(triggerAgent: string, untilMs?: number):
|
|
|
14815
14863
|
process.stderr.write(
|
|
14816
14864
|
`telegram gateway: [fleet-fallback] skipped agent=${triggerAgent} reason=no-broker-client\n`,
|
|
14817
14865
|
)
|
|
14866
|
+
// The model-unavailable card may have promised an announcement —
|
|
14867
|
+
// keep the promise even though nothing could run.
|
|
14868
|
+
broadcastFleetFallbackFailure(triggerAgent, 'auth-broker unreachable (no client).')
|
|
14818
14869
|
return false
|
|
14819
14870
|
}
|
|
14820
14871
|
const state = await client.listState()
|
|
@@ -14878,6 +14929,10 @@ async function doFireFleetAutoFallback(triggerAgent: string, untilMs?: number):
|
|
|
14878
14929
|
process.stderr.write(
|
|
14879
14930
|
`telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
|
|
14880
14931
|
)
|
|
14932
|
+
// Keep the card's "see the announcement below" promise on the error
|
|
14933
|
+
// path — the 06-06→07 incident sent 12 cards whose promised
|
|
14934
|
+
// announcement never arrived because this catch was log-only.
|
|
14935
|
+
broadcastFleetFallbackFailure(triggerAgent, (err as Error)?.message ?? String(err))
|
|
14881
14936
|
return false
|
|
14882
14937
|
}
|
|
14883
14938
|
}
|
|
@@ -14969,9 +15024,34 @@ async function runCreditWatch(): Promise<void> {
|
|
|
14969
15024
|
* State persists across restarts via `<stateDir>/quota-watch.json`.
|
|
14970
15025
|
* Mirrors runCreditWatch's structure and notification routing.
|
|
14971
15026
|
*/
|
|
14972
|
-
|
|
15027
|
+
|
|
15028
|
+
/**
|
|
15029
|
+
* Ask the broker for the fleet-wide dedup claim on one notification key.
|
|
15030
|
+
* FAIL-OPEN on any error: a broker that predates the `claim-notification`
|
|
15031
|
+
* op (skewed rollout) rejects at the protocol layer, and a transient IPC
|
|
15032
|
+
* failure must degrade to duplicated notifications, never dropped ones.
|
|
15033
|
+
*/
|
|
15034
|
+
async function claimQuotaNotification(
|
|
15035
|
+
brokerClient: NonNullable<Awaited<ReturnType<typeof getAuthBrokerClient>>>,
|
|
15036
|
+
key: string,
|
|
15037
|
+
): Promise<boolean> {
|
|
15038
|
+
try {
|
|
15039
|
+
const res = await brokerClient.claimNotification(key, QUOTA_WATCH_CLAIM_WINDOW_MS)
|
|
15040
|
+
return res.granted
|
|
15041
|
+
} catch (err) {
|
|
15042
|
+
process.stderr.write(`telegram gateway: quota-watch: claim failed (fail-open): ${err}\n`)
|
|
15043
|
+
return true
|
|
15044
|
+
}
|
|
15045
|
+
}
|
|
15046
|
+
|
|
15047
|
+
async function runQuotaWatch(opts: { bootTick?: boolean } = {}): Promise<void> {
|
|
14973
15048
|
const agentName = getMyAgentName()
|
|
14974
15049
|
const stateDir = STATE_DIR
|
|
15050
|
+
const bootTick = opts.bootTick ?? false
|
|
15051
|
+
// Hardening knobs (2026-06-09 incident: fleet bounce released stale
|
|
15052
|
+
// recovery latches on all 11 agents at once → 26 duplicate sends).
|
|
15053
|
+
// See QuotaWatchTuning in quota-watch.ts for the env contract.
|
|
15054
|
+
const tuning = resolveQuotaWatchTuning(process.env)
|
|
14975
15055
|
|
|
14976
15056
|
// Read broker state. The listState response now includes last_quota
|
|
14977
15057
|
// per account — the broker's in-memory cache from previous probeQuota
|
|
@@ -15019,6 +15099,20 @@ async function runQuotaWatch(): Promise<void> {
|
|
|
15019
15099
|
})
|
|
15020
15100
|
if (fleetDecision.kind === 'notify') {
|
|
15021
15101
|
for (const chat_id of access.allowFrom) {
|
|
15102
|
+
// Fleet-level dedup: all 11 gateways detect this same edge within
|
|
15103
|
+
// one poll cycle — only the broker-claim winner sends per chat.
|
|
15104
|
+
if (tuning.fleetDedup) {
|
|
15105
|
+
const granted = await claimQuotaNotification(
|
|
15106
|
+
brokerClient,
|
|
15107
|
+
buildQuotaClaimKey(FLEET_ALL_EXHAUSTED_KEY, fleetDecision.transition, chat_id),
|
|
15108
|
+
)
|
|
15109
|
+
if (!granted) {
|
|
15110
|
+
process.stderr.write(
|
|
15111
|
+
`telegram gateway: quota-watch: fleet-all-exhausted claim denied chat=${chat_id} — another agent notified\n`,
|
|
15112
|
+
)
|
|
15113
|
+
continue
|
|
15114
|
+
}
|
|
15115
|
+
}
|
|
15022
15116
|
await swallowingApiCall(
|
|
15023
15117
|
() =>
|
|
15024
15118
|
bot.api.sendMessage(chat_id, fleetDecision.message, {
|
|
@@ -15056,10 +15150,21 @@ async function runQuotaWatch(): Promise<void> {
|
|
|
15056
15150
|
snapshots.map((s, i) => [s.label, i]),
|
|
15057
15151
|
)
|
|
15058
15152
|
|
|
15153
|
+
// Reconciled transitions: state advances (latch clears) but nothing is
|
|
15154
|
+
// sent — boot-tick and late recoveries (see QuotaWatchDecision docs).
|
|
15155
|
+
let reconciledCount = 0
|
|
15156
|
+
let mutatedState = watchState
|
|
15157
|
+
|
|
15059
15158
|
for (const snap of snapshots) {
|
|
15060
15159
|
const prev = watchState[snap.label] ?? emptyAccountState()
|
|
15061
|
-
const decision = evaluateQuotaWatchAccount({ agentName, snap, prev, now })
|
|
15062
|
-
if (decision.kind !== 'skip') {
|
|
15160
|
+
const decision = evaluateQuotaWatchAccount({ agentName, snap, prev, now, bootTick, tuning })
|
|
15161
|
+
if (decision.kind === 'reconcile') {
|
|
15162
|
+
mutatedState = patchQuotaWatchState(mutatedState, decision.accountLabel, decision.newAccountState)
|
|
15163
|
+
reconciledCount++
|
|
15164
|
+
process.stderr.write(
|
|
15165
|
+
`telegram gateway: quota-watch: reconciled ${decision.transition} for account=${decision.accountLabel} (${decision.reason}) — no notification\n`,
|
|
15166
|
+
)
|
|
15167
|
+
} else if (decision.kind !== 'skip') {
|
|
15063
15168
|
pendingTransitions.push({
|
|
15064
15169
|
accountLabel: snap.label,
|
|
15065
15170
|
snapIndex: labelToSnapIndex.get(snap.label) ?? -1,
|
|
@@ -15069,7 +15174,16 @@ async function runQuotaWatch(): Promise<void> {
|
|
|
15069
15174
|
}
|
|
15070
15175
|
|
|
15071
15176
|
if (pendingTransitions.length === 0) {
|
|
15072
|
-
|
|
15177
|
+
// Steady-state: no notifications, no probes. Persist only if a
|
|
15178
|
+
// reconcile advanced the latch (otherwise no state write at all).
|
|
15179
|
+
if (reconciledCount > 0) {
|
|
15180
|
+
try {
|
|
15181
|
+
saveQuotaWatchState(stateDir, mutatedState)
|
|
15182
|
+
} catch (err) {
|
|
15183
|
+
process.stderr.write(`telegram gateway: quota-watch state persist failed: ${err}\n`)
|
|
15184
|
+
}
|
|
15185
|
+
}
|
|
15186
|
+
return
|
|
15073
15187
|
}
|
|
15074
15188
|
|
|
15075
15189
|
// Transition detected: probe ONLY the crossing accounts to get fresh
|
|
@@ -15083,16 +15197,31 @@ async function runQuotaWatch(): Promise<void> {
|
|
|
15083
15197
|
freshProbeMap.set(entry.label, entry.result)
|
|
15084
15198
|
}
|
|
15085
15199
|
} catch (err) {
|
|
15086
|
-
// Probe failed — still send notifications using cached data.
|
|
15087
|
-
// Don't abort: the user should know about the threshold crossing
|
|
15088
|
-
// even if the message body shows slightly stale numbers.
|
|
15089
15200
|
process.stderr.write(`telegram gateway: quota-watch: probe for crossing accounts failed: ${err}\n`)
|
|
15201
|
+
if (!tuning.sendOnProbeFail) {
|
|
15202
|
+
// A quota notification must never carry numbers we could not verify
|
|
15203
|
+
// live. Leave the crossing accounts' state untouched — the
|
|
15204
|
+
// transition re-evaluates (and re-probes) on the next 15-min tick.
|
|
15205
|
+
// Persist any reconciles already applied, then bail.
|
|
15206
|
+
if (reconciledCount > 0) {
|
|
15207
|
+
try {
|
|
15208
|
+
saveQuotaWatchState(stateDir, mutatedState)
|
|
15209
|
+
} catch (saveErr) {
|
|
15210
|
+
process.stderr.write(`telegram gateway: quota-watch state persist failed: ${saveErr}\n`)
|
|
15211
|
+
}
|
|
15212
|
+
}
|
|
15213
|
+
process.stderr.write(
|
|
15214
|
+
`telegram gateway: quota-watch: deferring ${pendingTransitions.length} notification(s) until probe succeeds\n`,
|
|
15215
|
+
)
|
|
15216
|
+
return
|
|
15217
|
+
}
|
|
15218
|
+
// Legacy (SWITCHROOM_QUOTA_WATCH_SEND_ON_PROBE_FAIL=1): fall through
|
|
15219
|
+
// and send from cached data.
|
|
15090
15220
|
}
|
|
15091
15221
|
|
|
15092
15222
|
// Build final notifications, enriching the snapshot with fresh probe
|
|
15093
15223
|
// data where available.
|
|
15094
|
-
|
|
15095
|
-
const notifications: Array<{ message: string; accountLabel: string }> = []
|
|
15224
|
+
const notifications: Array<{ message: string; accountLabel: string; transition: string }> = []
|
|
15096
15225
|
|
|
15097
15226
|
for (const { accountLabel, snapIndex, decision } of pendingTransitions) {
|
|
15098
15227
|
// Re-evaluate with fresh probe data to get an accurate message body.
|
|
@@ -15100,37 +15229,88 @@ async function runQuotaWatch(): Promise<void> {
|
|
|
15100
15229
|
const freshResult = freshProbeMap.get(accountLabel)
|
|
15101
15230
|
let enrichedDecision = decision
|
|
15102
15231
|
// pendingTransitions only ever holds notify decisions (pushed under
|
|
15103
|
-
// `decision.kind !== 'skip'`). Narrow explicitly so `decision.transition`
|
|
15104
|
-
// type-checks below; this continue never fires at runtime.
|
|
15232
|
+
// `decision.kind !== 'skip'` / `!== 'reconcile'`). Narrow explicitly so
|
|
15233
|
+
// `decision.transition` type-checks below; this continue never fires
|
|
15234
|
+
// at runtime.
|
|
15105
15235
|
if (decision.kind !== 'notify') continue
|
|
15106
15236
|
if (freshResult && freshResult.ok && snapIndex >= 0) {
|
|
15107
|
-
|
|
15237
|
+
// Live numbers replace the cache — and capturedAtMs is cleared so the
|
|
15238
|
+
// staleness gate never misfires on data we JUST probed.
|
|
15239
|
+
const enrichedSnap = { ...snapshots[snapIndex]!, quota: freshResult.data, capturedAtMs: undefined }
|
|
15108
15240
|
const prev = watchState[accountLabel] ?? emptyAccountState()
|
|
15109
|
-
const re = evaluateQuotaWatchAccount({ agentName, snap: enrichedSnap, prev, now })
|
|
15241
|
+
const re = evaluateQuotaWatchAccount({ agentName, snap: enrichedSnap, prev, now, bootTick, tuning })
|
|
15110
15242
|
// If the fresh probe still shows the same transition, use the
|
|
15111
15243
|
// enriched message. If it no longer shows a transition (e.g. the
|
|
15112
15244
|
// account recovered in the 100ms between listState and probe),
|
|
15113
15245
|
// fall through to skip this notification.
|
|
15114
15246
|
if (re.kind === 'notify' && re.transition === decision.transition) {
|
|
15115
15247
|
enrichedDecision = re
|
|
15248
|
+
} else if (re.kind === 'reconcile') {
|
|
15249
|
+
// Fresh data confirms the transition but it isn't news (boot-tick /
|
|
15250
|
+
// late recovery) — advance the latch silently.
|
|
15251
|
+
mutatedState = patchQuotaWatchState(mutatedState, accountLabel, re.newAccountState)
|
|
15252
|
+
reconciledCount++
|
|
15253
|
+
process.stderr.write(
|
|
15254
|
+
`telegram gateway: quota-watch: reconciled ${re.transition} for account=${accountLabel} (${re.reason}) — no notification\n`,
|
|
15255
|
+
)
|
|
15256
|
+
continue
|
|
15116
15257
|
} else if (re.kind === 'skip') {
|
|
15117
15258
|
// State normalised by the time of the probe — don't notify.
|
|
15118
15259
|
continue
|
|
15119
15260
|
}
|
|
15261
|
+
} else if (!tuning.sendOnProbeFail) {
|
|
15262
|
+
// No verified fresh data for this account (per-account probe failure
|
|
15263
|
+
// or label missing from the batch result). Same rule as the batch
|
|
15264
|
+
// throw above: never send unverified numbers. State untouched —
|
|
15265
|
+
// re-evaluated (and re-probed) next tick.
|
|
15266
|
+
process.stderr.write(
|
|
15267
|
+
`telegram gateway: quota-watch: probe unavailable for account=${accountLabel} — deferring notification\n`,
|
|
15268
|
+
)
|
|
15269
|
+
continue
|
|
15120
15270
|
}
|
|
15121
15271
|
|
|
15122
15272
|
if (enrichedDecision.kind !== 'notify') continue
|
|
15123
|
-
notifications.push({ message: enrichedDecision.message, accountLabel })
|
|
15273
|
+
notifications.push({
|
|
15274
|
+
message: enrichedDecision.message,
|
|
15275
|
+
accountLabel,
|
|
15276
|
+
transition: enrichedDecision.transition,
|
|
15277
|
+
})
|
|
15124
15278
|
mutatedState = patchQuotaWatchState(mutatedState, accountLabel, enrichedDecision.newAccountState)
|
|
15125
15279
|
}
|
|
15126
15280
|
|
|
15127
15281
|
if (notifications.length === 0) {
|
|
15128
|
-
|
|
15282
|
+
// All transitions resolved/deferred by the time of the live probe.
|
|
15283
|
+
// Reconciles may still have advanced the latch — persist those.
|
|
15284
|
+
if (reconciledCount > 0) {
|
|
15285
|
+
try {
|
|
15286
|
+
saveQuotaWatchState(stateDir, mutatedState)
|
|
15287
|
+
} catch (err) {
|
|
15288
|
+
process.stderr.write(`telegram gateway: quota-watch state persist failed: ${err}\n`)
|
|
15289
|
+
}
|
|
15290
|
+
}
|
|
15291
|
+
return
|
|
15129
15292
|
}
|
|
15130
15293
|
|
|
15131
15294
|
// Send all notifications (one message per crossing account).
|
|
15132
|
-
for (const { message, accountLabel } of notifications) {
|
|
15295
|
+
for (const { message, accountLabel, transition } of notifications) {
|
|
15133
15296
|
for (const chat_id of access.allowFrom) {
|
|
15297
|
+
// Fleet-level dedup: every agent gateway independently detects the
|
|
15298
|
+
// same account transition within one poll cycle. The broker claim
|
|
15299
|
+
// grants exactly one sender per (account, transition, chat) per
|
|
15300
|
+
// window — the other ten agents advance their local state silently.
|
|
15301
|
+
// Fail-open on claim error (see claimQuotaNotification).
|
|
15302
|
+
if (tuning.fleetDedup) {
|
|
15303
|
+
const granted = await claimQuotaNotification(
|
|
15304
|
+
brokerClient,
|
|
15305
|
+
buildQuotaClaimKey(accountLabel, transition, chat_id),
|
|
15306
|
+
)
|
|
15307
|
+
if (!granted) {
|
|
15308
|
+
process.stderr.write(
|
|
15309
|
+
`telegram gateway: quota-watch: claim denied account=${accountLabel} chat=${chat_id} — another agent notified\n`,
|
|
15310
|
+
)
|
|
15311
|
+
continue
|
|
15312
|
+
}
|
|
15313
|
+
}
|
|
15134
15314
|
// Quota-watch notify — best-effort. Wrap via swallowingApiCall so
|
|
15135
15315
|
// flood-wait / deleted-chat / not-found surface as a stderr log
|
|
15136
15316
|
// rather than a thrown exception that aborts the loop and leaves
|
|
@@ -20453,7 +20633,12 @@ void (async () => {
|
|
|
20453
20633
|
// settle after boot (avoids a probe race with the boot-card
|
|
20454
20634
|
// quota probe that fires in the first few seconds).
|
|
20455
20635
|
setTimeout(() => {
|
|
20456
|
-
|
|
20636
|
+
// bootTick: recovery edges observed on the FIRST post-boot tick
|
|
20637
|
+
// reconcile silently — a fleet bounce synchronizes all agents'
|
|
20638
|
+
// first ticks, and a just-booted gateway can't tell "just
|
|
20639
|
+
// recovered" from "recovered while we were down" (the
|
|
20640
|
+
// 2026-06-09 26-message flood). Warnings still notify.
|
|
20641
|
+
void runQuotaWatch({ bootTick: true }).catch((err) => {
|
|
20457
20642
|
process.stderr.write(`telegram gateway: quota-watch initial run failed: ${err}\n`)
|
|
20458
20643
|
})
|
|
20459
20644
|
}, 30_000)
|
|
@@ -73,6 +73,76 @@ export function emptyAccountState(): QuotaWatchAccountState {
|
|
|
73
73
|
return { lastNotifiedHealth: null, lastNotifiedAt: 0 };
|
|
74
74
|
}
|
|
75
75
|
|
|
76
|
+
// ─── Tuning (env knobs) ───────────────────────────────────────────────────────
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Operational tuning for the watch loop, resolved once from env by the
|
|
80
|
+
* gateway. All three hardening behaviours are individually
|
|
81
|
+
* kill-switchable (incident 2026-06-09: a fleet bounce released
|
|
82
|
+
* days-stale recovery latches on all 11 agents at once → 26 duplicate
|
|
83
|
+
* 🟢 messages in 16 minutes):
|
|
84
|
+
*
|
|
85
|
+
* SWITCHROOM_QUOTA_WATCH_MAX_STALE_MS 0 disables the staleness gate
|
|
86
|
+
* (default 60 min)
|
|
87
|
+
* SWITCHROOM_QUOTA_WATCH_LATE_RECOVERY_MS 0 disables silent late-recovery
|
|
88
|
+
* reconciliation (default 6 h)
|
|
89
|
+
* SWITCHROOM_QUOTA_WATCH_FLEET_DEDUP "0" disables the broker claim
|
|
90
|
+
* (every agent sends, pre-incident
|
|
91
|
+
* behaviour)
|
|
92
|
+
* SWITCHROOM_QUOTA_WATCH_SEND_ON_PROBE_FAIL "1" restores sending from
|
|
93
|
+
* cached data when the pre-send
|
|
94
|
+
* validation probe fails
|
|
95
|
+
*/
|
|
96
|
+
export interface QuotaWatchTuning {
|
|
97
|
+
/** Cached snapshots older than this are treated as unknown (no opinion). 0 = off. */
|
|
98
|
+
maxStaleMs: number;
|
|
99
|
+
/** Recovery edges whose 🟡 warning is older than this reconcile silently. 0 = off. */
|
|
100
|
+
lateRecoveryMs: number;
|
|
101
|
+
/** Route sends through the broker's claim-notification dedup. */
|
|
102
|
+
fleetDedup: boolean;
|
|
103
|
+
/** Legacy: send from cached data when the validation probe fails. */
|
|
104
|
+
sendOnProbeFail: boolean;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
export const DEFAULT_QUOTA_WATCH_MAX_STALE_MS = 60 * 60_000;
|
|
108
|
+
export const DEFAULT_QUOTA_WATCH_LATE_RECOVERY_MS = 6 * 60 * 60_000;
|
|
109
|
+
|
|
110
|
+
/** Broker claim window. Must exceed one full poll cycle (15 min) plus the
|
|
111
|
+
* boot-stagger spread so every agent's observation of the SAME edge lands
|
|
112
|
+
* inside one window; an account genuinely re-crossing the same edge later
|
|
113
|
+
* than this re-notifies. */
|
|
114
|
+
export const QUOTA_WATCH_CLAIM_WINDOW_MS = 30 * 60_000;
|
|
115
|
+
|
|
116
|
+
export function resolveQuotaWatchTuning(
|
|
117
|
+
env: Record<string, string | undefined>,
|
|
118
|
+
): QuotaWatchTuning {
|
|
119
|
+
const num = (raw: string | undefined, fallback: number): number => {
|
|
120
|
+
if (raw === undefined || raw === "") return fallback;
|
|
121
|
+
const n = Number(raw);
|
|
122
|
+
return Number.isFinite(n) && n >= 0 ? n : fallback;
|
|
123
|
+
};
|
|
124
|
+
return {
|
|
125
|
+
maxStaleMs: num(env.SWITCHROOM_QUOTA_WATCH_MAX_STALE_MS, DEFAULT_QUOTA_WATCH_MAX_STALE_MS),
|
|
126
|
+
lateRecoveryMs: num(env.SWITCHROOM_QUOTA_WATCH_LATE_RECOVERY_MS, DEFAULT_QUOTA_WATCH_LATE_RECOVERY_MS),
|
|
127
|
+
fleetDedup: env.SWITCHROOM_QUOTA_WATCH_FLEET_DEDUP !== "0",
|
|
128
|
+
sendOnProbeFail: env.SWITCHROOM_QUOTA_WATCH_SEND_ON_PROBE_FAIL === "1",
|
|
129
|
+
};
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Broker dedup-claim key for one (account, transition, chat) cell.
|
|
134
|
+
* Per-CHAT keys keep the audience identical to pre-dedup behaviour:
|
|
135
|
+
* every chat that any agent would have notified still receives exactly
|
|
136
|
+
* one copy — from whichever agent claims it first.
|
|
137
|
+
*/
|
|
138
|
+
export function buildQuotaClaimKey(
|
|
139
|
+
accountLabel: string,
|
|
140
|
+
transition: string,
|
|
141
|
+
chatId: string | number,
|
|
142
|
+
): string {
|
|
143
|
+
return `quota-watch:${accountLabel}:${transition}:${chatId}`;
|
|
144
|
+
}
|
|
145
|
+
|
|
76
146
|
// ─── Decision logic ───────────────────────────────────────────────────────────
|
|
77
147
|
|
|
78
148
|
export type QuotaWatchTransition =
|
|
@@ -87,30 +157,73 @@ export type QuotaWatchDecision =
|
|
|
87
157
|
newAccountState: QuotaWatchAccountState;
|
|
88
158
|
transition: QuotaWatchTransition;
|
|
89
159
|
}
|
|
160
|
+
| {
|
|
161
|
+
/**
|
|
162
|
+
* A real transition was observed, but it is no longer NEWS — persist
|
|
163
|
+
* the new state so the edge-trigger latch clears, send nothing.
|
|
164
|
+
* Two producers: boot-tick recoveries (a just-booted gateway cannot
|
|
165
|
+
* distinguish "just recovered" from "recovered while we were down",
|
|
166
|
+
* and fleet bounces synchronize all agents' first ticks → flood) and
|
|
167
|
+
* late recoveries (the matching 🟡 is hours old; an "all clear" now
|
|
168
|
+
* is state reconciliation, not information).
|
|
169
|
+
*/
|
|
170
|
+
kind: "reconcile";
|
|
171
|
+
accountLabel: string;
|
|
172
|
+
newAccountState: QuotaWatchAccountState;
|
|
173
|
+
transition: QuotaWatchTransition;
|
|
174
|
+
reason: "boot-tick-recovery" | "late-recovery";
|
|
175
|
+
}
|
|
90
176
|
| { kind: "skip"; accountLabel: string; reason: string };
|
|
91
177
|
|
|
92
178
|
/**
|
|
93
179
|
* Evaluate one account's quota state against its last-notified health.
|
|
94
180
|
*
|
|
95
|
-
* Transition table
|
|
181
|
+
* Transition table (after the staleness gate — a cached snapshot older
|
|
182
|
+
* than `maxStaleMs` is no opinion at all → skip "stale-snapshot"):
|
|
96
183
|
* healthy → healthy skip (steady-state)
|
|
97
|
-
* healthy → throttling notify (entered-throttling)
|
|
184
|
+
* healthy → throttling notify (entered-throttling) — warnings are
|
|
185
|
+
* level-state news, valid on any tick incl. boot
|
|
98
186
|
* healthy → blocked skip (credits-watch covers this)
|
|
99
|
-
* throttling → healthy notify (recovered-to-healthy)
|
|
187
|
+
* throttling → healthy notify (recovered-to-healthy), EXCEPT:
|
|
188
|
+
* boot tick → reconcile silently
|
|
189
|
+
* warning > lateRecoveryMs old → reconcile silently
|
|
100
190
|
* throttling → throttling skip (already notified)
|
|
101
191
|
* throttling → blocked skip (credits-watch covers blocked)
|
|
102
192
|
* blocked → * skip (credits-watch domain)
|
|
103
193
|
* unknown → * skip (no quota data — don't spam)
|
|
104
194
|
* * → unknown skip (probe failed — transient, don't alarm)
|
|
195
|
+
*
|
|
196
|
+
* `bootTick` / `tuning` are optional: omitted (legacy callers/tests) the
|
|
197
|
+
* behaviour is exactly the pre-hardening table (no stale gate, no
|
|
198
|
+
* reconciliation).
|
|
105
199
|
*/
|
|
106
200
|
export function evaluateQuotaWatchAccount(args: {
|
|
107
201
|
agentName: string;
|
|
108
202
|
snap: AccountSnapshot;
|
|
109
203
|
prev: QuotaWatchAccountState;
|
|
110
204
|
now: number;
|
|
205
|
+
/** True on the gateway's first watch tick after boot. */
|
|
206
|
+
bootTick?: boolean;
|
|
207
|
+
/** Staleness / late-recovery thresholds; 0 disables each. */
|
|
208
|
+
tuning?: Pick<QuotaWatchTuning, "maxStaleMs" | "lateRecoveryMs">;
|
|
111
209
|
}): QuotaWatchDecision {
|
|
112
210
|
const { agentName, snap, prev, now } = args;
|
|
211
|
+
const bootTick = args.bootTick ?? false;
|
|
212
|
+
const maxStaleMs = args.tuning?.maxStaleMs ?? 0;
|
|
213
|
+
const lateRecoveryMs = args.tuning?.lateRecoveryMs ?? 0;
|
|
113
214
|
const label = snap.label;
|
|
215
|
+
|
|
216
|
+
// Staleness gate: a CACHED snapshot (capturedAtMs set) past its shelf
|
|
217
|
+
// life carries no opinion about the present — neither latch nor release.
|
|
218
|
+
// Live-probe snapshots (capturedAtMs undefined) are fresh by construction.
|
|
219
|
+
if (
|
|
220
|
+
maxStaleMs > 0 &&
|
|
221
|
+
snap.capturedAtMs !== undefined &&
|
|
222
|
+
now - snap.capturedAtMs > maxStaleMs
|
|
223
|
+
) {
|
|
224
|
+
return { kind: "skip", accountLabel: label, reason: "stale-snapshot" };
|
|
225
|
+
}
|
|
226
|
+
|
|
114
227
|
const currentHealth = classifyHealth(snap);
|
|
115
228
|
|
|
116
229
|
// Unknown (probe failed) or blocked — skip entirely.
|
|
@@ -147,6 +260,31 @@ export function evaluateQuotaWatchAccount(args: {
|
|
|
147
260
|
lastNotifiedHealth: "healthy",
|
|
148
261
|
lastNotifiedAt: now,
|
|
149
262
|
};
|
|
263
|
+
// A recovery observed on the first post-boot tick is not attributable
|
|
264
|
+
// to "just now" — the account may have recovered any time while this
|
|
265
|
+
// gateway was down, and a fleet bounce synchronizes every agent's
|
|
266
|
+
// first tick (the 2026-06-09 26-message flood). Reconcile silently.
|
|
267
|
+
if (bootTick) {
|
|
268
|
+
return {
|
|
269
|
+
kind: "reconcile",
|
|
270
|
+
accountLabel: label,
|
|
271
|
+
newAccountState: newState,
|
|
272
|
+
transition: "recovered-to-healthy",
|
|
273
|
+
reason: "boot-tick-recovery",
|
|
274
|
+
};
|
|
275
|
+
}
|
|
276
|
+
// Recovery whose matching 🟡 warning is hours old: the "all clear" is
|
|
277
|
+
// no longer actionable news (the user has long moved on; /auth shows
|
|
278
|
+
// live state on demand). Clear the latch without a message.
|
|
279
|
+
if (lateRecoveryMs > 0 && now - prev.lastNotifiedAt > lateRecoveryMs) {
|
|
280
|
+
return {
|
|
281
|
+
kind: "reconcile",
|
|
282
|
+
accountLabel: label,
|
|
283
|
+
newAccountState: newState,
|
|
284
|
+
transition: "recovered-to-healthy",
|
|
285
|
+
reason: "late-recovery",
|
|
286
|
+
};
|
|
287
|
+
}
|
|
150
288
|
return {
|
|
151
289
|
kind: "notify",
|
|
152
290
|
accountLabel: label,
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Telegram legacy accounts-table twin of the CLI honesty fix — the
|
|
3
|
+
* legacy table renders exactly when the live probe FAILED, i.e. when
|
|
4
|
+
* cached-data disclosure matters most.
|
|
5
|
+
*/
|
|
6
|
+
import { describe, it, expect } from "vitest";
|
|
7
|
+
import { formatQuotaUtilCell } from "../gateway/auth-command.js";
|
|
8
|
+
|
|
9
|
+
const NOW = 1_780_000_000_000;
|
|
10
|
+
|
|
11
|
+
describe("formatQuotaUtilCell (Telegram legacy table)", () => {
|
|
12
|
+
it("no cached snapshot → 'no data'", () => {
|
|
13
|
+
expect(formatQuotaUtilCell({ last_quota: null }, NOW)).toBe("no data");
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
it("renders both windows with the snapshot age", () => {
|
|
17
|
+
const cell = formatQuotaUtilCell(
|
|
18
|
+
{ last_quota: { fiveHourUtilizationPct: 84.6, sevenDayUtilizationPct: 12.1, capturedAt: NOW - 90_000 } },
|
|
19
|
+
NOW,
|
|
20
|
+
);
|
|
21
|
+
expect(cell).toBe("85%·12% (1m 30s ago)");
|
|
22
|
+
});
|
|
23
|
+
});
|
|
@@ -173,3 +173,74 @@ describe('runFleetAutoFallback', () => {
|
|
|
173
173
|
}
|
|
174
174
|
});
|
|
175
175
|
});
|
|
176
|
+
|
|
177
|
+
// ── failure notice (broken-promise fix, 2026-06-09 incident follow-up) ──────
|
|
178
|
+
|
|
179
|
+
import { renderFallbackFailureNotice } from "../auto-fallback-fleet.js";
|
|
180
|
+
|
|
181
|
+
describe("renderFallbackFailureNotice", () => {
|
|
182
|
+
it("names the trigger agent, the reason, and the manual recovery verbs", () => {
|
|
183
|
+
const html = renderFallbackFailureNotice("marko", "auth-broker unreachable (no client).");
|
|
184
|
+
expect(html).toContain("Auto-failover could not run");
|
|
185
|
+
expect(html).toContain("<b>marko</b>");
|
|
186
|
+
expect(html).toContain("auth-broker unreachable");
|
|
187
|
+
expect(html).toContain("/auth use");
|
|
188
|
+
expect(html).toContain("/auth</code>");
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
it("escapes HTML in the error reason (broker errors can contain angle brackets)", () => {
|
|
192
|
+
const html = renderFallbackFailureNotice("a<b", 'request <probe-quota> failed & "timed out"');
|
|
193
|
+
expect(html).toContain("a&lt;b");
|
|
194
|
+
expect(html).toContain("&lt;probe-quota&gt;");
|
|
195
|
+
expect(html).toContain("&amp;");
|
|
196
|
+
expect(html).not.toMatch(/<probe-quota>/);
|
|
197
|
+
});
|
|
198
|
+
});
|
|
199
|
+
|
|
200
|
+
// ── failure-notice cooldown (reviewer blocker: gate window never arms on
|
|
201
|
+
// failure; quota_wall_detected re-fires ~60s → unbounded notice spam) ─────
|
|
202
|
+
|
|
203
|
+
import {
|
|
204
|
+
evaluateFallbackFailureNotice,
|
|
205
|
+
FALLBACK_FAILURE_NOTICE_COOLDOWN_MS,
|
|
206
|
+
} from "../auto-fallback-fleet.js";
|
|
207
|
+
|
|
208
|
+
describe("evaluateFallbackFailureNotice", () => {
|
|
209
|
+
const T0 = 1_780_000_000_000;
|
|
210
|
+
|
|
211
|
+
it("first failure always sends and arms the cooldown", () => {
|
|
212
|
+
const r = evaluateFallbackFailureNotice({ lastSentAtMs: 0 }, T0);
|
|
213
|
+
expect(r.send).toBe(true);
|
|
214
|
+
expect(r.next.lastSentAtMs).toBe(T0);
|
|
215
|
+
});
|
|
216
|
+
|
|
217
|
+
it("a repeat failure inside the cooldown is suppressed and does NOT extend the window", () => {
|
|
218
|
+
const armed = { lastSentAtMs: T0 };
|
|
219
|
+
const r = evaluateFallbackFailureNotice(armed, T0 + 60_000);
|
|
220
|
+
expect(r.send).toBe(false);
|
|
221
|
+
expect(r.next).toBe(armed); // unchanged — window not extended by suppressed attempts
|
|
222
|
+
});
|
|
223
|
+
|
|
224
|
+
it("sends again once the cooldown elapses", () => {
|
|
225
|
+
const r = evaluateFallbackFailureNotice(
|
|
226
|
+
{ lastSentAtMs: T0 },
|
|
227
|
+
T0 + FALLBACK_FAILURE_NOTICE_COOLDOWN_MS,
|
|
228
|
+
);
|
|
229
|
+
expect(r.send).toBe(true);
|
|
230
|
+
expect(r.next.lastSentAtMs).toBe(T0 + FALLBACK_FAILURE_NOTICE_COOLDOWN_MS);
|
|
231
|
+
});
|
|
232
|
+
|
|
233
|
+
it("bounds the 60s quota_wall_detected re-fire storm to ≤2 notices/hour", () => {
|
|
234
|
+
// Simulate a wedged agent re-signalling every 60s for one hour with a
|
|
235
|
+
// dead broker — the incident shape the reviewer flagged.
|
|
236
|
+
let state = { lastSentAtMs: 0 };
|
|
237
|
+
let sent = 0;
|
|
238
|
+
for (let t = T0; t < T0 + 3_600_000; t += 60_000) {
|
|
239
|
+
const r = evaluateFallbackFailureNotice(state, t);
|
|
240
|
+
if (r.send) sent++;
|
|
241
|
+
state = r.next;
|
|
242
|
+
}
|
|
243
|
+
expect(sent).toBeLessThanOrEqual(2);
|
|
244
|
+
expect(sent).toBeGreaterThanOrEqual(1);
|
|
245
|
+
});
|
|
246
|
+
});
|