switchroom 0.14.93 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -157,7 +157,7 @@ import {
157
157
  formatModelUnavailableCard,
158
158
  resolveModelUnavailableFromOperatorEvent,
159
159
  } from '../model-unavailable.js'
160
- import { runFleetAutoFallback } from '../auto-fallback-fleet.js'
160
+ import { runFleetAutoFallback, renderFallbackFailureNotice, evaluateFallbackFailureNotice, type FallbackFailureNoticeState } from '../auto-fallback-fleet.js'
161
161
  import { startRestartWatchdog } from './restart-watchdog.js'
162
162
  import { validateStringArray } from './access-validator.js'
163
163
 
@@ -422,6 +422,9 @@ import {
422
422
  saveQuotaWatchState,
423
423
  patchQuotaWatchState,
424
424
  emptyAccountState,
425
+ resolveQuotaWatchTuning,
426
+ buildQuotaClaimKey,
427
+ QUOTA_WATCH_CLAIM_WINDOW_MS,
425
428
  } from '../quota-watch.js'
426
429
  import { buildSnapshotsFromState, buildSnapshotsFromCachedState } from '../auth-snapshot-format.js'
427
430
  import {
@@ -14804,6 +14807,51 @@ async function fireFleetAutoFallback(triggerAgent: string, untilMs?: number): Pr
14804
14807
  )
14805
14808
  }
14806
14809
 
14810
/**
 * Cooldown bookkeeping for fleet-fallback failure notices. The value is fed
 * to `evaluateFallbackFailureNotice` on every attempt and replaced with the
 * verdict's `next` state only when a notice is actually broadcast.
 * Starts at `lastSentAtMs: 0` (nothing sent yet).
 */
let fallbackFailureNoticeState: FallbackFailureNoticeState = { lastSentAtMs: 0 }

/**
 * Broadcast a fleet-fallback FAILURE notice to every authorized chat.
 *
 * Why this exists: the model-unavailable card renders "Auto-failover in
 * progress — see the announcement below" BEFORE the dispatcher's outcome
 * is known. When the dispatcher errors (broker down, listState throw,
 * markExhausted failure), the success announcement never lands and the
 * card's promise is broken — the 2026-06-06→07 incident sent 12 such
 * broken-promise cards while every fallback errored "set-active requires
 * admin". The admin-gating root cause is fixed (#2206), but ANY future
 * dispatcher error reproduces the broken promise. This notice closes the
 * loop deterministically: card promised an announcement → an
 * announcement ALWAYS arrives, success or failure.
 *
 * Disable with SWITCHROOM_FLEET_FALLBACK_FAILURE_NOTICE=0 (log-only,
 * pre-fix behaviour).
 *
 * Sends are fire-and-forget: each one is wrapped in `swallowingApiCall`
 * and is not awaited.
 *
 * @param triggerAgent Agent whose failover attempt failed; appears in the
 *   notice and in the suppression log line.
 * @param reason Human-readable failure reason handed to
 *   `renderFallbackFailureNotice`.
 */
function broadcastFleetFallbackFailure(triggerAgent: string, reason: string): void {
  if (process.env.SWITCHROOM_FLEET_FALLBACK_FAILURE_NOTICE === '0') return
  // Notice-level cooldown (30 min, per gateway). The fleetFallbackGate's
  // dedup window only arms on SUCCESSFUL swaps, so it bounds nothing
  // here — and the card-less quota_wall_detected trigger re-fires every
  // ~60s during a wall. Without this, a persistent broker outage would
  // stream failure notices for days. See evaluateFallbackFailureNotice.
  const verdict = evaluateFallbackFailureNotice(fallbackFailureNoticeState, Date.now())
  if (!verdict.send) {
    process.stderr.write(
      `telegram gateway: [fleet-fallback] failure notice suppressed (cooldown) agent=${triggerAgent}: ${reason}\n`,
    )
    return
  }
  const access = loadAccess()
  // No audience means nothing is sent, so the cooldown must stay unarmed:
  // arming it here would suppress the first notice that COULD be delivered
  // once a chat is authorized, for a whole cooldown window.
  if (access.allowFrom.length === 0) return
  fallbackFailureNoticeState = verdict.next
  const html = renderFallbackFailureNotice(triggerAgent, reason)
  for (const chat_id of access.allowFrom) {
    void swallowingApiCall(
      () => bot.api.sendMessage(chat_id, html, { parse_mode: 'HTML' as const }),
      { chat_id, verb: 'fleet-fallback:failure-notify' },
    )
  }
}
14854
+
14807
14855
  /** Returns true iff the dispatcher actually performed a swap (and the
14808
14856
  * user-visible announcement was broadcast). False on no-op /
14809
14857
  * error / idempotent-skip — caller uses this to decide whether to
@@ -14815,6 +14863,9 @@ async function doFireFleetAutoFallback(triggerAgent: string, untilMs?: number):
14815
14863
  process.stderr.write(
14816
14864
  `telegram gateway: [fleet-fallback] skipped agent=${triggerAgent} reason=no-broker-client\n`,
14817
14865
  )
14866
+ // The model-unavailable card may have promised an announcement —
14867
+ // keep the promise even though nothing could run.
14868
+ broadcastFleetFallbackFailure(triggerAgent, 'auth-broker unreachable (no client).')
14818
14869
  return false
14819
14870
  }
14820
14871
  const state = await client.listState()
@@ -14878,6 +14929,10 @@ async function doFireFleetAutoFallback(triggerAgent: string, untilMs?: number):
14878
14929
  process.stderr.write(
14879
14930
  `telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
14880
14931
  )
14932
+ // Keep the card's "see the announcement below" promise on the error
14933
+ // path — the 06-06→07 incident sent 12 cards whose promised
14934
+ // announcement never arrived because this catch was log-only.
14935
+ broadcastFleetFallbackFailure(triggerAgent, (err as Error)?.message ?? String(err))
14881
14936
  return false
14882
14937
  }
14883
14938
  }
@@ -14969,9 +15024,34 @@ async function runCreditWatch(): Promise<void> {
14969
15024
  * State persists across restarts via `<stateDir>/quota-watch.json`.
14970
15025
  * Mirrors runCreditWatch's structure and notification routing.
14971
15026
  */
14972
- async function runQuotaWatch(): Promise<void> {
15027
+
15028
/**
 * Ask the broker for the fleet-wide dedup claim on one notification key.
 * FAIL-OPEN on any error: a broker that predates the `claim-notification`
 * op (skewed rollout) rejects at the protocol layer, and a transient IPC
 * failure must degrade to duplicated notifications, never dropped ones.
 *
 * The same rule applies to a reply that carries no usable verdict: only
 * an explicit `granted === false` counts as a denial.
 *
 * @param brokerClient Connected auth-broker client.
 * @param key Dedup key identifying the notification being claimed.
 * @returns `false` only when the broker explicitly denied the claim;
 *   `true` when it was granted, the reply was malformed, or the call threw.
 */
async function claimQuotaNotification(
  brokerClient: NonNullable<Awaited<ReturnType<typeof getAuthBrokerClient>>>,
  key: string,
): Promise<boolean> {
  try {
    const res = await brokerClient.claimNotification(key, QUOTA_WATCH_CLAIM_WINDOW_MS)
    // A missing/undefined `granted` would read as "denied" at the call
    // sites (`if (!granted) continue`) and silently drop the notification.
    return res?.granted !== false
  } catch (err) {
    process.stderr.write(`telegram gateway: quota-watch: claim failed (fail-open): ${err}\n`)
    return true
  }
}
15046
+
15047
+ async function runQuotaWatch(opts: { bootTick?: boolean } = {}): Promise<void> {
14973
15048
  const agentName = getMyAgentName()
14974
15049
  const stateDir = STATE_DIR
15050
+ const bootTick = opts.bootTick ?? false
15051
+ // Hardening knobs (2026-06-09 incident: fleet bounce released stale
15052
+ // recovery latches on all 11 agents at once → 26 duplicate sends).
15053
+ // See QuotaWatchTuning in quota-watch.ts for the env contract.
15054
+ const tuning = resolveQuotaWatchTuning(process.env)
14975
15055
 
14976
15056
  // Read broker state. The listState response now includes last_quota
14977
15057
  // per account — the broker's in-memory cache from previous probeQuota
@@ -15019,6 +15099,20 @@ async function runQuotaWatch(): Promise<void> {
15019
15099
  })
15020
15100
  if (fleetDecision.kind === 'notify') {
15021
15101
  for (const chat_id of access.allowFrom) {
15102
+ // Fleet-level dedup: all 11 gateways detect this same edge within
15103
+ // one poll cycle — only the broker-claim winner sends per chat.
15104
+ if (tuning.fleetDedup) {
15105
+ const granted = await claimQuotaNotification(
15106
+ brokerClient,
15107
+ buildQuotaClaimKey(FLEET_ALL_EXHAUSTED_KEY, fleetDecision.transition, chat_id),
15108
+ )
15109
+ if (!granted) {
15110
+ process.stderr.write(
15111
+ `telegram gateway: quota-watch: fleet-all-exhausted claim denied chat=${chat_id} — another agent notified\n`,
15112
+ )
15113
+ continue
15114
+ }
15115
+ }
15022
15116
  await swallowingApiCall(
15023
15117
  () =>
15024
15118
  bot.api.sendMessage(chat_id, fleetDecision.message, {
@@ -15056,10 +15150,21 @@ async function runQuotaWatch(): Promise<void> {
15056
15150
  snapshots.map((s, i) => [s.label, i]),
15057
15151
  )
15058
15152
 
15153
+ // Reconciled transitions: state advances (latch clears) but nothing is
15154
+ // sent — boot-tick and late recoveries (see QuotaWatchDecision docs).
15155
+ let reconciledCount = 0
15156
+ let mutatedState = watchState
15157
+
15059
15158
  for (const snap of snapshots) {
15060
15159
  const prev = watchState[snap.label] ?? emptyAccountState()
15061
- const decision = evaluateQuotaWatchAccount({ agentName, snap, prev, now })
15062
- if (decision.kind !== 'skip') {
15160
+ const decision = evaluateQuotaWatchAccount({ agentName, snap, prev, now, bootTick, tuning })
15161
+ if (decision.kind === 'reconcile') {
15162
+ mutatedState = patchQuotaWatchState(mutatedState, decision.accountLabel, decision.newAccountState)
15163
+ reconciledCount++
15164
+ process.stderr.write(
15165
+ `telegram gateway: quota-watch: reconciled ${decision.transition} for account=${decision.accountLabel} (${decision.reason}) — no notification\n`,
15166
+ )
15167
+ } else if (decision.kind !== 'skip') {
15063
15168
  pendingTransitions.push({
15064
15169
  accountLabel: snap.label,
15065
15170
  snapIndex: labelToSnapIndex.get(snap.label) ?? -1,
@@ -15069,7 +15174,16 @@ async function runQuotaWatch(): Promise<void> {
15069
15174
  }
15070
15175
 
15071
15176
  if (pendingTransitions.length === 0) {
15072
- return // Steady-state: no notifications, no probes, no state write.
15177
+ // Steady-state: no notifications, no probes. Persist only if a
15178
+ // reconcile advanced the latch (otherwise no state write at all).
15179
+ if (reconciledCount > 0) {
15180
+ try {
15181
+ saveQuotaWatchState(stateDir, mutatedState)
15182
+ } catch (err) {
15183
+ process.stderr.write(`telegram gateway: quota-watch state persist failed: ${err}\n`)
15184
+ }
15185
+ }
15186
+ return
15073
15187
  }
15074
15188
 
15075
15189
  // Transition detected: probe ONLY the crossing accounts to get fresh
@@ -15083,16 +15197,31 @@ async function runQuotaWatch(): Promise<void> {
15083
15197
  freshProbeMap.set(entry.label, entry.result)
15084
15198
  }
15085
15199
  } catch (err) {
15086
- // Probe failed — still send notifications using cached data.
15087
- // Don't abort: the user should know about the threshold crossing
15088
- // even if the message body shows slightly stale numbers.
15089
15200
  process.stderr.write(`telegram gateway: quota-watch: probe for crossing accounts failed: ${err}\n`)
15201
+ if (!tuning.sendOnProbeFail) {
15202
+ // A quota notification must never carry numbers we could not verify
15203
+ // live. Leave the crossing accounts' state untouched — the
15204
+ // transition re-evaluates (and re-probes) on the next 15-min tick.
15205
+ // Persist any reconciles already applied, then bail.
15206
+ if (reconciledCount > 0) {
15207
+ try {
15208
+ saveQuotaWatchState(stateDir, mutatedState)
15209
+ } catch (saveErr) {
15210
+ process.stderr.write(`telegram gateway: quota-watch state persist failed: ${saveErr}\n`)
15211
+ }
15212
+ }
15213
+ process.stderr.write(
15214
+ `telegram gateway: quota-watch: deferring ${pendingTransitions.length} notification(s) until probe succeeds\n`,
15215
+ )
15216
+ return
15217
+ }
15218
+ // Legacy (SWITCHROOM_QUOTA_WATCH_SEND_ON_PROBE_FAIL=1): fall through
15219
+ // and send from cached data.
15090
15220
  }
15091
15221
 
15092
15222
  // Build final notifications, enriching the snapshot with fresh probe
15093
15223
  // data where available.
15094
- let mutatedState = watchState
15095
- const notifications: Array<{ message: string; accountLabel: string }> = []
15224
+ const notifications: Array<{ message: string; accountLabel: string; transition: string }> = []
15096
15225
 
15097
15226
  for (const { accountLabel, snapIndex, decision } of pendingTransitions) {
15098
15227
  // Re-evaluate with fresh probe data to get an accurate message body.
@@ -15100,37 +15229,88 @@ async function runQuotaWatch(): Promise<void> {
15100
15229
  const freshResult = freshProbeMap.get(accountLabel)
15101
15230
  let enrichedDecision = decision
15102
15231
  // pendingTransitions only ever holds notify decisions (pushed under
15103
- // `decision.kind !== 'skip'`). Narrow explicitly so `decision.transition`
15104
- // type-checks below; this continue never fires at runtime.
15232
+ // `decision.kind !== 'skip'` / `!== 'reconcile'`). Narrow explicitly so
15233
+ // `decision.transition` type-checks below; this continue never fires
15234
+ // at runtime.
15105
15235
  if (decision.kind !== 'notify') continue
15106
15236
  if (freshResult && freshResult.ok && snapIndex >= 0) {
15107
- const enrichedSnap = { ...snapshots[snapIndex]!, quota: freshResult.data }
15237
+ // Live numbers replace the cache and capturedAtMs is cleared so the
15238
+ // staleness gate never misfires on data we JUST probed.
15239
+ const enrichedSnap = { ...snapshots[snapIndex]!, quota: freshResult.data, capturedAtMs: undefined }
15108
15240
  const prev = watchState[accountLabel] ?? emptyAccountState()
15109
- const re = evaluateQuotaWatchAccount({ agentName, snap: enrichedSnap, prev, now })
15241
+ const re = evaluateQuotaWatchAccount({ agentName, snap: enrichedSnap, prev, now, bootTick, tuning })
15110
15242
  // If the fresh probe still shows the same transition, use the
15111
15243
  // enriched message. If it no longer shows a transition (e.g. the
15112
15244
  // account recovered in the 100ms between listState and probe),
15113
15245
  // fall through to skip this notification.
15114
15246
  if (re.kind === 'notify' && re.transition === decision.transition) {
15115
15247
  enrichedDecision = re
15248
+ } else if (re.kind === 'reconcile') {
15249
+ // Fresh data confirms the transition but it isn't news (boot-tick /
15250
+ // late recovery) — advance the latch silently.
15251
+ mutatedState = patchQuotaWatchState(mutatedState, accountLabel, re.newAccountState)
15252
+ reconciledCount++
15253
+ process.stderr.write(
15254
+ `telegram gateway: quota-watch: reconciled ${re.transition} for account=${accountLabel} (${re.reason}) — no notification\n`,
15255
+ )
15256
+ continue
15116
15257
  } else if (re.kind === 'skip') {
15117
15258
  // State normalised by the time of the probe — don't notify.
15118
15259
  continue
15119
15260
  }
15261
+ } else if (!tuning.sendOnProbeFail) {
15262
+ // No verified fresh data for this account (per-account probe failure
15263
+ // or label missing from the batch result). Same rule as the batch
15264
+ // throw above: never send unverified numbers. State untouched —
15265
+ // re-evaluated (and re-probed) next tick.
15266
+ process.stderr.write(
15267
+ `telegram gateway: quota-watch: probe unavailable for account=${accountLabel} — deferring notification\n`,
15268
+ )
15269
+ continue
15120
15270
  }
15121
15271
 
15122
15272
  if (enrichedDecision.kind !== 'notify') continue
15123
- notifications.push({ message: enrichedDecision.message, accountLabel })
15273
+ notifications.push({
15274
+ message: enrichedDecision.message,
15275
+ accountLabel,
15276
+ transition: enrichedDecision.transition,
15277
+ })
15124
15278
  mutatedState = patchQuotaWatchState(mutatedState, accountLabel, enrichedDecision.newAccountState)
15125
15279
  }
15126
15280
 
15127
15281
  if (notifications.length === 0) {
15128
- return // All transitions resolved by the time of the live probe.
15282
+ // All transitions resolved/deferred by the time of the live probe.
15283
+ // Reconciles may still have advanced the latch — persist those.
15284
+ if (reconciledCount > 0) {
15285
+ try {
15286
+ saveQuotaWatchState(stateDir, mutatedState)
15287
+ } catch (err) {
15288
+ process.stderr.write(`telegram gateway: quota-watch state persist failed: ${err}\n`)
15289
+ }
15290
+ }
15291
+ return
15129
15292
  }
15130
15293
 
15131
15294
  // Send all notifications (one message per crossing account).
15132
- for (const { message, accountLabel } of notifications) {
15295
+ for (const { message, accountLabel, transition } of notifications) {
15133
15296
  for (const chat_id of access.allowFrom) {
15297
+ // Fleet-level dedup: every agent gateway independently detects the
15298
+ // same account transition within one poll cycle. The broker claim
15299
+ // grants exactly one sender per (account, transition, chat) per
15300
+ // window — the other ten agents advance their local state silently.
15301
+ // Fail-open on claim error (see claimQuotaNotification).
15302
+ if (tuning.fleetDedup) {
15303
+ const granted = await claimQuotaNotification(
15304
+ brokerClient,
15305
+ buildQuotaClaimKey(accountLabel, transition, chat_id),
15306
+ )
15307
+ if (!granted) {
15308
+ process.stderr.write(
15309
+ `telegram gateway: quota-watch: claim denied account=${accountLabel} chat=${chat_id} — another agent notified\n`,
15310
+ )
15311
+ continue
15312
+ }
15313
+ }
15134
15314
  // Quota-watch notify — best-effort. Wrap via swallowingApiCall so
15135
15315
  // flood-wait / deleted-chat / not-found surface as a stderr log
15136
15316
  // rather than a thrown exception that aborts the loop and leaves
@@ -20453,7 +20633,12 @@ void (async () => {
20453
20633
  // settle after boot (avoids a probe race with the boot-card
20454
20634
  // quota probe that fires in the first few seconds).
20455
20635
  setTimeout(() => {
20456
- void runQuotaWatch().catch((err) => {
20636
+ // bootTick: recovery edges observed on the FIRST post-boot tick
20637
+ // reconcile silently — a fleet bounce synchronizes all agents'
20638
+ // first ticks, and a just-booted gateway can't tell "just
20639
+ // recovered" from "recovered while we were down" (the
20640
+ // 2026-06-09 26-message flood). Warnings still notify.
20641
+ void runQuotaWatch({ bootTick: true }).catch((err) => {
20457
20642
  process.stderr.write(`telegram gateway: quota-watch initial run failed: ${err}\n`)
20458
20643
  })
20459
20644
  }, 30_000)
@@ -73,6 +73,76 @@ export function emptyAccountState(): QuotaWatchAccountState {
73
73
  return { lastNotifiedHealth: null, lastNotifiedAt: 0 };
74
74
  }
75
75
 
76
+ // ─── Tuning (env knobs) ───────────────────────────────────────────────────────
77
+
78
+ /**
79
+ * Operational tuning for the watch loop, resolved once from env by the
80
+ * gateway. Each hardening behaviour listed below is individually
81
+ * kill-switchable (incident 2026-06-09: a fleet bounce released
82
+ * days-stale recovery latches on all 11 agents at once → 26 duplicate
83
+ * 🟢 messages in 16 minutes):
84
+ *
85
+ * SWITCHROOM_QUOTA_WATCH_MAX_STALE_MS 0 disables the staleness gate
86
+ * (default 60 min)
87
+ * SWITCHROOM_QUOTA_WATCH_LATE_RECOVERY_MS 0 disables silent late-recovery
88
+ * reconciliation (default 6 h)
89
+ * SWITCHROOM_QUOTA_WATCH_FLEET_DEDUP "0" disables the broker claim
90
+ * (every agent sends, pre-incident
91
+ * behaviour)
92
+ * SWITCHROOM_QUOTA_WATCH_SEND_ON_PROBE_FAIL "1" restores sending from
93
+ * cached data when the pre-send
94
+ * validation probe fails
95
+ *
+ * The two numeric knobs are durations in milliseconds; the two flags are
+ * compared against the literal strings "0" / "1".
+ */
96
+ export interface QuotaWatchTuning {
97
+ /** Milliseconds. Cached snapshots older than this are treated as unknown (no opinion). 0 = off. */
98
+ maxStaleMs: number;
99
+ /** Milliseconds. Recovery edges whose 🟡 warning is older than this reconcile silently. 0 = off. */
100
+ lateRecoveryMs: number;
101
+ /** Route sends through the broker's claim-notification dedup. On unless the env flag is "0". */
102
+ fleetDedup: boolean;
103
+ /** Legacy: send from cached data when the validation probe fails. Off unless the env flag is "1". */
104
+ sendOnProbeFail: boolean;
105
+ }
106
+
107
/** Default shelf life of a cached quota snapshot for the staleness gate: 60 minutes, in ms. */
export const DEFAULT_QUOTA_WATCH_MAX_STALE_MS = 60 * 60_000;
/** Default age of the matching warning past which a recovery reconciles silently: 6 hours, in ms. */
export const DEFAULT_QUOTA_WATCH_LATE_RECOVERY_MS = 6 * 60 * 60_000;

/** Broker claim window. Must exceed one full poll cycle (15 min) plus the
 * boot-stagger spread so every agent's observation of the SAME edge lands
 * inside one window; an account genuinely re-crossing the same edge later
 * than this re-notifies. */
export const QUOTA_WATCH_CLAIM_WINDOW_MS = 30 * 60_000;

/**
 * Resolve the quota-watch tuning knobs from an environment map.
 *
 * Numeric knobs: an unset, empty, or whitespace-only value, and anything
 * that is not a finite number >= 0, falls back to the default. An explicit
 * `0` is kept (it is the documented "off" value). Values are trimmed first,
 * because `Number("   ")` is `0` and would otherwise switch a gate off by
 * accident.
 *
 * Flags: `fleetDedup` is on unless the trimmed value is exactly "0";
 * `sendOnProbeFail` is off unless the trimmed value is exactly "1".
 *
 * @param env Environment-style map (e.g. `process.env`).
 * @returns The fully-populated tuning object.
 */
export function resolveQuotaWatchTuning(
  env: Record<string, string | undefined>,
): QuotaWatchTuning {
  const num = (raw: string | undefined, fallback: number): number => {
    const text = raw?.trim() ?? "";
    if (text === "") return fallback;
    const n = Number(text);
    return Number.isFinite(n) && n >= 0 ? n : fallback;
  };
  return {
    maxStaleMs: num(env.SWITCHROOM_QUOTA_WATCH_MAX_STALE_MS, DEFAULT_QUOTA_WATCH_MAX_STALE_MS),
    lateRecoveryMs: num(env.SWITCHROOM_QUOTA_WATCH_LATE_RECOVERY_MS, DEFAULT_QUOTA_WATCH_LATE_RECOVERY_MS),
    fleetDedup: env.SWITCHROOM_QUOTA_WATCH_FLEET_DEDUP?.trim() !== "0",
    sendOnProbeFail: env.SWITCHROOM_QUOTA_WATCH_SEND_ON_PROBE_FAIL?.trim() === "1",
  };
}
131
+
132
+ /**
133
+ * Broker dedup-claim key for one (account, transition, chat) cell.
134
+ * Per-CHAT keys keep the audience identical to pre-dedup behaviour:
135
+ * every chat that any agent would have notified still receives exactly
136
+ * one copy — from whichever agent claims it first.
137
+ *
+ * The three parts are joined with ":" after a fixed "quota-watch:" prefix
+ * and are not escaped. NOTE(review): a part that itself contains ":" could
+ * in principle produce the same key as a different (label, transition)
+ * pair — confirm account labels never contain ":".
+ *
+ * @param accountLabel Label of the account (or fleet-level key) the
+ *   transition belongs to.
+ * @param transition Name of the transition being announced.
+ * @param chatId Destination chat the claim is scoped to.
+ * @returns The claim key string.
+ */
138
+ export function buildQuotaClaimKey(
139
+ accountLabel: string,
140
+ transition: string,
141
+ chatId: string | number,
142
+ ): string {
143
+ return `quota-watch:${accountLabel}:${transition}:${chatId}`;
144
+ }
145
+
76
146
  // ─── Decision logic ───────────────────────────────────────────────────────────
77
147
 
78
148
  export type QuotaWatchTransition =
@@ -87,30 +157,73 @@ export type QuotaWatchDecision =
87
157
  newAccountState: QuotaWatchAccountState;
88
158
  transition: QuotaWatchTransition;
89
159
  }
160
+ | {
161
+ /**
162
+ * A real transition was observed, but it is no longer NEWS — persist
163
+ * the new state so the edge-trigger latch clears, send nothing.
164
+ * Two producers: boot-tick recoveries (a just-booted gateway cannot
165
+ * distinguish "just recovered" from "recovered while we were down",
166
+ * and fleet bounces synchronize all agents' first ticks → flood) and
167
+ * late recoveries (the matching 🟡 is hours old; an "all clear" now
168
+ * is state reconciliation, not information).
169
+ */
170
+ kind: "reconcile";
171
+ accountLabel: string;
172
+ newAccountState: QuotaWatchAccountState;
173
+ transition: QuotaWatchTransition;
174
+ reason: "boot-tick-recovery" | "late-recovery";
175
+ }
90
176
  | { kind: "skip"; accountLabel: string; reason: string };
91
177
 
92
178
  /**
93
179
  * Evaluate one account's quota state against its last-notified health.
94
180
  *
95
- * Transition table:
181
+ * Transition table (after the staleness gate — a cached snapshot older
182
+ * than `maxStaleMs` is no opinion at all → skip "stale-snapshot"):
96
183
  * healthy → healthy skip (steady-state)
97
- * healthy → throttling notify (entered-throttling)
184
+ * healthy → throttling notify (entered-throttling) — warnings are
185
+ * level-state news, valid on any tick incl. boot
98
186
  * healthy → blocked skip (credits-watch covers this)
99
- * throttling → healthy notify (recovered-to-healthy)
187
+ * throttling → healthy notify (recovered-to-healthy), EXCEPT:
188
+ * boot tick → reconcile silently
189
+ * warning > lateRecoveryMs old → reconcile silently
100
190
  * throttling → throttling skip (already notified)
101
191
  * throttling → blocked skip (credits-watch covers blocked)
102
192
  * blocked → * skip (credits-watch domain)
103
193
  * unknown → * skip (no quota data — don't spam)
104
194
  * * → unknown skip (probe failed — transient, don't alarm)
195
+ *
196
+ * `bootTick` / `tuning` are optional: omitted (legacy callers/tests) the
197
+ * behaviour is exactly the pre-hardening table (no stale gate, no
198
+ * reconciliation).
105
199
  */
106
200
  export function evaluateQuotaWatchAccount(args: {
107
201
  agentName: string;
108
202
  snap: AccountSnapshot;
109
203
  prev: QuotaWatchAccountState;
110
204
  now: number;
205
+ /** True on the gateway's first watch tick after boot. */
206
+ bootTick?: boolean;
207
+ /** Staleness / late-recovery thresholds; 0 disables each. */
208
+ tuning?: Pick<QuotaWatchTuning, "maxStaleMs" | "lateRecoveryMs">;
111
209
  }): QuotaWatchDecision {
112
210
  const { agentName, snap, prev, now } = args;
211
+ const bootTick = args.bootTick ?? false;
212
+ const maxStaleMs = args.tuning?.maxStaleMs ?? 0;
213
+ const lateRecoveryMs = args.tuning?.lateRecoveryMs ?? 0;
113
214
  const label = snap.label;
215
+
216
+ // Staleness gate: a CACHED snapshot (capturedAtMs set) past its shelf
217
+ // life carries no opinion about the present — neither latch nor release.
218
+ // Live-probe snapshots (capturedAtMs undefined) are fresh by construction.
219
+ if (
220
+ maxStaleMs > 0 &&
221
+ snap.capturedAtMs !== undefined &&
222
+ now - snap.capturedAtMs > maxStaleMs
223
+ ) {
224
+ return { kind: "skip", accountLabel: label, reason: "stale-snapshot" };
225
+ }
226
+
114
227
  const currentHealth = classifyHealth(snap);
115
228
 
116
229
  // Unknown (probe failed) or blocked — skip entirely.
@@ -147,6 +260,31 @@ export function evaluateQuotaWatchAccount(args: {
147
260
  lastNotifiedHealth: "healthy",
148
261
  lastNotifiedAt: now,
149
262
  };
263
+ // A recovery observed on the first post-boot tick is not attributable
264
+ // to "just now" — the account may have recovered any time while this
265
+ // gateway was down, and a fleet bounce synchronizes every agent's
266
+ // first tick (the 2026-06-09 26-message flood). Reconcile silently.
267
+ if (bootTick) {
268
+ return {
269
+ kind: "reconcile",
270
+ accountLabel: label,
271
+ newAccountState: newState,
272
+ transition: "recovered-to-healthy",
273
+ reason: "boot-tick-recovery",
274
+ };
275
+ }
276
+ // Recovery whose matching 🟡 warning is hours old: the "all clear" is
277
+ // no longer actionable news (the user has long moved on; /auth shows
278
+ // live state on demand). Clear the latch without a message.
279
+ if (lateRecoveryMs > 0 && now - prev.lastNotifiedAt > lateRecoveryMs) {
280
+ return {
281
+ kind: "reconcile",
282
+ accountLabel: label,
283
+ newAccountState: newState,
284
+ transition: "recovered-to-healthy",
285
+ reason: "late-recovery",
286
+ };
287
+ }
150
288
  return {
151
289
  kind: "notify",
152
290
  accountLabel: label,
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Telegram legacy accounts-table twin of the CLI honesty fix — the
3
+ * legacy table renders exactly when the live probe FAILED, i.e. when
4
+ * cached-data disclosure matters most.
5
+ *
+ * Pins two renderings of formatQuotaUtilCell(account, now):
+ *   - `last_quota: null` renders the literal "no data";
+ *   - a cached snapshot renders the five-hour and seven-day utilization
+ *     as whole percentages joined by "·", followed by the snapshot's age
+ *     relative to `now` in parentheses.
+ */
6
+ import { describe, it, expect } from "vitest";
7
+ import { formatQuotaUtilCell } from "../gateway/auth-command.js";
8
+
9
+ const NOW = 1_780_000_000_000;
10
+
11
+ describe("formatQuotaUtilCell (Telegram legacy table)", () => {
12
+ it("no cached snapshot → 'no data'", () => {
13
+ expect(formatQuotaUtilCell({ last_quota: null }, NOW)).toBe("no data");
14
+ });
15
+
16
+ it("renders both windows with the snapshot age", () => {
17
+ const cell = formatQuotaUtilCell(
18
+ { last_quota: { fiveHourUtilizationPct: 84.6, sevenDayUtilizationPct: 12.1, capturedAt: NOW - 90_000 } },
19
+ NOW,
20
+ );
21
+ expect(cell).toBe("85%·12% (1m 30s ago)");
22
+ });
23
+ });
@@ -173,3 +173,74 @@ describe('runFleetAutoFallback', () => {
173
173
  }
174
174
  });
175
175
  });
176
+
177
+ // ── failure notice (broken-promise fix, 2026-06-09 incident follow-up) ──────
178
+
179
+ import { renderFallbackFailureNotice } from "../auto-fallback-fleet.js";
180
+
181
+ describe("renderFallbackFailureNotice", () => { // renderer under test: (triggerAgent, reason) → notice HTML string
182
+ it("names the trigger agent, the reason, and the manual recovery verbs", () => {
183
+ const html = renderFallbackFailureNotice("marko", "auth-broker unreachable (no client).");
184
+ expect(html).toContain("Auto-failover could not run"); // headline text
185
+ expect(html).toContain("<b>marko</b>"); // agent name is rendered in bold
186
+ expect(html).toContain("auth-broker unreachable"); // the reason is echoed
187
+ expect(html).toContain("/auth use"); // manual recovery command is offered
188
+ expect(html).toContain("/auth</code>"); // a command is wrapped in a <code> element
189
+ });
190
+
191
+ it("escapes HTML in the error reason (broker errors can contain angle brackets)", () => {
192
+ const html = renderFallbackFailureNotice("a<b", 'request <probe-quota> failed & "timed out"'); // both arguments carry HTML metacharacters
193
+ expect(html).toContain("a&lt;b"); // the agent name is escaped too, not only the reason
194
+ expect(html).toContain("&lt;probe-quota&gt;");
195
+ expect(html).toContain("&amp;");
196
+ expect(html).not.toMatch(/<probe-quota>/); // the raw tag must not survive into the output
197
+ });
198
+ });
199
+
200
+ // ── failure-notice cooldown (reviewer blocker: gate window never arms on
201
+ // failure; quota_wall_detected re-fires ~60s → unbounded notice spam) ─────
202
+
203
+ import {
204
+ evaluateFallbackFailureNotice,
205
+ FALLBACK_FAILURE_NOTICE_COOLDOWN_MS,
206
+ } from "../auto-fallback-fleet.js";
207
+
208
+ describe("evaluateFallbackFailureNotice", () => { // decision function under test: (state, nowMs) → { send, next }
209
+ const T0 = 1_780_000_000_000;
210
+
211
+ it("first failure always sends and arms the cooldown", () => {
212
+ const r = evaluateFallbackFailureNotice({ lastSentAtMs: 0 }, T0); // lastSentAtMs: 0 is the never-sent state
213
+ expect(r.send).toBe(true);
214
+ expect(r.next.lastSentAtMs).toBe(T0);
215
+ });
216
+
217
+ it("a repeat failure inside the cooldown is suppressed and does NOT extend the window", () => {
218
+ const armed = { lastSentAtMs: T0 };
219
+ const r = evaluateFallbackFailureNotice(armed, T0 + 60_000);
220
+ expect(r.send).toBe(false);
221
+ expect(r.next).toBe(armed); // unchanged (same object identity) — window not extended by suppressed attempts
222
+ });
223
+
224
+ it("sends again once the cooldown elapses", () => {
225
+ const r = evaluateFallbackFailureNotice(
226
+ { lastSentAtMs: T0 },
227
+ T0 + FALLBACK_FAILURE_NOTICE_COOLDOWN_MS, // exactly at the boundary: elapsed == cooldown already sends
228
+ );
229
+ expect(r.send).toBe(true);
230
+ expect(r.next.lastSentAtMs).toBe(T0 + FALLBACK_FAILURE_NOTICE_COOLDOWN_MS);
231
+ });
232
+
233
+ it("bounds the 60s quota_wall_detected re-fire storm to ≤2 notices/hour", () => {
234
+ // Simulate a wedged agent re-signalling every 60s for one hour with a
235
+ // dead broker — the incident shape the reviewer flagged. The state is threaded through every tick, as a real caller would.
236
+ let state = { lastSentAtMs: 0 };
237
+ let sent = 0;
238
+ for (let t = T0; t < T0 + 3_600_000; t += 60_000) { // 60 ticks, one per minute
239
+ const r = evaluateFallbackFailureNotice(state, t);
240
+ if (r.send) sent++;
241
+ state = r.next;
242
+ }
243
+ expect(sent).toBeLessThanOrEqual(2); // upper bound: the cooldown throttles the storm
244
+ expect(sent).toBeGreaterThanOrEqual(1); // lower bound: the first failure still gets through
245
+ });
246
+ });