switchroom 0.10.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/README.md +5 -4
  2. package/dist/agent-scheduler/index.js +2 -2
  3. package/dist/auth-broker/index.js +125 -3
  4. package/dist/cli/drive-write-pretool.mjs +5436 -0
  5. package/dist/cli/switchroom.js +231 -29
  6. package/dist/host-control/main.js +2 -2
  7. package/dist/vault/approvals/kernel-server.js +2 -2
  8. package/dist/vault/broker/server.js +2 -2
  9. package/package.json +1 -1
  10. package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
  11. package/telegram-plugin/admin-commands/index.ts +2 -0
  12. package/telegram-plugin/auth-snapshot-format.ts +612 -0
  13. package/telegram-plugin/auto-fallback-fleet.ts +215 -0
  14. package/telegram-plugin/auto-fallback.ts +28 -301
  15. package/telegram-plugin/dist/gateway/gateway.js +4314 -2143
  16. package/telegram-plugin/fleet-fallback-gate.ts +105 -0
  17. package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
  18. package/telegram-plugin/gateway/approval-callback.ts +31 -3
  19. package/telegram-plugin/gateway/auth-broker-client.ts +2 -0
  20. package/telegram-plugin/gateway/auth-command.ts +131 -10
  21. package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
  22. package/telegram-plugin/gateway/boot-card.ts +1 -1
  23. package/telegram-plugin/gateway/boot-probes.ts +6 -9
  24. package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
  25. package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
  26. package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
  27. package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
  28. package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
  29. package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
  30. package/telegram-plugin/gateway/gateway.ts +903 -173
  31. package/telegram-plugin/gateway/hostd-dispatch.ts +137 -2
  32. package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
  33. package/telegram-plugin/gateway/ipc-server.ts +69 -0
  34. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
  35. package/telegram-plugin/model-unavailable.ts +28 -12
  36. package/telegram-plugin/silence-poke.ts +153 -1
  37. package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
  38. package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
  39. package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
  40. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
  41. package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
  42. package/telegram-plugin/tests/boot-probes.test.ts +16 -18
  43. package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
  44. package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
  45. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
  46. package/telegram-plugin/tests/silence-poke.test.ts +237 -0
  47. package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
  48. package/telegram-plugin/turn-flush-safety.ts +55 -1
  49. package/telegram-plugin/uat/SETUP.md +16 -12
  50. package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
  51. package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
  52. package/telegram-plugin/tests/hostd-dispatch.test.ts +0 -129
@@ -52,6 +52,7 @@ import { OutboundDedupCache } from '../recent-outbound-dedup.js'
52
52
  import { createInboundCoalescer, inboundCoalesceKey } from './inbound-coalesce.js'
53
53
  import { StatusReactionController } from '../status-reactions.js'
54
54
  import { isTelegramReplyTool, isTelegramSurfaceTool } from '../tool-names.js'
55
+ import { toolLabel } from '../tool-labels.js'
55
56
  import { createTypingWrapper } from '../typing-wrap.js'
56
57
  import { type DraftStreamHandle } from '../draft-stream.js'
57
58
  import { handlePtyPartialPure, type PtyHandlerState } from '../pty-partial-handler.js'
@@ -94,6 +95,8 @@ import {
94
95
  import type { AuthBrokerClient } from './auth-command.js'
95
96
  import type { ListStateData } from './auth-line.js'
96
97
  import { getAuthBrokerClient, addAccountViaBroker } from './auth-broker-client.js'
98
+ import { resolveAuthBrokerSocketPath } from '../../src/auth/broker/client.js'
99
+ import { createFleetFallbackGate } from '../fleet-fallback-gate.js'
97
100
  import {
98
101
  pendingAuthAddFlows,
99
102
  startAccountAuthSession,
@@ -124,6 +127,7 @@ import {
124
127
  formatModelUnavailableCard,
125
128
  resolveModelUnavailableFromOperatorEvent,
126
129
  } from '../model-unavailable.js'
130
+ import { runFleetAutoFallback } from '../auto-fallback-fleet.js'
127
131
  import { startRestartWatchdog } from './restart-watchdog.js'
128
132
  import { validateStringArray } from './access-validator.js'
129
133
 
@@ -160,6 +164,11 @@ import {
160
164
  TELEGRAM_SWITCHROOM_COMMANDS,
161
165
  type AgentMetadata, type AuthSummary, type StatusProbeRow,
162
166
  } from '../welcome-text.js'
167
+ import {
168
+ type BrokerStateView,
169
+ type ClaudeJsonView,
170
+ buildAuthSummaryFromBroker,
171
+ } from './auth-status-adapter.js'
163
172
  import {
164
173
  isContextExhaustionText,
165
174
  shouldArmOrphanedReplyTimeout,
@@ -186,39 +195,52 @@ import {
186
195
  import { sweepActiveReactions } from '../active-reactions-sweep.js'
187
196
  import { flushOnAgentDisconnect } from './disconnect-flush.js'
188
197
  import { PreambleSuppressor } from './preamble-suppressor.js'
198
+ import {
199
+ fetchFolderPage,
200
+ FolderListCache,
201
+ } from '../../src/drive/folder-list.js'
202
+ import { loadFromAuthBroker } from '../../src/drive/wrapper-broker.js'
203
+ import {
204
+ handleFoldersCommand,
205
+ handleFolderPickerCallback,
206
+ type FolderPickerHandlerDeps,
207
+ } from './folder-picker-handler.js'
208
+ import {
209
+ approvalConsume as kernelApprovalConsume,
210
+ approvalRecord as kernelApprovalRecord,
211
+ approvalRequest as kernelApprovalRequest,
212
+ } from '../../src/vault/approvals/client.js'
189
213
  import {
190
214
  fetchQuota,
191
215
  formatQuotaBlock,
192
216
  } from '../quota-check.js'
193
217
  import {
194
- evaluateFallbackTrigger,
195
- performAutoFallback,
196
- emptyLockout,
197
218
  loadLockout,
198
- nextLockout,
199
- saveLockout,
200
219
  DEFAULT_FALLBACK_COOLDOWN_MS,
201
- type LockoutRecord,
202
220
  type LockoutPersistOps,
203
221
  } from '../auto-fallback.js'
204
- import { markSlotQuotaExhausted, DEFAULT_SLOT } from '../../src/auth/accounts.js'
205
- import { fallbackToNextSlot, currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
222
+ import { DEFAULT_SLOT } from '../../src/auth/accounts.js'
223
+ import { currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
206
224
  import { injectSlashCommand as injectSlashCommandImpl } from '../../src/agents/inject.js'
207
225
  import { handleInjectCommand } from './inject-handler.js'
208
226
  import { type BannerState } from '../slot-banner.js'
209
227
  import { refreshBanner } from '../slot-banner-driver.js'
210
- import { dispatchFallbackNotification } from '../auto-fallback-dispatcher.js'
211
228
  import { loadConfig as loadSwitchroomConfig } from '../../src/config/loader.js'; import { resolveAgentConfig } from '../../src/config/merge.js'
212
229
  import {
213
230
  tryHostdDispatch,
214
231
  hostdRequestId,
215
232
  hostdWillBeUsed,
233
+ pollHostdStatus,
234
+ warnLegacySpawnIfHostdDisabled,
216
235
  _resetHostdEnabledCache,
217
236
  } from './hostd-dispatch.js'
237
+ import type { HostdRequest } from '../../src/host-control/protocol.js'
218
238
  import type { AgentAudit } from '../welcome-text.js'
219
239
  import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
220
240
 
221
241
  import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
242
+ import { handleRequestDriveApproval } from './drive-write-approval.js'
243
+ import { buildDiffPreviewCard } from './diff-preview-card.js'
222
244
  import { createPendingInboundBuffer } from './pending-inbound-buffer.js'
223
245
  import {
224
246
  buildVaultGrantApprovedInbound,
@@ -1086,6 +1108,14 @@ type CurrentTurn = {
1086
1108
  gatewayReceiveAt: number
1087
1109
  replyCalled: boolean
1088
1110
  capturedText: string[]
1111
+ // #1291: snapshot of capturedText.length at the moment of the most
1112
+ // recent reply / stream_reply tool call. Used by decideTurnFlush to
1113
+ // isolate the post-reply tail (e.g. a soft-commit reply followed by
1114
+ // the real substantive answer in terminal text only) and flush it as
1115
+ // a follow-up message. Pre-#1291 the existence of ANY reply call
1116
+ // suppressed flush entirely — that lost long terminal-only answers
1117
+ // after a "let me check" interim reply.
1118
+ capturedTextLenAtLastReply: number
1089
1119
  orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
1090
1120
  registryKey: string | null
1091
1121
  // Last assistant outbound message id for the current turn — populated
@@ -1974,6 +2004,13 @@ const awaitingAuthCodeAt = new Map<string, number>()
1974
2004
  const AUTH_CODE_CONTEXT_TTL_MS = 5 * 60_000 // 5 min — OAuth code lifetime
1975
2005
  const DEFERRED_SECRET_TTL_MS = 24 * 60 * 60_000 // 24 h — ignored one-tap cards
1976
2006
 
2007
+ // Freshness throttle for `auth:refresh` taps. Keyed by `<chat_id>:<message_id>`
2008
+ // so two different snapshot messages throttle independently. Each refresh
2009
+ // fan-fires N live api.anthropic.com probes (one per account), so we cap
2010
+ // rapid re-taps to one per AUTH_REFRESH_THROTTLE_MS.
2011
+ const lastAuthRefreshAtMs = new Map<string, number>()
2012
+ const AUTH_REFRESH_THROTTLE_MS = 5_000
2013
+
1977
2014
  // ─── TTL reaper ───────────────────────────────────────────────────────────
1978
2015
  // Pending state maps above all grow whenever a flow starts and only shrink
1979
2016
  // when the flow completes. Users abandoning a flow (closing Telegram, losing
@@ -2037,6 +2074,12 @@ const pendingStateReaper = setInterval(() => {
2037
2074
  for (const [k, v] of awaitingAuthCodeAt) {
2038
2075
  if (now - v > AUTH_CODE_CONTEXT_TTL_MS) awaitingAuthCodeAt.delete(k)
2039
2076
  }
2077
+ // Auth-refresh throttle entries decay quickly (5s window); sweep
2078
+ // anything older than 60s so abandoned snapshot messages don't pin
2079
+ // their key forever.
2080
+ for (const [k, v] of lastAuthRefreshAtMs) {
2081
+ if (now - v > 60_000) lastAuthRefreshAtMs.delete(k)
2082
+ }
2040
2083
  // /auth rm two-step confirm window — self-expires at `expiresAt`.
2041
2084
  for (const [k, v] of pendingAuthRmFlows) {
2042
2085
  if (now >= v.expiresAt) pendingAuthRmFlows.delete(k)
@@ -2241,11 +2284,33 @@ function emitGatewayOperatorEvent(event: OperatorEvent): void {
2241
2284
  let renderedText: string
2242
2285
  let renderedKeyboard: ReturnType<typeof renderOperatorEvent>['keyboard'] | undefined
2243
2286
  if (modelUnavailable) {
2287
+ // Two questions, asked synchronously to avoid the "card promises
2288
+ // an announcement that never arrives" trap:
2289
+ // 1. Is this a kind that AUTO-fallback can address?
2290
+ // 2. Will the dispatcher actually fire (vs. dedup-drop)?
2291
+ // Card text branches on the AND. wouldFireFleetAutoFallback is a
2292
+ // pure read of the dedup state; calling fireFleetAutoFallback only
2293
+ // when both are true keeps the card honest.
2294
+ const isAutoKind =
2295
+ modelUnavailable.kind === 'quota_exhausted' || modelUnavailable.kind === 'overload'
2296
+ const willActuallyFire = isAutoKind && wouldFireFleetAutoFallback()
2244
2297
  process.stderr.write(
2245
- `telegram gateway: operator-event suppressing-raw-stderr-for-model-unavailable agent=${agent} kind=${kind} detected=${modelUnavailable.kind}\n`,
2298
+ `telegram gateway: operator-event suppressing-raw-stderr-for-model-unavailable agent=${agent} kind=${kind} detected=${modelUnavailable.kind} autoKind=${isAutoKind} willFire=${willActuallyFire}\n`,
2246
2299
  )
2247
- renderedText = formatModelUnavailableCard(modelUnavailable, agent)
2300
+ renderedText = formatModelUnavailableCard(modelUnavailable, agent, {
2301
+ autoFallbackInFlight: willActuallyFire,
2302
+ })
2248
2303
  renderedKeyboard = undefined
2304
+ // Trigger fleet-wide auto-fallback. Pre-fix this branch only
2305
+ // rendered the card; the fallback machinery was unreachable from
2306
+ // here. We fire-and-forget so card delivery is never blocked on
2307
+ // broker / API latency. The fallback's own announcement is sent
2308
+ // separately with the causal-shape headline ("5-hour limit on
2309
+ // ken" instead of generic "quota exhausted") — see
2310
+ // auth-snapshot-format.ts → renderFallbackAnnouncement.
2311
+ if (willActuallyFire) {
2312
+ void fireFleetAutoFallback(agent)
2313
+ }
2249
2314
  } else {
2250
2315
  try {
2251
2316
  const r = renderOperatorEvent(event)
@@ -2513,6 +2578,7 @@ silencePoke.startTimer({
2513
2578
  const text = silencePoke.formatFrameworkFallbackText(
2514
2579
  ctx.fallbackKind,
2515
2580
  ctx.silenceMs,
2581
+ ctx.inFlightTools,
2516
2582
  )
2517
2583
  try {
2518
2584
  await robustApiCall(
@@ -2820,9 +2886,46 @@ const ipcServer: IpcServer = createIpcServer({
2820
2886
  const key = statusKey(currentTurn.sessionChatId, currentTurn.sessionThreadId)
2821
2887
  if (ev.kind === 'thinking') {
2822
2888
  silencePoke.noteThinking(key, Date.now())
2823
- } else if (ev.kind === 'tool_use' && (ev.toolName === 'Task' || ev.toolName === 'Agent')) {
2824
- // Built-in claude sub-agent dispatch extends soft threshold to 5min.
2825
- silencePoke.noteSubagentDispatch(key)
2889
+ } else if (ev.kind === 'tool_use') {
2890
+ if (ev.toolName === 'Task' || ev.toolName === 'Agent') {
2891
+ // Built-in claude sub-agent dispatch — extends soft threshold to 5min.
2892
+ silencePoke.noteSubagentDispatch(key)
2893
+ }
2894
+ // #1292: track in-flight tool calls so the 300s framework
2895
+ // fallback message can name the actual observable (e.g.
2896
+ // "running Grep \"foo\" for 4m") instead of the dishonest
2897
+ // generic "still working… no update in 5 min" when the agent
2898
+ // is clearly busy on tool calls. Telegram-surface tools are
2899
+ // excluded — their job IS the outbound message, the silence
2900
+ // clock resets via noteOutbound when they fire. Sub-agent
2901
+ // tool_use events (kind='sub_agent_tool_use') intentionally
2902
+ // NOT tracked: the parent's Task tool_use is already on the
2903
+ // map and represents the user-observable wait.
2904
+ if (
2905
+ ev.toolUseId != null
2906
+ && ev.toolUseId.length > 0
2907
+ && !isTelegramSurfaceTool(ev.toolName)
2908
+ ) {
2909
+ const label = toolLabel(
2910
+ ev.toolName,
2911
+ ev.input,
2912
+ /*preamble*/ undefined,
2913
+ ev.precomputedLabel,
2914
+ )
2915
+ silencePoke.noteToolStart(
2916
+ key,
2917
+ ev.toolUseId,
2918
+ ev.toolName,
2919
+ label.length > 0 ? label : null,
2920
+ Date.now(),
2921
+ )
2922
+ }
2923
+ } else if (ev.kind === 'tool_result') {
2924
+ // #1292: drain the in-flight entry. Idempotent on unknown ids
2925
+ // (covers Telegram-surface tools we skipped at start time).
2926
+ if (ev.toolUseId != null && ev.toolUseId.length > 0) {
2927
+ silencePoke.noteToolEnd(key, ev.toolUseId, Date.now())
2928
+ }
2826
2929
  }
2827
2930
  }
2828
2931
  },
@@ -2959,6 +3062,69 @@ const ipcServer: IpcServer = createIpcServer({
2959
3062
  * Logs every fire so an operator can correlate the agent's
2960
3063
  * transcript turn against the scheduler's audit row by `prompt_key`.
2961
3064
  */
3065
+ async onRequestDriveApproval(client: IpcClient, msg) {
3066
+ // RFC E §4.2 Cut 2 — Drive-write PreToolUse hook is asking the
3067
+ // gateway to post a diff-preview card so the user can decide.
3068
+ await handleRequestDriveApproval(client, msg, {
3069
+ agentName: getMyAgentName(),
3070
+ loadAllowFrom: () => loadAccess().allowFrom,
3071
+ loadTargetChat: () => {
3072
+ const access = loadAccess()
3073
+ const operator = access.allowFrom[0]
3074
+ if (operator === undefined) return null
3075
+ // For DM-paired setups the target chat IS the operator's
3076
+ // user id. For group setups the gateway already has a topic
3077
+ // routing surface (see how /folders posts) — this picks the
3078
+ // DM path which is the common case; group-routing follow-up
3079
+ // can extend this.
3080
+ return { chatId: operator }
3081
+ },
3082
+ registerApproval: async (args) => {
3083
+ const r = await kernelApprovalRequest({
3084
+ agent_unit: args.agent_unit,
3085
+ scope: args.scope,
3086
+ action: args.action,
3087
+ approver_set: args.approver_set,
3088
+ why: args.why,
3089
+ ttl_ms: args.ttl_ms,
3090
+ })
3091
+ if (r === null || r.state === 'rate_limited') return null
3092
+ return {
3093
+ request_id: r.request_id,
3094
+ expires_at_ms: r.expires_at,
3095
+ }
3096
+ },
3097
+ postCard: async (args) => {
3098
+ try {
3099
+ const sent = await robustApiCall(
3100
+ () =>
3101
+ bot.api.sendMessage(args.chatId, args.text, {
3102
+ parse_mode: 'HTML',
3103
+ ...(args.threadId !== undefined
3104
+ ? { message_thread_id: args.threadId }
3105
+ : {}),
3106
+ reply_markup: args.replyMarkup as never,
3107
+ }),
3108
+ {
3109
+ chat_id: String(args.chatId),
3110
+ verb: 'drive-approval-card',
3111
+ ...(args.threadId !== undefined ? { threadId: args.threadId } : {}),
3112
+ },
3113
+ )
3114
+ return { messageId: (sent as { message_id: number }).message_id }
3115
+ } catch (err) {
3116
+ process.stderr.write(
3117
+ `telegram gateway: drive-approval postCard failed: ${(err as Error).message}\n`,
3118
+ )
3119
+ return null
3120
+ }
3121
+ },
3122
+ buildCard: ({ preview, suggestRequestId }) =>
3123
+ buildDiffPreviewCard({ preview, suggestRequestId }),
3124
+ log: (m) => process.stderr.write(`telegram gateway: drive-approval — ${m}\n`),
3125
+ })
3126
+ },
3127
+
2962
3128
  onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
2963
3129
  const promptKey = typeof msg.inbound.meta?.prompt_key === 'string'
2964
3130
  ? msg.inbound.meta.prompt_key
@@ -4638,6 +4804,7 @@ function handleSessionEvent(ev: SessionEvent): void {
4638
4804
  gatewayReceiveAt: startedAt,
4639
4805
  replyCalled: false,
4640
4806
  capturedText: [],
4807
+ capturedTextLenAtLastReply: 0,
4641
4808
  orphanedReplyTimeoutId: null,
4642
4809
  registryKey: null,
4643
4810
  lastAssistantMsgId: null,
@@ -4734,6 +4901,12 @@ function handleSessionEvent(ev: SessionEvent): void {
4734
4901
  // placeholder-heartbeat label, which has been retired.
4735
4902
  if (isTelegramReplyTool(name)) {
4736
4903
  turn.replyCalled = true
4904
+ // #1291: pin the captured-text index at the moment of this reply
4905
+ // tool call. Anything pushed into capturedText after this point
4906
+ // is the post-reply tail (e.g. the substantive answer composed
4907
+ // in terminal text after a soft-commit "on it, back in a few").
4908
+ // decideTurnFlush slices from this index to flush the tail.
4909
+ turn.capturedTextLenAtLastReply = turn.capturedText.length
4737
4910
  if (turn.orphanedReplyTimeoutId != null) {
4738
4911
  clearTimeout(turn.orphanedReplyTimeoutId)
4739
4912
  turn.orphanedReplyTimeoutId = null
@@ -4993,8 +5166,20 @@ function handleSessionEvent(ev: SessionEvent): void {
4993
5166
  chatId: turn.sessionChatId,
4994
5167
  replyCalled: turn.replyCalled,
4995
5168
  capturedText: turn.capturedText,
5169
+ capturedTextLenAtLastReply: turn.capturedTextLenAtLastReply,
4996
5170
  flushEnabled: TURN_FLUSH_SAFETY_ENABLED,
4997
5171
  })
5172
+ // #1291: when the model emitted a soft-commit reply followed by a
5173
+ // substantive terminal-only answer, decideTurnFlush returns
5174
+ // kind:'flush' with the post-reply tail. Log WARN so this case is
5175
+ // auditable — the model SHOULD have called reply for the tail, but
5176
+ // didn't, and the framework is covering for it.
5177
+ if (flushDecision.kind === 'flush' && turn.replyCalled) {
5178
+ process.stderr.write(
5179
+ `telegram gateway: WARN post-reply-tail flush (#1291) — model emitted ${flushDecision.text.length} chars after a prior reply call without a follow-up reply tool` +
5180
+ ` chat=${chatId} turnStartedAt=${turn.startedAt}\n`,
5181
+ )
5182
+ }
4998
5183
  if (flushDecision.kind === 'skip' && flushDecision.reason !== 'reply-called') {
4999
5184
  process.stderr.write(
5000
5185
  `telegram gateway: turn-flush skipped — reason=${flushDecision.reason}\n`,
@@ -5144,6 +5329,21 @@ function handleSessionEvent(ev: SessionEvent): void {
5144
5329
  // backup; reset the preamble buffer (its content is already in
5145
5330
  // the captured `capturedText`, which turn-flush is about to send).
5146
5331
  preambleSuppressor.dropNow()
5332
+ // #1289 fix — drain silence-poke + signal-tracker state for this
5333
+ // turn. The three sibling turn_end exit branches (context-exhaust
5334
+ // at ~5098, silent-marker at ~5097-5098, default reply-called tail
5335
+ // at ~5348-5349) all call signalTracker.clear + silencePoke.endTurn.
5336
+ // The flush-backstop branch was retrofitted in #1067 to null
5337
+ // currentTurn early but never had this cleanup added — leaving the
5338
+ // silence-poke state in the Map, so 300s after the original turn
5339
+ // start the framework fallback fires and the user sees
5340
+ // "still working… (no update from agent in 5 min)" on a turn the
5341
+ // gateway already considers over.
5342
+ {
5343
+ const tKey = statusKey(chatId, threadId)
5344
+ signalTracker.clear(tKey)
5345
+ silencePoke.endTurn(tKey)
5346
+ }
5147
5347
 
5148
5348
  void (async () => {
5149
5349
  await new Promise<void>(resolve => setTimeout(resolve, 500))
@@ -7388,6 +7588,75 @@ async function executeVaultOp(ctx: Context, chatId: string, op: 'list' | 'get' |
7388
7588
  }
7389
7589
  }
7390
7590
 
7591
+ /**
7592
+ * Dispatch a short-running verb (agent_start, agent_stop, cross-agent
7593
+ * agent_restart) through hostd when available, else fall back to the
7594
+ * legacy in-container CLI shell-out.
7595
+ *
7596
+ * Why: on docker-mode hosts the agent container has no docker binary,
7597
+ * so the legacy `runSwitchroomCommand` path silently exits 127 for any
7598
+ * verb that touches compose (RFC C §1, #926). Hostd runs on the host
7599
+ * with the docker socket mounted, so the verb actually works.
7600
+ *
7601
+ * Result handling:
7602
+ * - `not-configured` → fall back to {@link runSwitchroomCommand}.
7603
+ * (Operator opted out; let the legacy path's existing error
7604
+ * surfacing handle the exit-127 case.)
7605
+ * - `completed` → reply with the stdout tail (mirrors the legacy
7606
+ * path's formatted-output reply).
7607
+ * - `started` → reply with a brief "🔄 dispatched" ack. Verbs that
7608
+ * return `started` (agent_restart) finish asynchronously on the
7609
+ * daemon; the audit log is the canonical record.
7610
+ * - `error` / `denied` → reply with the error tail inline. No
7611
+ * fallback (RFC §7 hard-fail contract — operator opted in).
7612
+ */
7613
+ async function dispatchShortVerbViaHostd(
7614
+ ctx: Context,
7615
+ req: HostdRequest,
7616
+ label: string,
7617
+ legacyArgs: string[],
7618
+ ): Promise<void> {
7619
+ const hostdResp = await tryHostdDispatch(getMyAgentName(), req)
7620
+ if (hostdResp === 'not-configured') {
7621
+ warnLegacySpawnIfHostdDisabled(req.op)
7622
+ await runSwitchroomCommand(ctx, legacyArgs, label)
7623
+ return
7624
+ }
7625
+ if (hostdResp.result === 'completed') {
7626
+ const body = hostdResp.stdout_tail?.trim() || `${label}: done (exit ${hostdResp.exit_code})`
7627
+ const formatted = formatSwitchroomOutput(stripAnsi(body))
7628
+ if (formatted) {
7629
+ await switchroomReply(ctx, preBlock(formatted), { html: true })
7630
+ } else {
7631
+ await switchroomReply(ctx, `${label}: done (no output)`)
7632
+ }
7633
+ return
7634
+ }
7635
+ if (hostdResp.result === 'started') {
7636
+ await switchroomReply(
7637
+ ctx,
7638
+ `🔄 <b>${escapeHtmlForTg(label)}</b> dispatched via hostd ` +
7639
+ `(request_id=<code>${escapeHtmlForTg(hostdResp.request_id)}</code>). ` +
7640
+ `Check audit log for completion.`,
7641
+ { html: true },
7642
+ )
7643
+ return
7644
+ }
7645
+ // error / denied — surface inline. RFC §7 hard-fail: no spawn fallback.
7646
+ const errBody =
7647
+ hostdResp.error ??
7648
+ hostdResp.stderr_tail ??
7649
+ hostdResp.stdout_tail ??
7650
+ '(no error tail returned)'
7651
+ await switchroomReply(
7652
+ ctx,
7653
+ `❌ <b>${escapeHtmlForTg(label)} failed via hostd</b> ` +
7654
+ `(result=${escapeHtmlForTg(hostdResp.result)}):\n` +
7655
+ preBlock(stripAnsi(errBody)),
7656
+ { html: true },
7657
+ )
7658
+ }
7659
+
7391
7660
  async function runSwitchroomCommand(ctx: Context, args: string[], label: string): Promise<void> {
7392
7661
  try {
7393
7662
  const output = stripAnsi(switchroomExec(args))
@@ -7620,13 +7889,13 @@ function buildAgentAudit(agentName: string): AgentAudit | undefined {
7620
7889
  }
7621
7890
 
7622
7891
  // Build an AgentMetadata snapshot for the current agent by shelling out
7623
- // to `switchroom agent list --json` and `switchroom auth status --json`.
7624
- // TODO(rfc-h): the `auth status` verb was retired by RFC H. The shell
7625
- // fails silently and `authSummary` lands as null — /status renders
7626
- // without auth detail. Replace with an `auth show --json` adapter that
7627
- // maps the new fleet-broker shape to the per-agent AuthSummary fields.
7892
+ // to `switchroom agent list --json` and `switchroom auth show --json`.
7628
7893
  // Best-effort — any missing piece renders as a placeholder in the text
7629
- // templates rather than blocking the reply.
7894
+ // templates rather than blocking the reply. RFC H retired the per-agent
7895
+ // `auth status --json` shape; auth state is now derived from the
7896
+ // broker's fleet-wide `ListStateData` payload via
7897
+ // `buildAuthSummaryFromBroker`, with billingType pulled from the
7898
+ // agent's `.claude.json` (the broker doesn't track plan tier).
7630
7899
  async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
7631
7900
  type AgentListResp = {
7632
7901
  agents: Array<{
@@ -7636,24 +7905,22 @@ async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
7636
7905
  model?: string | null;
7637
7906
  }>
7638
7907
  }
7639
- type AuthStatusResp = {
7640
- agents: Array<{
7641
- name: string; authenticated: boolean; auth_source: string | null;
7642
- subscription_type: string | null; expires_in: string | null;
7643
- }>
7644
- }
7645
7908
  const list = switchroomExecJson<AgentListResp>(['agent', 'list'])
7646
- const auth = switchroomExecJson<AuthStatusResp>(['auth', 'status'])
7909
+ const brokerState = switchroomExecJson<BrokerStateView>(['auth', 'show'])
7647
7910
  const a = list?.agents?.find(x => x.name === agentName) ?? null
7648
- const au = auth?.agents?.find(x => x.name === agentName) ?? null
7649
- const authSummary: AuthSummary | null = au
7650
- ? {
7651
- authenticated: au.authenticated,
7652
- subscription_type: au.subscription_type,
7653
- expires_in: au.expires_in,
7654
- auth_source: au.auth_source,
7655
- }
7656
- : null
7911
+ let claudeJson: ClaudeJsonView | null = null
7912
+ try {
7913
+ const agentDir = resolveAgentDirFromEnv()
7914
+ if (agentDir) {
7915
+ const raw = readFileSync(join(agentDir, '.claude', '.claude.json'), 'utf8')
7916
+ claudeJson = JSON.parse(raw) as ClaudeJsonView
7917
+ }
7918
+ } catch { /* leave null — billingType becomes null in the summary */ }
7919
+ const authSummary: AuthSummary | null = buildAuthSummaryFromBroker(
7920
+ brokerState,
7921
+ agentName,
7922
+ claudeJson,
7923
+ )
7657
7924
  return {
7658
7925
  agentName,
7659
7926
  model: a?.model ?? null,
@@ -7798,14 +8065,24 @@ bot.command('agentstart', async ctx => {
7798
8065
  if (!isAuthorizedSender(ctx)) return
7799
8066
  const name = ctx.match?.trim() || getMyAgentName()
7800
8067
  try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
7801
- await runSwitchroomCommand(ctx, ['agent', 'start', name], `start ${name}`)
8068
+ await dispatchShortVerbViaHostd(
8069
+ ctx,
8070
+ { v: 1, op: 'agent_start', request_id: hostdRequestId('gw-start'), args: { name } },
8071
+ `start ${name}`,
8072
+ ['agent', 'start', name],
8073
+ )
7802
8074
  })
7803
8075
 
7804
8076
  bot.command('stop', async ctx => {
7805
8077
  if (!isAuthorizedSender(ctx)) return
7806
8078
  const name = ctx.match?.trim() || getMyAgentName()
7807
8079
  try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
7808
- await runSwitchroomCommand(ctx, ['agent', 'stop', name], `stop ${name}`)
8080
+ await dispatchShortVerbViaHostd(
8081
+ ctx,
8082
+ { v: 1, op: 'agent_stop', request_id: hostdRequestId('gw-stop'), args: { name } },
8083
+ `stop ${name}`,
8084
+ ['agent', 'stop', name],
8085
+ )
7809
8086
  })
7810
8087
 
7811
8088
  bot.command('restart', async ctx => {
@@ -7852,6 +8129,7 @@ bot.command('restart', async ctx => {
7852
8129
  args: { name, force: true, reason: 'user: /restart from chat' },
7853
8130
  })
7854
8131
  if (hostdResp === 'not-configured') {
8132
+ warnLegacySpawnIfHostdDisabled('agent_restart')
7855
8133
  spawnSwitchroomDetached(
7856
8134
  ['agent', 'restart', name, '--force'],
7857
8135
  notifyDetachedFailure(chatId, threadId ?? null, `restart ${name}`),
@@ -7874,7 +8152,22 @@ bot.command('restart', async ctx => {
7874
8152
  )
7875
8153
  return
7876
8154
  }
7877
- await runSwitchroomCommand(ctx, ['agent', 'restart', name], `restart ${name}`)
8155
+ // Cross-agent /restart <other>. Same hostd-first shape as self-target,
8156
+ // but no restart marker / no self-kill: another agent's container is
8157
+ // about to bounce, not ours. The daemon spawns the work and returns
8158
+ // "started" (per handleAgentRestart at server.ts:466), so the user
8159
+ // sees a brief dispatch ack and the audit log carries the outcome.
8160
+ await dispatchShortVerbViaHostd(
8161
+ ctx,
8162
+ {
8163
+ v: 1,
8164
+ op: 'agent_restart',
8165
+ request_id: hostdRequestId('gw-restart-cross'),
8166
+ args: { name, force: true, reason: `user: /restart ${name} from chat` },
8167
+ },
8168
+ `restart ${name}`,
8169
+ ['agent', 'restart', name],
8170
+ )
7878
8171
  })
7879
8172
 
7880
8173
  // ─── /new and /reset ──────────────────────────────────────────────────────
@@ -7993,6 +8286,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
7993
8286
  args: { name, force: true, reason: `user: /${kind} from chat` },
7994
8287
  })
7995
8288
  if (hostdResp === 'not-configured') {
8289
+ warnLegacySpawnIfHostdDisabled('agent_restart')
7996
8290
  spawnSwitchroomDetached(
7997
8291
  ['agent', 'restart', name, '--force'],
7998
8292
  notifyDetachedFailure(chatId, threadId ?? null, `${kind} ${name}`),
@@ -8156,23 +8450,83 @@ bot.command('update', async ctx => {
8156
8450
  await sweepBeforeSelfRestart()
8157
8451
  const skipImages = passthrough.includes('--skip-images')
8158
8452
  const rebuild = passthrough.includes('--rebuild')
8453
+ const updateRequestId = hostdRequestId('gw-update')
8159
8454
  const hostdResp = await tryHostdDispatch(getMyAgentName(), {
8160
8455
  v: 1,
8161
8456
  op: 'update_apply',
8162
- request_id: hostdRequestId('gw-update'),
8457
+ request_id: updateRequestId,
8163
8458
  args: {
8164
8459
  ...(skipImages ? { skip_images: true } : {}),
8165
8460
  ...(rebuild ? { rebuild: true } : {}),
8166
8461
  },
8167
8462
  })
8168
8463
  if (hostdResp === 'not-configured') {
8464
+ warnLegacySpawnIfHostdDisabled('update_apply')
8169
8465
  spawnSwitchroomDetached(
8170
8466
  ['update', ...passthrough],
8171
8467
  notifyDetachedFailure(chatId, threadId ?? null, 'update'),
8172
8468
  )
8173
8469
  return
8174
8470
  }
8175
- if (hostdResp.result === 'started' || hostdResp.result === 'completed') {
8471
+ if (hostdResp.result === 'completed') {
8472
+ return
8473
+ }
8474
+ if (hostdResp.result === 'started') {
8475
+ // RFC C §5.3: long-running mutation. Poll get_status until terminal
8476
+ // or until the recreate kills this gateway (whichever happens first).
8477
+ // The success signal is the post-restart greeting card edited into
8478
+ // ackId via the restart marker. The poll is here so that
8479
+ // *fail-before-recreate* (image pull error, scaffold regen crash)
8480
+ // doesn't leave the operator staring at the orphan "🚀 update started"
8481
+ // ack indefinitely. Live repro: PR #1305.
8482
+ void (async () => {
8483
+ // 60s budget: RFC C §5.3 specs `apply` at 30s and `update_apply`
8484
+ // at 60s. Image pulls + scaffold regeneration dominate the wall
8485
+ // clock for update_apply, hence the larger budget. The poll
8486
+ // resolves earlier on any terminal state from the daemon.
8487
+ const terminal = await pollHostdStatus(getMyAgentName(), updateRequestId, {
8488
+ timeoutMs: 60_000,
8489
+ })
8490
+ if (terminal === 'not-configured') return
8491
+ // completed → recreate is about to run / has run; let the post-
8492
+ // restart greeting card handle the success message.
8493
+ if (terminal.result === 'completed') return
8494
+ // Anything else means the daemon's mutation failed before it could
8495
+ // kill us. Edit the ack to surface the tail and clear the marker
8496
+ // so the next gateway boot doesn't render a false success card.
8497
+ clearRestartMarker()
8498
+ const errBody =
8499
+ terminal.error ??
8500
+ terminal.stderr_tail ??
8501
+ terminal.stdout_tail ??
8502
+ '(no error tail returned)'
8503
+ const editedText =
8504
+ `🚀 <b>update started</b> — <b>FAILED</b> via hostd ` +
8505
+ `(result=${escapeHtmlForTg(terminal.result)}):\n` +
8506
+ preBlock(errBody)
8507
+ if (ackId != null) {
8508
+ try {
8509
+ await robustApiCall(
8510
+ () =>
8511
+ lockedBot.api.editMessageText(chatId, ackId!, editedText, {
8512
+ parse_mode: 'HTML',
8513
+ link_preview_options: { is_disabled: true },
8514
+ }),
8515
+ { verb: 'update.poll.editAck' },
8516
+ )
8517
+ } catch {
8518
+ // edit-failed (message deleted, parse error) — fall back to
8519
+ // a fresh reply so the failure isn't silent.
8520
+ try {
8521
+ await switchroomReply(ctx, editedText, { html: true })
8522
+ } catch {}
8523
+ }
8524
+ } else {
8525
+ try {
8526
+ await switchroomReply(ctx, editedText, { html: true })
8527
+ } catch {}
8528
+ }
8529
+ })()
8176
8530
  return
8177
8531
  }
8178
8532
  clearRestartMarker()
@@ -8209,6 +8563,81 @@ bot.command('upgrade', async ctx => {
8209
8563
  )
8210
8564
  })
8211
8565
 
8566
+ // /audit hostd — tail/filter the hostd audit log. Mirrors `/vault audit`
8567
+ // in spirit (operator observability over a privileged subsystem from any
8568
+ // admin DM). Admin-gated via ADMIN_COMMAND_NAMES. Reads the audit JSONL
8569
+ // at ~/.switchroom/host-control-audit.log directly — no hostd RPC needed
8570
+ // because the file is shared via the host bind mount on docker installs.
8571
+ bot.command('audit', async ctx => {
8572
+ if (!isAuthorizedSender(ctx)) return
8573
+ const arg = (ctx.match ?? '').trim()
8574
+ if (arg === '' || arg === 'help' || arg === '--help') {
8575
+ await switchroomReply(
8576
+ ctx,
8577
+ 'Usage: <code>/audit hostd [--tail N] [--agent &lt;name&gt;] [--op &lt;verb&gt;] [--error]</code>',
8578
+ { html: true },
8579
+ )
8580
+ return
8581
+ }
8582
+ const tokens = arg.split(/\s+/)
8583
+ const sub = tokens[0]
8584
+ if (sub !== 'hostd') {
8585
+ await switchroomReply(
8586
+ ctx,
8587
+ `Unknown audit target <code>${escapeHtmlForTg(sub ?? '')}</code>. ` +
8588
+ `Supported: <code>hostd</code>.`,
8589
+ { html: true },
8590
+ )
8591
+ return
8592
+ }
8593
+ // Build the CLI argv for switchroom hostd audit. Validate each
8594
+ // operator-supplied value to keep argv injection out of the picture.
8595
+ const ALLOWED_OPS = new Set([
8596
+ 'agent_start', 'agent_stop', 'agent_restart', 'apply',
8597
+ 'update_check', 'update_apply', 'update_status', 'upgrade_status',
8598
+ 'get_status', 'doctor', 'fleet_state',
8599
+ ])
8600
+ const argv: string[] = ['hostd', 'audit']
8601
+ for (let i = 1; i < tokens.length; i++) {
8602
+ const t = tokens[i]!
8603
+ if (t === '--error') { argv.push('--error'); continue }
8604
+ if (t === '--tail' || t === '--agent' || t === '--op') {
8605
+ const v = tokens[++i]
8606
+ if (v == null) {
8607
+ await switchroomReply(ctx, `Flag <code>${t}</code> requires a value.`, { html: true })
8608
+ return
8609
+ }
8610
+ if (t === '--tail' && !/^[0-9]{1,4}$/.test(v)) {
8611
+ await switchroomReply(ctx, `<code>--tail</code> must be an integer (1-9999).`, { html: true })
8612
+ return
8613
+ }
8614
+ if (t === '--agent' && !/^[a-z][a-z0-9-]{0,62}$/i.test(v)) {
8615
+ await switchroomReply(ctx, `<code>--agent</code> name has an invalid shape.`, { html: true })
8616
+ return
8617
+ }
8618
+ if (t === '--op' && !ALLOWED_OPS.has(v)) {
8619
+ await switchroomReply(
8620
+ ctx,
8621
+ `Unknown hostd verb <code>${escapeHtmlForTg(v)}</code>. ` +
8622
+ `Known: ${[...ALLOWED_OPS].sort().map(o => `<code>${o}</code>`).join(', ')}.`,
8623
+ { html: true },
8624
+ )
8625
+ return
8626
+ }
8627
+ argv.push(t, v)
8628
+ continue
8629
+ }
8630
+ await switchroomReply(
8631
+ ctx,
8632
+ `Unknown flag <code>${escapeHtmlForTg(t)}</code>. ` +
8633
+ `Allowed: <code>--tail</code>, <code>--agent</code>, <code>--op</code>, <code>--error</code>.`,
8634
+ { html: true },
8635
+ )
8636
+ return
8637
+ }
8638
+ await runSwitchroomCommand(ctx, argv, `hostd audit${argv.length > 2 ? ' …' : ''}`)
8639
+ })
8640
+
8212
8641
  // ─── /approve, /deny, /pending ────────────────────────────────────────────
8213
8642
  // Slash-command alternatives to the inline-button approval flow (useful for
8214
8643
  // desktop-only sessions and power-users). Share pendingPermissions state
@@ -8272,6 +8701,59 @@ async function handlePermissionSlash(ctx: Context, behavior: 'allow' | 'deny'):
8272
8701
  bot.command('approve', async ctx => handlePermissionSlash(ctx, 'allow'))
8273
8702
  bot.command('deny', async ctx => handlePermissionSlash(ctx, 'deny'))
8274
8703
 
8704
+ // ─── Drive folder picker (RFC E §4.1) ───────────────────────────────────
8705
+ // /folders — post a Telegram picker card listing this agent's top-level
8706
+ // Drive folders. Tap [Allow] on a folder to grant the agent
8707
+ // allow_always at doc:gdrive:folder/<id>/**; tap [Browse] to drill in.
8708
+ //
8709
+ // Authorisation: same dmCommandGate as the other operator slash
8710
+ // commands — only allowFrom users can post-trigger.
8711
+
8712
+ const folderPickerCache = new FolderListCache()
8713
+
8714
+ function buildFolderPickerDeps(): FolderPickerHandlerDeps {
8715
+ const agentName = getMyAgentName()
8716
+ return {
8717
+ agentName,
8718
+ cache: folderPickerCache,
8719
+ fetchPage: async ({ parent_id, page_token }) => {
8720
+ const handle = await loadFromAuthBroker()
8721
+ if (handle === null) {
8722
+ throw new Error(
8723
+ `auth-broker unreachable for agent ${agentName} — is the broker container running?`,
8724
+ )
8725
+ }
8726
+ return fetchFolderPage({
8727
+ access_token: handle.access_token,
8728
+ ...(parent_id !== undefined ? { parent_id } : {}),
8729
+ ...(page_token !== undefined ? { page_token } : {}),
8730
+ })
8731
+ },
8732
+ approvalRequest: async (args) => {
8733
+ const r = await kernelApprovalRequest({
8734
+ agent_unit: args.agent_unit,
8735
+ scope: args.scope,
8736
+ action: args.action,
8737
+ approver_set: args.approver_set,
8738
+ ...(args.why !== null && args.why !== undefined ? { why: args.why } : {}),
8739
+ ...(args.ttl_ms !== null && args.ttl_ms !== undefined ? { ttl_ms: args.ttl_ms } : {}),
8740
+ })
8741
+ if (r === null || r.state === 'rate_limited') return null
8742
+ return { request_id: r.request_id }
8743
+ },
8744
+ approvalConsume: async (id) => {
8745
+ const r = await kernelApprovalConsume(id)
8746
+ return r !== null && r.consumed
8747
+ },
8748
+ approvalRecord: async (args) => kernelApprovalRecord(args),
8749
+ }
8750
+ }
8751
+
8752
+ bot.command('folders', async ctx => {
8753
+ if (!isAuthorizedSender(ctx)) return
8754
+ await handleFoldersCommand(ctx, buildFolderPickerDeps())
8755
+ })
8756
+
8275
8757
  // /pending — list current pending permission prompts with their ids, so the
8276
8758
  // user can target a specific one via /approve <id> or /deny <id>.
8277
8759
  // Restricted to access.allowFrom DMs to match /approve and /deny — it
@@ -8303,16 +8785,12 @@ bot.command('interrupt', async ctx => {
8303
8785
  await runSwitchroomCommand(ctx, ['agent', 'interrupt', name], `interrupt ${name}`)
8304
8786
  })
8305
8787
 
8306
- // Shared auto-fallback state. `lockout` is a per-process in-memory
8307
- // guard against rapid re-fire between the scheduled poll and any
8308
- // manual trigger (see telegram-plugin/auto-fallback.ts).
8309
- //
8310
- // Pre-#417 fix this was always emptyLockout() at process start, so a
8311
- // gateway restart inside the cooldown window reset the timer and a
8312
- // quota-flap on the recovering slot could re-trigger fallback the
8313
- // moment the gateway came back. We now seed from disk on first use
8314
- // and persist on every transition. Errors are swallowed: losing the
8315
- // lockout file just degrades to in-memory-only behaviour.
8788
+ // Persist-ops bundle for the legacy auto-fallback lockout file. The
8789
+ // only remaining reader is `isAutoFallbackCooldownActive` (line ~2030)
8790
+ // used by the pending-restart drain cap to defer a forced restart
8791
+ // stacking on top of an in-flight slot rotation. The legacy poller
8792
+ // that USED to write this file was retired alongside this refactor;
8793
+ // existing on-disk lockouts age out via DEFAULT_FALLBACK_COOLDOWN_MS.
8316
8794
  const lockoutOps: LockoutPersistOps = {
8317
8795
  readFileSync: (p, enc) => readFileSync(p, enc),
8318
8796
  writeFileSync: (p, data, opts) => writeFileSync(p, data, opts),
@@ -8320,24 +8798,6 @@ const lockoutOps: LockoutPersistOps = {
8320
8798
  mkdirSync: (p, opts) => mkdirSync(p, opts),
8321
8799
  joinPath: (...parts) => join(...parts),
8322
8800
  }
8323
- let autoFallbackLockout: LockoutRecord = emptyLockout()
8324
- let autoFallbackLockoutSeeded = false
8325
- function seedAutoFallbackLockoutIfNeeded(agentDir: string): void {
8326
- if (autoFallbackLockoutSeeded) return
8327
- autoFallbackLockoutSeeded = true
8328
- try {
8329
- autoFallbackLockout = loadLockout(agentDir, lockoutOps)
8330
- } catch (err) {
8331
- process.stderr.write(`telegram gateway: auto-fallback lockout seed failed (using empty): ${(err as Error).message}\n`)
8332
- }
8333
- }
8334
- function persistLockout(agentDir: string): void {
8335
- try {
8336
- saveLockout(agentDir, autoFallbackLockout, lockoutOps)
8337
- } catch (err) {
8338
- process.stderr.write(`telegram gateway: auto-fallback lockout persist failed: ${(err as Error).message}\n`)
8339
- }
8340
- }
8341
8801
 
8342
8802
  // Pinned slot-banner state (#421). One banner per gateway process,
8343
8803
  // in the owner chat (access.allowFrom[0]). Per-topic forum support
@@ -8368,91 +8828,129 @@ async function refreshPinnedBanner(reason: string): Promise<void> {
8368
8828
  }
8369
8829
  }
8370
8830
 
8371
- type AutoFallbackCheckResult =
8372
- | { kind: 'no-action'; reason: string; decision: 'noop' | 'fallback-skipped' }
8373
- | { kind: 'executed'; previousSlot: string; newSlot: string }
8374
- | { kind: 'exhausted-all'; activeSlot: string }
8375
- | { kind: 'error'; message: string }
8831
+ /**
8832
+ * Re-entry guard + dedup window for `fireFleetAutoFallback`. The state
8833
+ * was lifted into `fleet-fallback-gate.ts` so it can be tested in
8834
+ * isolation (gateway.ts module state was unreachable from vitest). The
8835
+ * gate ALSO enforces the broker-reachability honesty contract: when the
8836
+ * broker is down, `wouldFire()` returns false so the model-unavailable
8837
+ * card stays honest instead of advertising a swap that would bail with
8838
+ * `reason=no-broker-client`.
8839
+ */
8840
+ const FLEET_FALLBACK_DEDUP_MS = 30_000
8841
+
8842
+ /** Synchronous reachability check for the auth-broker UDS. Used by the
8843
+ * fleet-fallback gate to keep the model-unavailable card honest: if the
8844
+ * broker socket isn't bound, the dispatcher would bail with
8845
+ * `reason=no-broker-client`, so `wouldFire()` should return false and
8846
+ * the card should fall back to the manual `/auth use <label>` hint. */
8847
+ function isAuthBrokerSocketReachable(): boolean {
8848
+ try {
8849
+ return existsSync(resolveAuthBrokerSocketPath())
8850
+ } catch {
8851
+ return false
8852
+ }
8853
+ }
8854
+
8855
+ const fleetFallbackGate = createFleetFallbackGate({
8856
+ dedupMs: FLEET_FALLBACK_DEDUP_MS,
8857
+ brokerReachable: isAuthBrokerSocketReachable,
8858
+ })
8859
+
8860
+ function wouldFireFleetAutoFallback(): boolean {
8861
+ return fleetFallbackGate.wouldFire()
8862
+ }
8376
8863
 
8377
- async function runAutoFallbackCheck(opts: { trigger: 'scheduled' | 'manual' }): Promise<AutoFallbackCheckResult> {
8378
- // All log lines in this path use the `[autofallback]` tag so a single
8379
- // grep against journalctl reconstructs the full decision history of
8380
- // a slot rotation: `journalctl -u switchroom-<agent>-gateway -g autofallback`.
8864
+ /**
8865
+ * Fleet-wide auto-fallback dispatcher (RFC H follow-up).
8866
+ *
8867
+ * Wired from the model-unavailable card render path so a quota-out
8868
+ * event on ANY agent immediately triggers a fleet-wide swap (via
8869
+ * broker.setActive — same path /auth use takes), not the per-agent
8870
+ * legacy `runAutoFallbackCheck`. Pre-fix, the card path never called
8871
+ * any fallback machinery; the scheduled poller (60-min interval, only
8872
+ * fires on utilization headers) was the only trigger and missed
8873
+ * hard-rejection events.
8874
+ *
8875
+ * Concurrency: collapses concurrent triggers via the in-flight
8876
+ * promise tracked by `fleetFallbackGate` (declared above). Subsequent calls within `FLEET_FALLBACK_DEDUP_MS` of
8877
+ * a recent fire are dropped silently — the broadcast announcement is
8878
+ * the user-visible signal that the swap happened, no need to repeat.
8879
+ *
8880
+ * Fire-and-forget: never throws into the caller's flow. Posts the
8881
+ * causal-shape announcement to every chat in `loadAccess().allowFrom`
8882
+ * so the user sees the outcome inline with the original "Model
8883
+ * unavailable" card.
8884
+ */
8885
+ async function fireFleetAutoFallback(triggerAgent: string): Promise<void> {
8886
+ return fleetFallbackGate.fire(
8887
+ () => doFireFleetAutoFallback(triggerAgent),
8888
+ (err) => {
8889
+ process.stderr.write(
8890
+ `telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
8891
+ )
8892
+ },
8893
+ )
8894
+ }
8895
+
8896
+ /** Returns true iff the dispatcher actually performed a swap (and the
8897
+ * announcement was queued, fire-and-forget, to every allowFrom chat, if any). False on no-op /
8898
+ * error / idempotent-skip — caller uses this to decide whether to
8899
+ * arm the post-fire suppression window. */
8900
+ async function doFireFleetAutoFallback(triggerAgent: string): Promise<boolean> {
8381
8901
  try {
8382
- const agentDir = resolveAgentDirFromEnv()
8383
- if (!agentDir) {
8384
- return { kind: 'no-action', reason: 'no agent dir', decision: 'noop' }
8385
- }
8386
- const agentName = getMyAgentName()
8387
- seedAutoFallbackLockoutIfNeeded(agentDir)
8388
- const active = currentActiveSlot(agentDir)
8389
- const quota = await fetchQuota({ claudeConfigDir: join(agentDir, '.claude') })
8390
- const decision = evaluateFallbackTrigger({
8391
- quota,
8392
- activeSlot: active,
8393
- now: Date.now(),
8394
- lockout: autoFallbackLockout,
8395
- })
8396
- if (decision.action !== 'fallback') {
8902
+ const client = await getAuthBrokerClient(triggerAgent)
8903
+ if (!client) {
8397
8904
  process.stderr.write(
8398
- `telegram gateway: [autofallback] noop trigger=${opts.trigger} agent=${agentName} active=${active ?? 'none'} reason=${decision.reason}\n`,
8905
+ `telegram gateway: [fleet-fallback] skipped agent=${triggerAgent} reason=no-broker-client\n`,
8399
8906
  )
8400
- return { kind: 'no-action', reason: decision.reason, decision: 'noop' }
8907
+ return false
8401
8908
  }
8402
- process.stderr.write(
8403
- `telegram gateway: [autofallback] decision=fallback trigger=${opts.trigger} agent=${agentName} active=${active ?? 'none'} reason=${decision.triggerReason} util=${decision.utilizationPct?.toFixed(1) ?? 'n/a'}%\n`,
8404
- )
8405
- const plan = performAutoFallback({
8406
- agentDir,
8407
- agentName,
8408
- decision,
8409
- deps: { currentActiveSlot, markSlotQuotaExhausted, fallbackToNextSlot },
8909
+ const state = await client.listState()
8910
+ // Probe live quota via the broker (#1336). Pre-fix this read
8911
+ // credentials.json off the agent HOME, which is never populated
8912
+ // post-RFC-H every account looked "no credentials" and the
8913
+ // fallback logic rolled blindly. Broker-routed probes use the
8914
+ // canonical stored tokens.
8915
+ const probeResp = state.accounts.length > 0
8916
+ ? await client.probeQuota(state.accounts.map((a) => a.label)).catch(() => ({ results: [] }))
8917
+ : { results: [] }
8918
+ const quotas = state.accounts.map((a) => {
8919
+ const hit = probeResp.results.find((r) => r.label === a.label)
8920
+ return hit?.result ?? { ok: false as const, reason: 'broker returned no result for account' }
8410
8921
  })
8411
- const ownerChatId = loadAccess().allowFrom[0]
8412
- await dispatchFallbackNotification({
8413
- bot,
8414
- ownerChatId,
8415
- plan,
8416
- onError: (err) => {
8417
- process.stderr.write(`telegram gateway: [autofallback] notify failed trigger=${opts.trigger} agent=${agentName}: ${err}\n`)
8418
- },
8922
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
8923
+ const outcome = await runFleetAutoFallback({
8924
+ state,
8925
+ quotas,
8926
+ setActive: (label) => client.setActive(label),
8927
+ triggerAgent,
8928
+ tz,
8419
8929
  })
8420
- if (plan.kind === 'executed') {
8421
- try { assertSafeAgentName(plan.agentName) }
8422
- catch {
8423
- process.stderr.write(`telegram gateway: [autofallback] invalid-agent-name agent=${plan.agentName}\n`)
8424
- return { kind: 'error', message: `invalid agent name: ${plan.agentName}` }
8425
- }
8426
- try {
8427
- // Preemptive failover (utilization-over-threshold / explicit) waits
8428
- // for the active turn to drain. Reactive failover (429-response)
8429
- // hard-restarts because the request that triggered it has already
8430
- // failed — there's no in-flight turn worth preserving. See #420.
8431
- const restartArgs = ['agent', 'restart', plan.agentName]
8432
- if (plan.triggerReason !== '429-response') {
8433
- restartArgs.push('--graceful-restart')
8434
- }
8435
- process.stderr.write(
8436
- `telegram gateway: [autofallback] executed agent=${plan.agentName} prev=${plan.previousSlot} next=${plan.newSlot} restart=${plan.triggerReason === '429-response' ? 'hard' : 'graceful'}\n`,
8437
- )
8438
- switchroomExec(restartArgs)
8439
- } catch (err) {
8440
- process.stderr.write(`telegram gateway: [autofallback] restart failed agent=${plan.agentName}: ${err}\n`)
8441
- }
8442
- autoFallbackLockout = nextLockout(plan.previousSlot, Date.now())
8443
- persistLockout(agentDir)
8444
- void refreshPinnedBanner('auto-fallback')
8445
- return { kind: 'executed', previousSlot: plan.previousSlot, newSlot: plan.newSlot }
8446
- }
8447
8930
  process.stderr.write(
8448
- `telegram gateway: [autofallback] exhausted-all agent=${agentName} active=${plan.activeSlot}\n`,
8931
+ `telegram gateway: [fleet-fallback] outcome=${outcome.kind} agent=${triggerAgent}` +
8932
+ (outcome.kind === 'switched' ? ` old=${outcome.oldLabel} new=${outcome.newLabel}` : '') +
8933
+ '\n',
8449
8934
  )
8450
- autoFallbackLockout = nextLockout(plan.activeSlot, Date.now())
8451
- persistLockout(agentDir)
8452
- return { kind: 'exhausted-all', activeSlot: plan.activeSlot }
8935
+ // Post the announcement to every authorized chat. Mirrors the
8936
+ // operator-event broadcast pattern (line ~2290) — DM-only opts
8937
+ // (no message_thread_id) so THREAD_NOT_FOUND can't fire here;
8938
+ // wrap in swallowingApiCall anyway per the codebase rule.
8939
+ const access = loadAccess()
8940
+ if (access.allowFrom.length === 0) return outcome.kind === 'switched'
8941
+ const opts = { parse_mode: 'HTML' as const }
8942
+ for (const chat_id of access.allowFrom) {
8943
+ void swallowingApiCall(
8944
+ () => bot.api.sendMessage(chat_id, outcome.announcement, opts),
8945
+ { chat_id, verb: 'fleet-fallback:notify' },
8946
+ )
8947
+ }
8948
+ return outcome.kind === 'switched'
8453
8949
  } catch (err) {
8454
- process.stderr.write(`telegram gateway: [autofallback] ${opts.trigger} poll error: ${err}\n`)
8455
- return { kind: 'error', message: String((err as Error).message ?? err) }
8950
+ process.stderr.write(
8951
+ `telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
8952
+ )
8953
+ return false
8456
8954
  }
8457
8955
  }
8458
8956
 
@@ -8512,15 +9010,6 @@ async function runCreditWatch(): Promise<void> {
8512
9010
  }
8513
9011
  }
8514
9012
 
8515
- // /authfallback was removed in v0.6.12 — it duplicated the work of
8516
- // the dashboard's Switch primary picker (operator-facing surface) and
8517
- // the auto-fallback poller (transparent on-quota-wall case).
8518
- // Operators who want to manually shuffle the active credential now
8519
- // use the picker. The `runAutoFallbackCheck` function and the
8520
- // `case 'fallback':` callback dispatch stay in the codebase: any
8521
- // pinned messages from earlier versions still work, and the
8522
- // auto-fallback poller still calls runAutoFallbackCheck directly.
8523
-
8524
9013
  bot.command("auth", async ctx => {
8525
9014
  if (!isAuthorizedSender(ctx)) return
8526
9015
  const text = ctx.message?.text ?? ""
@@ -8614,8 +9103,60 @@ bot.command("auth", async ctx => {
8614
9103
  isAdmin,
8615
9104
  client,
8616
9105
  chatId,
9106
+ // Format 2 enricher — live quota probe via the broker (#1336).
9107
+ // Pre-broker this read `~/.switchroom/accounts/<label>/credentials.json`
9108
+ // off the agent's HOME, which post-RFC-H is never populated (broker
9109
+ // writes only the per-agent .claude/.credentials.json mirror) — so
9110
+ // every account showed "no credentials.json or accessToken" in
9111
+ // /auth show. The broker is the source of truth for tokens and now
9112
+ // does the Anthropic probe server-side via `probe-quota`. Tokens
9113
+ // never leave the broker container.
9114
+ liveQuotas: async (accounts) => {
9115
+ try {
9116
+ const { results } = await client.probeQuota(accounts.map((a) => a.label))
9117
+ // Preserve input order (broker also preserves it, but be defensive).
9118
+ return accounts.map((a) => {
9119
+ const hit = results.find((r) => r.label === a.label)
9120
+ if (!hit) return { ok: false as const, reason: "broker returned no result for account" }
9121
+ return hit.result
9122
+ })
9123
+ } catch (err) {
9124
+ // Surface a uniform per-account failure so the dashboard renders
9125
+ // gracefully (label badge stays UNKNOWN) instead of falling back
9126
+ // to the legacy table.
9127
+ const reason = `broker probe-quota failed: ${(err as Error)?.message ?? String(err)}`
9128
+ return accounts.map(() => ({ ok: false as const, reason }))
9129
+ }
9130
+ },
9131
+ tz: process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ,
8617
9132
  })
8618
- await switchroomReply(ctx, reply.text, { html: reply.html })
9133
+ // Translate the handler's optional keyboard shape into grammy's
9134
+ // `reply_markup`. Buttons with `callbackData` become callback_data;
9135
+ // buttons with `insertText` become switch_inline_query_current_chat
9136
+ // (taps paste the slash-command into the user's input). Keep a
9137
+ // safe default for buttons missing both (shouldn't happen).
9138
+ if (reply.keyboard && reply.keyboard.length > 0) {
9139
+ // Build via grammy's InlineKeyboard so the type is correct
9140
+ // for switchroomReply's reply_markup field — no `as never`
9141
+ // cast needed.
9142
+ const kb = new InlineKeyboard()
9143
+ for (let i = 0; i < reply.keyboard.length; i++) {
9144
+ const row = reply.keyboard[i]!
9145
+ for (const b of row) {
9146
+ if (b.callbackData) kb.text(b.text, b.callbackData)
9147
+ else if (b.insertText) kb.switchInlineCurrent(b.text, b.insertText)
9148
+ else kb.text(b.text, 'auth:noop')
9149
+ }
9150
+ // grammy's row terminator — except after the last row.
9151
+ if (i < reply.keyboard.length - 1) kb.row()
9152
+ }
9153
+ await switchroomReply(ctx, reply.text, {
9154
+ html: reply.html,
9155
+ reply_markup: kb,
9156
+ })
9157
+ } else {
9158
+ await switchroomReply(ctx, reply.text, { html: reply.html })
9159
+ }
8619
9160
  })
8620
9161
 
8621
9162
  // Boot-card auth-row loader (issue #708, RFC H rewire). Queries the
@@ -10243,12 +10784,154 @@ async function handleOperatorEventCallback(ctx: Context, data: string): Promise<
10243
10784
  // stub so any stale pinned message that fires an `auth:*` tap is
10244
10785
  // silently dismissed instead of crashing the gateway.
10245
10786
  async function handleAuthDashboardCallback(ctx: Context): Promise<void> {
10787
+ const data = ctx.callbackQuery?.data ?? ''
10788
+ const currentAgent = getMyAgentName()
10789
+
10790
+ // auth:use:<label> — fleet-wide swap via broker.setActive (same path
10791
+ // /auth use takes from chat). Admin-gated via the broker's own
10792
+ // per-agent admin flag.
10793
+ if (data.startsWith('auth:use:')) {
10794
+ const label = data.slice('auth:use:'.length)
10795
+ if (!label) {
10796
+ try { await ctx.answerCallbackQuery({ text: 'Missing account label.', show_alert: false }) } catch { /* */ }
10797
+ return
10798
+ }
10799
+ try {
10800
+ const client = await getAuthBrokerClient(currentAgent)
10801
+ if (!client) {
10802
+ try { await ctx.answerCallbackQuery({ text: 'Broker unreachable.', show_alert: true }) } catch { /* */ }
10803
+ return
10804
+ }
10805
+ const result = await client.setActive(label)
10806
+ try {
10807
+ await ctx.answerCallbackQuery({
10808
+ text: `Switched fleet → ${result.active} (${result.fanned.length} agents)`,
10809
+ show_alert: false,
10810
+ })
10811
+ } catch { /* toast may fail on stale tap */ }
10812
+ // Edit the source message to reflect the new active. Leaving
10813
+ // the old keyboard intact would tempt a double-tap; we replace
10814
+ // the text + drop the keyboard so the user has to /auth again
10815
+ // to see fresh state.
10816
+ const msg = ctx.callbackQuery?.message
10817
+ if (msg) {
10818
+ // Wrap in swallowingApiCall per #1075 — stale callback-source
10819
+ // messages (deleted topic, expired) shouldn't crash the swap.
10820
+ await swallowingApiCall(
10821
+ () =>
10822
+ bot.api.editMessageText(
10823
+ msg.chat.id,
10824
+ msg.message_id,
10825
+ `<b>Active account →</b> <code>${escapeHtmlForTg(result.active)}</code>\n` +
10826
+ `<i>Re-mirrored credentials for ${result.fanned.length} agent${result.fanned.length === 1 ? '' : 's'}.</i>\n\n` +
10827
+ `<i>Tap /auth to see updated quota for the new active account.</i>`,
10828
+ { parse_mode: 'HTML' },
10829
+ ),
10830
+ { chat_id: String(msg.chat.id), verb: 'auth:use:edit' },
10831
+ )
10832
+ }
10833
+ } catch (err) {
10834
+ const msg = (err as Error)?.message ?? String(err)
10835
+ try {
10836
+ await ctx.answerCallbackQuery({
10837
+ text: `Switch failed: ${msg.slice(0, 180)}`,
10838
+ show_alert: true,
10839
+ })
10840
+ } catch { /* */ }
10841
+ }
10842
+ return
10843
+ }
10844
+
10845
+ // auth:refresh — re-render the /auth snapshot in-place with a fresh
10846
+ // live probe. Replaces the message body; keyboard stays.
10847
+ if (data === 'auth:refresh') {
10848
+ // Freshness throttle: each refresh fan-fires N live api.anthropic.com
10849
+ // probes (one per account, force=true bypasses the 5-min cache).
10850
+ // Without this, a user double-tapping the ↻ button burns through
10851
+ // their account's RPM budget on duplicate work. Cap at one per
10852
+ // AUTH_REFRESH_THROTTLE_MS per (chat, message) pair.
10853
+ const refreshMsg = ctx.callbackQuery?.message
10854
+ if (refreshMsg) {
10855
+ const key = `${refreshMsg.chat.id}:${refreshMsg.message_id}`
10856
+ const lastAtMs = lastAuthRefreshAtMs.get(key) ?? 0
10857
+ const sinceLastMs = Date.now() - lastAtMs
10858
+ if (sinceLastMs < AUTH_REFRESH_THROTTLE_MS) {
10859
+ const waitS = Math.ceil((AUTH_REFRESH_THROTTLE_MS - sinceLastMs) / 1000)
10860
+ try {
10861
+ await ctx.answerCallbackQuery({
10862
+ text: `Just refreshed — try again in ${waitS}s`,
10863
+ show_alert: false,
10864
+ })
10865
+ } catch { /* */ }
10866
+ return
10867
+ }
10868
+ lastAuthRefreshAtMs.set(key, Date.now())
10869
+ }
10870
+ try {
10871
+ const client = await getAuthBrokerClient(currentAgent)
10872
+ if (!client) {
10873
+ try { await ctx.answerCallbackQuery({ text: 'Broker unreachable.', show_alert: true }) } catch { /* */ }
10874
+ return
10875
+ }
10876
+ const state = await client.listState()
10877
+ // Broker-routed probe (#1336) — see gateway.ts:8910 for diagnosis.
10878
+ const probeResp = state.accounts.length > 0
10879
+ ? await client.probeQuota(state.accounts.map((a) => a.label)).catch(() => ({ results: [] }))
10880
+ : { results: [] }
10881
+ const quotas = state.accounts.map((a) => {
10882
+ const hit = probeResp.results.find((r) => r.label === a.label)
10883
+ return hit?.result ?? { ok: false as const, reason: 'broker returned no result for account' }
10884
+ })
10885
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
10886
+ const { renderAuthSnapshotFormat2, buildSnapshotsFromState, buildSnapshotKeyboard } = await import(
10887
+ '../auth-snapshot-format.js'
10888
+ )
10889
+ const snapshots = buildSnapshotsFromState(state, quotas)
10890
+ const text = renderAuthSnapshotFormat2(snapshots, {
10891
+ tz,
10892
+ now: new Date(),
10893
+ liveProbedAtMs: Date.now(),
10894
+ })
10895
+ const kbRows = buildSnapshotKeyboard(snapshots)
10896
+ const inline_keyboard = kbRows.map((row) =>
10897
+ row.map((b) => {
10898
+ if (b.callbackData) return { text: b.text, callback_data: b.callbackData }
10899
+ if (b.insertText) return { text: b.text, switch_inline_query_current_chat: b.insertText }
10900
+ return { text: b.text, callback_data: 'auth:noop' }
10901
+ }),
10902
+ )
10903
+ const msg = ctx.callbackQuery?.message
10904
+ if (msg) {
10905
+ await swallowingApiCall(
10906
+ () =>
10907
+ bot.api.editMessageText(msg.chat.id, msg.message_id, text, {
10908
+ parse_mode: 'HTML',
10909
+ reply_markup: { inline_keyboard },
10910
+ }),
10911
+ { chat_id: String(msg.chat.id), verb: 'auth:refresh:edit' },
10912
+ )
10913
+ }
10914
+ try { await ctx.answerCallbackQuery({ text: 'Refreshed.', show_alert: false }) } catch { /* */ }
10915
+ } catch (err) {
10916
+ const msg = (err as Error)?.message ?? String(err)
10917
+ try {
10918
+ await ctx.answerCallbackQuery({
10919
+ text: `Refresh failed: ${msg.slice(0, 180)}`,
10920
+ show_alert: true,
10921
+ })
10922
+ } catch { /* */ }
10923
+ }
10924
+ return
10925
+ }
10926
+
10927
+ // Unknown auth:* — likely from a too-old message. Dismiss with a
10928
+ // hint pointing at the canonical re-render verb.
10246
10929
  try {
10247
10930
  await ctx.answerCallbackQuery({
10248
- text: "This button is from the old /auth dashboard (removed in RFC H). Send /auth show instead.",
10931
+ text: 'Unknown auth button. Send /auth for current state.',
10249
10932
  show_alert: false,
10250
10933
  })
10251
- } catch { /* tap from a too-old message — drop */ }
10934
+ } catch { /* */ }
10252
10935
  }
10253
10936
 
10254
10937
  // /reauth was removed in v0.6.13 — the `/auth` dashboard's
@@ -10659,6 +11342,47 @@ bot.command('issues', async ctx => {
10659
11342
 
10660
11343
  bot.command('usage', async ctx => {
10661
11344
  if (!isAuthorizedSender(ctx)) return
11345
+ // Format 2 path: enumerate every account in the broker's known set,
11346
+ // probe live quota in parallel, render the health-grouped snapshot.
11347
+ // Falls back to the legacy single-agent shape when the broker is
11348
+ // unreachable, since /usage was historically callable against any
11349
+ // agent regardless of fleet state.
11350
+ const currentAgent = getMyAgentName()
11351
+ try {
11352
+ const client = await getAuthBrokerClient(currentAgent)
11353
+ if (client) {
11354
+ const state = await client.listState()
11355
+ if (state.accounts.length > 0) {
11356
+ // Broker-routed probe (#1336) — see gateway.ts:8910 for diagnosis.
11357
+ const probeResp = await client.probeQuota(state.accounts.map((a) => a.label)).catch(() => ({ results: [] }))
11358
+ const quotas = state.accounts.map((a) => {
11359
+ const hit = probeResp.results.find((r) => r.label === a.label)
11360
+ return hit?.result ?? { ok: false as const, reason: 'broker returned no result for account' }
11361
+ })
11362
+ const { renderAuthSnapshotFormat2, buildSnapshotsFromState } = await import(
11363
+ '../auth-snapshot-format.js'
11364
+ )
11365
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
11366
+ const snapshots = buildSnapshotsFromState(state, quotas)
11367
+ const text = renderAuthSnapshotFormat2(snapshots, {
11368
+ tz,
11369
+ now: new Date(),
11370
+ liveProbedAtMs: Date.now(),
11371
+ })
11372
+ await switchroomReply(ctx, text, { html: true })
11373
+ return
11374
+ }
11375
+ }
11376
+ } catch (err) {
11377
+ process.stderr.write(
11378
+ `telegram gateway: /usage Format 2 path failed agent=${currentAgent}: ${(err as Error)?.message ?? err}\n`,
11379
+ )
11380
+ // fall through to legacy single-agent path
11381
+ }
11382
+
11383
+ // Legacy single-agent path — kept as a graceful fallback when the
11384
+ // broker is unreachable (post-RFC-H rewire boot timing, broken
11385
+ // socket bind, etc.). Same shape /usage shipped with originally.
10662
11386
  const agentDir = resolveAgentDirFromEnv()
10663
11387
  if (!agentDir) {
10664
11388
  await switchroomReply(ctx, '<b>/usage:</b> cannot resolve agent dir.', { html: true })
@@ -10783,6 +11507,29 @@ bot.on('callback_query:data', async ctx => {
10783
11507
  return
10784
11508
  }
10785
11509
 
11510
+ // RFC E §4.1: drvpick:<verb>:<agent>[:<...>] — folder-picker card taps.
11511
+ // open / enter / back / refresh re-render the card in place;
11512
+ // grant writes an allow_always kernel decision at
11513
+ // doc:gdrive:folder/<id>/** and edits the card to a confirmation.
11514
+ //
11515
+ // Auth gate: the picker grant is an OPERATOR action (mirrors the
11516
+ // `op:`/`vd:`/`vg:` family, not the `apv:` agent-approval shape).
11517
+ // Mirror those patterns — refuse callbacks from anyone outside
11518
+ // `access.allowFrom`. Without this, a group member who isn't in
11519
+ // the operator allowlist could still tap [✅ Allow "<folder>"] on
11520
+ // a card that landed in the group and write an `allow_always`
11521
+ // decision attributed to themselves.
11522
+ if (data.startsWith('drvpick:')) {
11523
+ const access = loadAccess()
11524
+ const senderId = String(ctx.from?.id ?? '')
11525
+ if (!access.allowFrom.includes(senderId)) {
11526
+ await ctx.answerCallbackQuery({ text: 'Not authorized.' })
11527
+ return
11528
+ }
11529
+ await handleFolderPickerCallback(ctx, data, buildFolderPickerDeps())
11530
+ return
11531
+ }
11532
+
10786
11533
  // op:<action>:<encoded-agent> callbacks from operator-events.ts
10787
11534
  // renderOperatorEvent(). Agent name is URL-encoded at emit (issue #24).
10788
11535
  // Actions: dismiss, restart, reauth, swap-slot, add-slot, logs.
@@ -12723,23 +13470,6 @@ void (async () => {
12723
13470
  }
12724
13471
  } catch {}
12725
13472
 
12726
- // Auto-fallback on quota exhaustion. Periodically polls
12727
- // the active slot's rate-limit headers; when utilization >= 99.5%
12728
- // or a 429 is observed, marks the slot exhausted, swaps to the
12729
- // next healthy slot via src/auth, restarts the agent, and posts
12730
- // a notification to the owner chat. See telegram-plugin/auto-fallback.ts
12731
- // for the pure decision logic + notification builder.
12732
- //
12733
- // Default poll cadence: every 60 minutes. Set
12734
- // SWITCHROOM_AUTO_FALLBACK_POLL_MS=0 to disable the background
12735
- // poller. Pre-v0.6.12 a manual `/authfallback` typed command
12736
- // also ran the same check; that command was removed in favour
12737
- // of the `/auth` dashboard's Switch primary picker.
12738
- const AUTO_FALLBACK_POLL_MS = Number(process.env.SWITCHROOM_AUTO_FALLBACK_POLL_MS ?? 60 * 60_000)
12739
- if (AUTO_FALLBACK_POLL_MS > 0) {
12740
- setInterval(() => { void runAutoFallbackCheck({ trigger: 'scheduled' }) }, AUTO_FALLBACK_POLL_MS).unref()
12741
- }
12742
-
12743
13473
  // Credit-exhaustion watcher (#348). Reads `<agentDir>/.claude/.claude.json`
12744
13474
  // for `cachedExtraUsageDisabledReason`. Fires a Telegram notification
12745
13475
  // on transition into / out of fatal billing states (out_of_credits,