switchroom 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +5 -4
  2. package/dist/cli/drive-write-pretool.mjs +5418 -0
  3. package/dist/cli/switchroom.js +201 -24
  4. package/package.json +1 -1
  5. package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
  6. package/telegram-plugin/admin-commands/index.ts +2 -0
  7. package/telegram-plugin/auth-snapshot-format.ts +612 -0
  8. package/telegram-plugin/auto-fallback-fleet.ts +215 -0
  9. package/telegram-plugin/auto-fallback.ts +28 -301
  10. package/telegram-plugin/dist/gateway/gateway.js +4407 -2252
  11. package/telegram-plugin/fleet-fallback-gate.ts +105 -0
  12. package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
  13. package/telegram-plugin/gateway/approval-callback.ts +31 -3
  14. package/telegram-plugin/gateway/auth-command.ts +121 -10
  15. package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
  16. package/telegram-plugin/gateway/boot-card.ts +1 -1
  17. package/telegram-plugin/gateway/boot-probes.ts +6 -9
  18. package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
  19. package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
  20. package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
  21. package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
  22. package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
  23. package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
  24. package/telegram-plugin/gateway/gateway.ts +876 -173
  25. package/telegram-plugin/gateway/hostd-dispatch.ts +127 -0
  26. package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
  27. package/telegram-plugin/gateway/ipc-server.ts +69 -0
  28. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
  29. package/telegram-plugin/model-unavailable.ts +28 -12
  30. package/telegram-plugin/silence-poke.ts +153 -1
  31. package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
  32. package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
  33. package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
  34. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
  35. package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
  36. package/telegram-plugin/tests/boot-probes.test.ts +16 -18
  37. package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
  38. package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
  39. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
  40. package/telegram-plugin/tests/silence-poke.test.ts +237 -0
  41. package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
  42. package/telegram-plugin/turn-flush-safety.ts +55 -1
  43. package/telegram-plugin/uat/SETUP.md +16 -12
  44. package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
  45. package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
  46. package/telegram-plugin/tests/hostd-dispatch.test.ts +0 -129
@@ -52,6 +52,7 @@ import { OutboundDedupCache } from '../recent-outbound-dedup.js'
52
52
  import { createInboundCoalescer, inboundCoalesceKey } from './inbound-coalesce.js'
53
53
  import { StatusReactionController } from '../status-reactions.js'
54
54
  import { isTelegramReplyTool, isTelegramSurfaceTool } from '../tool-names.js'
55
+ import { toolLabel } from '../tool-labels.js'
55
56
  import { createTypingWrapper } from '../typing-wrap.js'
56
57
  import { type DraftStreamHandle } from '../draft-stream.js'
57
58
  import { handlePtyPartialPure, type PtyHandlerState } from '../pty-partial-handler.js'
@@ -94,6 +95,8 @@ import {
94
95
  import type { AuthBrokerClient } from './auth-command.js'
95
96
  import type { ListStateData } from './auth-line.js'
96
97
  import { getAuthBrokerClient, addAccountViaBroker } from './auth-broker-client.js'
98
+ import { resolveAuthBrokerSocketPath } from '../../src/auth/broker/client.js'
99
+ import { createFleetFallbackGate } from '../fleet-fallback-gate.js'
97
100
  import {
98
101
  pendingAuthAddFlows,
99
102
  startAccountAuthSession,
@@ -124,6 +127,8 @@ import {
124
127
  formatModelUnavailableCard,
125
128
  resolveModelUnavailableFromOperatorEvent,
126
129
  } from '../model-unavailable.js'
130
+ import { runFleetAutoFallback } from '../auto-fallback-fleet.js'
131
+ import { fetchAccountQuota } from '../quota-check.js'
127
132
  import { startRestartWatchdog } from './restart-watchdog.js'
128
133
  import { validateStringArray } from './access-validator.js'
129
134
 
@@ -160,6 +165,11 @@ import {
160
165
  TELEGRAM_SWITCHROOM_COMMANDS,
161
166
  type AgentMetadata, type AuthSummary, type StatusProbeRow,
162
167
  } from '../welcome-text.js'
168
+ import {
169
+ type BrokerStateView,
170
+ type ClaudeJsonView,
171
+ buildAuthSummaryFromBroker,
172
+ } from './auth-status-adapter.js'
163
173
  import {
164
174
  isContextExhaustionText,
165
175
  shouldArmOrphanedReplyTimeout,
@@ -186,39 +196,52 @@ import {
186
196
  import { sweepActiveReactions } from '../active-reactions-sweep.js'
187
197
  import { flushOnAgentDisconnect } from './disconnect-flush.js'
188
198
  import { PreambleSuppressor } from './preamble-suppressor.js'
199
+ import {
200
+ fetchFolderPage,
201
+ FolderListCache,
202
+ } from '../../src/drive/folder-list.js'
203
+ import { loadFromAuthBroker } from '../../src/drive/wrapper-broker.js'
204
+ import {
205
+ handleFoldersCommand,
206
+ handleFolderPickerCallback,
207
+ type FolderPickerHandlerDeps,
208
+ } from './folder-picker-handler.js'
209
+ import {
210
+ approvalConsume as kernelApprovalConsume,
211
+ approvalRecord as kernelApprovalRecord,
212
+ approvalRequest as kernelApprovalRequest,
213
+ } from '../../src/vault/approvals/client.js'
189
214
  import {
190
215
  fetchQuota,
191
216
  formatQuotaBlock,
192
217
  } from '../quota-check.js'
193
218
  import {
194
- evaluateFallbackTrigger,
195
- performAutoFallback,
196
- emptyLockout,
197
219
  loadLockout,
198
- nextLockout,
199
- saveLockout,
200
220
  DEFAULT_FALLBACK_COOLDOWN_MS,
201
- type LockoutRecord,
202
221
  type LockoutPersistOps,
203
222
  } from '../auto-fallback.js'
204
- import { markSlotQuotaExhausted, DEFAULT_SLOT } from '../../src/auth/accounts.js'
205
- import { fallbackToNextSlot, currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
223
+ import { DEFAULT_SLOT } from '../../src/auth/accounts.js'
224
+ import { currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
206
225
  import { injectSlashCommand as injectSlashCommandImpl } from '../../src/agents/inject.js'
207
226
  import { handleInjectCommand } from './inject-handler.js'
208
227
  import { type BannerState } from '../slot-banner.js'
209
228
  import { refreshBanner } from '../slot-banner-driver.js'
210
- import { dispatchFallbackNotification } from '../auto-fallback-dispatcher.js'
211
229
  import { loadConfig as loadSwitchroomConfig } from '../../src/config/loader.js'; import { resolveAgentConfig } from '../../src/config/merge.js'
212
230
  import {
213
231
  tryHostdDispatch,
214
232
  hostdRequestId,
215
233
  hostdWillBeUsed,
234
+ pollHostdStatus,
235
+ warnLegacySpawnIfHostdDisabled,
216
236
  _resetHostdEnabledCache,
217
237
  } from './hostd-dispatch.js'
238
+ import type { HostdRequest } from '../../src/host-control/protocol.js'
218
239
  import type { AgentAudit } from '../welcome-text.js'
219
240
  import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
220
241
 
221
242
  import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
243
+ import { handleRequestDriveApproval } from './drive-write-approval.js'
244
+ import { buildDiffPreviewCard } from './diff-preview-card.js'
222
245
  import { createPendingInboundBuffer } from './pending-inbound-buffer.js'
223
246
  import {
224
247
  buildVaultGrantApprovedInbound,
@@ -1086,6 +1109,14 @@ type CurrentTurn = {
1086
1109
  gatewayReceiveAt: number
1087
1110
  replyCalled: boolean
1088
1111
  capturedText: string[]
1112
+ // #1291: snapshot of capturedText.length at the moment of the most
1113
+ // recent reply / stream_reply tool call. Used by decideTurnFlush to
1114
+ // isolate the post-reply tail (e.g. a soft-commit reply followed by
1115
+ // the real substantive answer in terminal text only) and flush it as
1116
+ // a follow-up message. Pre-#1291 the existence of ANY reply call
1117
+ // suppressed flush entirely — that lost long terminal-only answers
1118
+ // after a "let me check" interim reply.
1119
+ capturedTextLenAtLastReply: number
1089
1120
  orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
1090
1121
  registryKey: string | null
1091
1122
  // Last assistant outbound message id for the current turn — populated
@@ -1974,6 +2005,13 @@ const awaitingAuthCodeAt = new Map<string, number>()
1974
2005
  const AUTH_CODE_CONTEXT_TTL_MS = 5 * 60_000 // 5 min — OAuth code lifetime
1975
2006
  const DEFERRED_SECRET_TTL_MS = 24 * 60 * 60_000 // 24 h — ignored one-tap cards
1976
2007
 
2008
+ // Freshness throttle for `auth:refresh` taps. Keyed by `<chat_id>:<message_id>`
2009
+ // so two different snapshot messages throttle independently. Each refresh
2010
+ // fan-fires N live api.anthropic.com probes (one per account), so we cap
2011
+ // rapid re-taps to one per AUTH_REFRESH_THROTTLE_MS.
2012
+ const lastAuthRefreshAtMs = new Map<string, number>()
2013
+ const AUTH_REFRESH_THROTTLE_MS = 5_000
2014
+
1977
2015
  // ─── TTL reaper ───────────────────────────────────────────────────────────
1978
2016
  // Pending state maps above all grow whenever a flow starts and only shrink
1979
2017
  // when the flow completes. Users abandoning a flow (closing Telegram, losing
@@ -2037,6 +2075,12 @@ const pendingStateReaper = setInterval(() => {
2037
2075
  for (const [k, v] of awaitingAuthCodeAt) {
2038
2076
  if (now - v > AUTH_CODE_CONTEXT_TTL_MS) awaitingAuthCodeAt.delete(k)
2039
2077
  }
2078
+ // Auth-refresh throttle entries decay quickly (5s window); sweep
2079
+ // anything older than 60s so abandoned snapshot messages don't pin
2080
+ // their key forever.
2081
+ for (const [k, v] of lastAuthRefreshAtMs) {
2082
+ if (now - v > 60_000) lastAuthRefreshAtMs.delete(k)
2083
+ }
2040
2084
  // /auth rm two-step confirm window — self-expires at `expiresAt`.
2041
2085
  for (const [k, v] of pendingAuthRmFlows) {
2042
2086
  if (now >= v.expiresAt) pendingAuthRmFlows.delete(k)
@@ -2241,11 +2285,33 @@ function emitGatewayOperatorEvent(event: OperatorEvent): void {
2241
2285
  let renderedText: string
2242
2286
  let renderedKeyboard: ReturnType<typeof renderOperatorEvent>['keyboard'] | undefined
2243
2287
  if (modelUnavailable) {
2288
+ // Two questions, asked synchronously to avoid the "card promises
2289
+ // an announcement that never arrives" trap:
2290
+ // 1. Is this a kind that AUTO-fallback can address?
2291
+ // 2. Will the dispatcher actually fire (vs. dedup-drop)?
2292
+ // Card text branches on the AND. wouldFireFleetAutoFallback is a
2293
+ // pure read of the dedup state; calling fireFleetAutoFallback only
2294
+ // when both are true keeps the card honest.
2295
+ const isAutoKind =
2296
+ modelUnavailable.kind === 'quota_exhausted' || modelUnavailable.kind === 'overload'
2297
+ const willActuallyFire = isAutoKind && wouldFireFleetAutoFallback()
2244
2298
  process.stderr.write(
2245
- `telegram gateway: operator-event suppressing-raw-stderr-for-model-unavailable agent=${agent} kind=${kind} detected=${modelUnavailable.kind}\n`,
2299
+ `telegram gateway: operator-event suppressing-raw-stderr-for-model-unavailable agent=${agent} kind=${kind} detected=${modelUnavailable.kind} autoKind=${isAutoKind} willFire=${willActuallyFire}\n`,
2246
2300
  )
2247
- renderedText = formatModelUnavailableCard(modelUnavailable, agent)
2301
+ renderedText = formatModelUnavailableCard(modelUnavailable, agent, {
2302
+ autoFallbackInFlight: willActuallyFire,
2303
+ })
2248
2304
  renderedKeyboard = undefined
2305
+ // Trigger fleet-wide auto-fallback. Pre-fix this branch only
2306
+ // rendered the card; the fallback machinery was unreachable from
2307
+ // here. We fire-and-forget so card delivery is never blocked on
2308
+ // broker / API latency. The fallback's own announcement is sent
2309
+ // separately with the causal-shape headline ("5-hour limit on
2310
+ // ken" instead of generic "quota exhausted") — see
2311
+ // auth-snapshot-format.ts → renderFallbackAnnouncement.
2312
+ if (willActuallyFire) {
2313
+ void fireFleetAutoFallback(agent)
2314
+ }
2249
2315
  } else {
2250
2316
  try {
2251
2317
  const r = renderOperatorEvent(event)
@@ -2513,6 +2579,7 @@ silencePoke.startTimer({
2513
2579
  const text = silencePoke.formatFrameworkFallbackText(
2514
2580
  ctx.fallbackKind,
2515
2581
  ctx.silenceMs,
2582
+ ctx.inFlightTools,
2516
2583
  )
2517
2584
  try {
2518
2585
  await robustApiCall(
@@ -2820,9 +2887,46 @@ const ipcServer: IpcServer = createIpcServer({
2820
2887
  const key = statusKey(currentTurn.sessionChatId, currentTurn.sessionThreadId)
2821
2888
  if (ev.kind === 'thinking') {
2822
2889
  silencePoke.noteThinking(key, Date.now())
2823
- } else if (ev.kind === 'tool_use' && (ev.toolName === 'Task' || ev.toolName === 'Agent')) {
2824
- // Built-in claude sub-agent dispatch extends soft threshold to 5min.
2825
- silencePoke.noteSubagentDispatch(key)
2890
+ } else if (ev.kind === 'tool_use') {
2891
+ if (ev.toolName === 'Task' || ev.toolName === 'Agent') {
2892
+ // Built-in claude sub-agent dispatch — extends soft threshold to 5min.
2893
+ silencePoke.noteSubagentDispatch(key)
2894
+ }
2895
+ // #1292: track in-flight tool calls so the 300s framework
2896
+ // fallback message can name the actual observable (e.g.
2897
+ // "running Grep \"foo\" for 4m") instead of the dishonest
2898
+ // generic "still working… no update in 5 min" when the agent
2899
+ // is clearly busy on tool calls. Telegram-surface tools are
2900
+ // excluded — their job IS the outbound message, the silence
2901
+ // clock resets via noteOutbound when they fire. Sub-agent
2902
+ // tool_use events (kind='sub_agent_tool_use') intentionally
2903
+ // NOT tracked: the parent's Task tool_use is already on the
2904
+ // map and represents the user-observable wait.
2905
+ if (
2906
+ ev.toolUseId != null
2907
+ && ev.toolUseId.length > 0
2908
+ && !isTelegramSurfaceTool(ev.toolName)
2909
+ ) {
2910
+ const label = toolLabel(
2911
+ ev.toolName,
2912
+ ev.input,
2913
+ /*preamble*/ undefined,
2914
+ ev.precomputedLabel,
2915
+ )
2916
+ silencePoke.noteToolStart(
2917
+ key,
2918
+ ev.toolUseId,
2919
+ ev.toolName,
2920
+ label.length > 0 ? label : null,
2921
+ Date.now(),
2922
+ )
2923
+ }
2924
+ } else if (ev.kind === 'tool_result') {
2925
+ // #1292: drain the in-flight entry. Idempotent on unknown ids
2926
+ // (covers Telegram-surface tools we skipped at start time).
2927
+ if (ev.toolUseId != null && ev.toolUseId.length > 0) {
2928
+ silencePoke.noteToolEnd(key, ev.toolUseId, Date.now())
2929
+ }
2826
2930
  }
2827
2931
  }
2828
2932
  },
@@ -2959,6 +3063,69 @@ const ipcServer: IpcServer = createIpcServer({
2959
3063
  * Logs every fire so an operator can correlate the agent's
2960
3064
  * transcript turn against the scheduler's audit row by `prompt_key`.
2961
3065
  */
3066
+ async onRequestDriveApproval(client: IpcClient, msg) {
3067
+ // RFC E §4.2 Cut 2 — Drive-write PreToolUse hook is asking the
3068
+ // gateway to post a diff-preview card so the user can decide.
3069
+ await handleRequestDriveApproval(client, msg, {
3070
+ agentName: getMyAgentName(),
3071
+ loadAllowFrom: () => loadAccess().allowFrom,
3072
+ loadTargetChat: () => {
3073
+ const access = loadAccess()
3074
+ const operator = access.allowFrom[0]
3075
+ if (operator === undefined) return null
3076
+ // For DM-paired setups the target chat IS the operator's
3077
+ // user id. For group setups the gateway already has a topic
3078
+ // routing surface (see how /folders posts) — this picks the
3079
+ // DM path which is the common case; group-routing follow-up
3080
+ // can extend this.
3081
+ return { chatId: operator }
3082
+ },
3083
+ registerApproval: async (args) => {
3084
+ const r = await kernelApprovalRequest({
3085
+ agent_unit: args.agent_unit,
3086
+ scope: args.scope,
3087
+ action: args.action,
3088
+ approver_set: args.approver_set,
3089
+ why: args.why,
3090
+ ttl_ms: args.ttl_ms,
3091
+ })
3092
+ if (r === null || r.state === 'rate_limited') return null
3093
+ return {
3094
+ request_id: r.request_id,
3095
+ expires_at_ms: r.expires_at,
3096
+ }
3097
+ },
3098
+ postCard: async (args) => {
3099
+ try {
3100
+ const sent = await robustApiCall(
3101
+ () =>
3102
+ bot.api.sendMessage(args.chatId, args.text, {
3103
+ parse_mode: 'HTML',
3104
+ ...(args.threadId !== undefined
3105
+ ? { message_thread_id: args.threadId }
3106
+ : {}),
3107
+ reply_markup: args.replyMarkup as never,
3108
+ }),
3109
+ {
3110
+ chat_id: String(args.chatId),
3111
+ verb: 'drive-approval-card',
3112
+ ...(args.threadId !== undefined ? { threadId: args.threadId } : {}),
3113
+ },
3114
+ )
3115
+ return { messageId: (sent as { message_id: number }).message_id }
3116
+ } catch (err) {
3117
+ process.stderr.write(
3118
+ `telegram gateway: drive-approval postCard failed: ${(err as Error).message}\n`,
3119
+ )
3120
+ return null
3121
+ }
3122
+ },
3123
+ buildCard: ({ preview, suggestRequestId }) =>
3124
+ buildDiffPreviewCard({ preview, suggestRequestId }),
3125
+ log: (m) => process.stderr.write(`telegram gateway: drive-approval — ${m}\n`),
3126
+ })
3127
+ },
3128
+
2962
3129
  onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
2963
3130
  const promptKey = typeof msg.inbound.meta?.prompt_key === 'string'
2964
3131
  ? msg.inbound.meta.prompt_key
@@ -4638,6 +4805,7 @@ function handleSessionEvent(ev: SessionEvent): void {
4638
4805
  gatewayReceiveAt: startedAt,
4639
4806
  replyCalled: false,
4640
4807
  capturedText: [],
4808
+ capturedTextLenAtLastReply: 0,
4641
4809
  orphanedReplyTimeoutId: null,
4642
4810
  registryKey: null,
4643
4811
  lastAssistantMsgId: null,
@@ -4734,6 +4902,12 @@ function handleSessionEvent(ev: SessionEvent): void {
4734
4902
  // placeholder-heartbeat label, which has been retired.
4735
4903
  if (isTelegramReplyTool(name)) {
4736
4904
  turn.replyCalled = true
4905
+ // #1291: pin the captured-text index at the moment of this reply
4906
+ // tool call. Anything pushed into capturedText after this point
4907
+ // is the post-reply tail (e.g. the substantive answer composed
4908
+ // in terminal text after a soft-commit "on it, back in a few").
4909
+ // decideTurnFlush slices from this index to flush the tail.
4910
+ turn.capturedTextLenAtLastReply = turn.capturedText.length
4737
4911
  if (turn.orphanedReplyTimeoutId != null) {
4738
4912
  clearTimeout(turn.orphanedReplyTimeoutId)
4739
4913
  turn.orphanedReplyTimeoutId = null
@@ -4993,8 +5167,20 @@ function handleSessionEvent(ev: SessionEvent): void {
4993
5167
  chatId: turn.sessionChatId,
4994
5168
  replyCalled: turn.replyCalled,
4995
5169
  capturedText: turn.capturedText,
5170
+ capturedTextLenAtLastReply: turn.capturedTextLenAtLastReply,
4996
5171
  flushEnabled: TURN_FLUSH_SAFETY_ENABLED,
4997
5172
  })
5173
+ // #1291: when the model emitted a soft-commit reply followed by a
5174
+ // substantive terminal-only answer, decideTurnFlush returns
5175
+ // kind:'flush' with the post-reply tail. Log WARN so this case is
5176
+ // auditable — the model SHOULD have called reply for the tail, but
5177
+ // didn't, and the framework is covering for it.
5178
+ if (flushDecision.kind === 'flush' && turn.replyCalled) {
5179
+ process.stderr.write(
5180
+ `telegram gateway: WARN post-reply-tail flush (#1291) — model emitted ${flushDecision.text.length} chars after a prior reply call without a follow-up reply tool` +
5181
+ ` chat=${chatId} turnStartedAt=${turn.startedAt}\n`,
5182
+ )
5183
+ }
4998
5184
  if (flushDecision.kind === 'skip' && flushDecision.reason !== 'reply-called') {
4999
5185
  process.stderr.write(
5000
5186
  `telegram gateway: turn-flush skipped — reason=${flushDecision.reason}\n`,
@@ -5144,6 +5330,21 @@ function handleSessionEvent(ev: SessionEvent): void {
5144
5330
  // backup; reset the preamble buffer (its content is already in
5145
5331
  // the captured `capturedText`, which turn-flush is about to send).
5146
5332
  preambleSuppressor.dropNow()
5333
+ // #1289 fix — drain silence-poke + signal-tracker state for this
5334
+ // turn. The three sibling turn_end exit branches (context-exhaust
5335
+ // at ~5098, silent-marker at ~5097-5098, default reply-called tail
5336
+ // at ~5348-5349) all call signalTracker.clear + silencePoke.endTurn.
5337
+ // The flush-backstop branch was retrofitted in #1067 to null
5338
+ // currentTurn early but never had this cleanup added — leaving the
5339
+ // silence-poke state in the Map, so 300s after the original turn
5340
+ // start the framework fallback fires and the user sees
5341
+ // "still working… (no update from agent in 5 min)" on a turn the
5342
+ // gateway already considers over.
5343
+ {
5344
+ const tKey = statusKey(chatId, threadId)
5345
+ signalTracker.clear(tKey)
5346
+ silencePoke.endTurn(tKey)
5347
+ }
5147
5348
 
5148
5349
  void (async () => {
5149
5350
  await new Promise<void>(resolve => setTimeout(resolve, 500))
@@ -7388,6 +7589,75 @@ async function executeVaultOp(ctx: Context, chatId: string, op: 'list' | 'get' |
7388
7589
  }
7389
7590
  }
7390
7591
 
7592
+ /**
7593
+ * Dispatch a short-running verb (agent_start, agent_stop, cross-agent
7594
+ * agent_restart) through hostd when available, else fall back to the
7595
+ * legacy in-container CLI shell-out.
7596
+ *
7597
+ * Why: on docker-mode hosts the agent container has no docker binary,
7598
+ * so the legacy `runSwitchroomCommand` path silently exits 127 for any
7599
+ * verb that touches compose (RFC C §1, #926). Hostd runs on the host
7600
+ * with the docker socket mounted, so the verb actually works.
7601
+ *
7602
+ * Result handling:
7603
+ * - `not-configured` → fall back to {@link runSwitchroomCommand}.
7604
+ * (Operator opted out; let the legacy path's existing error
7605
+ * surfacing handle the exit-127 case.)
7606
+ * - `completed` → reply with the stdout tail (mirrors the legacy
7607
+ * path's formatted-output reply).
7608
+ * - `started` → reply with a brief "🔄 dispatched" ack. Verbs that
7609
+ * return `started` (agent_restart) finish asynchronously on the
7610
+ * daemon; the audit log is the canonical record.
7611
+ * - `error` / `denied` → reply with the error tail inline. No
7612
+ * fallback (RFC §7 hard-fail contract — operator opted in).
7613
+ */
7614
+ async function dispatchShortVerbViaHostd(
7615
+ ctx: Context,
7616
+ req: HostdRequest,
7617
+ label: string,
7618
+ legacyArgs: string[],
7619
+ ): Promise<void> {
7620
+ const hostdResp = await tryHostdDispatch(getMyAgentName(), req)
7621
+ if (hostdResp === 'not-configured') {
7622
+ warnLegacySpawnIfHostdDisabled(req.op)
7623
+ await runSwitchroomCommand(ctx, legacyArgs, label)
7624
+ return
7625
+ }
7626
+ if (hostdResp.result === 'completed') {
7627
+ const body = hostdResp.stdout_tail?.trim() || `${label}: done (exit ${hostdResp.exit_code})`
7628
+ const formatted = formatSwitchroomOutput(stripAnsi(body))
7629
+ if (formatted) {
7630
+ await switchroomReply(ctx, preBlock(formatted), { html: true })
7631
+ } else {
7632
+ await switchroomReply(ctx, `${label}: done (no output)`)
7633
+ }
7634
+ return
7635
+ }
7636
+ if (hostdResp.result === 'started') {
7637
+ await switchroomReply(
7638
+ ctx,
7639
+ `🔄 <b>${escapeHtmlForTg(label)}</b> dispatched via hostd ` +
7640
+ `(request_id=<code>${escapeHtmlForTg(hostdResp.request_id)}</code>). ` +
7641
+ `Check audit log for completion.`,
7642
+ { html: true },
7643
+ )
7644
+ return
7645
+ }
7646
+ // error / denied — surface inline. RFC §7 hard-fail: no spawn fallback.
7647
+ const errBody =
7648
+ hostdResp.error ??
7649
+ hostdResp.stderr_tail ??
7650
+ hostdResp.stdout_tail ??
7651
+ '(no error tail returned)'
7652
+ await switchroomReply(
7653
+ ctx,
7654
+ `❌ <b>${escapeHtmlForTg(label)} failed via hostd</b> ` +
7655
+ `(result=${escapeHtmlForTg(hostdResp.result)}):\n` +
7656
+ preBlock(stripAnsi(errBody)),
7657
+ { html: true },
7658
+ )
7659
+ }
7660
+
7391
7661
  async function runSwitchroomCommand(ctx: Context, args: string[], label: string): Promise<void> {
7392
7662
  try {
7393
7663
  const output = stripAnsi(switchroomExec(args))
@@ -7620,13 +7890,13 @@ function buildAgentAudit(agentName: string): AgentAudit | undefined {
7620
7890
  }
7621
7891
 
7622
7892
  // Build an AgentMetadata snapshot for the current agent by shelling out
7623
- // to `switchroom agent list --json` and `switchroom auth status --json`.
7624
- // TODO(rfc-h): the `auth status` verb was retired by RFC H. The shell
7625
- // fails silently and `authSummary` lands as null — /status renders
7626
- // without auth detail. Replace with an `auth show --json` adapter that
7627
- // maps the new fleet-broker shape to the per-agent AuthSummary fields.
7893
+ // to `switchroom agent list --json` and `switchroom auth show --json`.
7628
7894
  // Best-effort — any missing piece renders as a placeholder in the text
7629
- // templates rather than blocking the reply.
7895
+ // templates rather than blocking the reply. RFC H retired the per-agent
7896
+ // `auth status --json` shape; auth state is now derived from the
7897
+ // broker's fleet-wide `ListStateData` payload via
7898
+ // `buildAuthSummaryFromBroker`, with billingType pulled from the
7899
+ // agent's `.claude.json` (the broker doesn't track plan tier).
7630
7900
  async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
7631
7901
  type AgentListResp = {
7632
7902
  agents: Array<{
@@ -7636,24 +7906,22 @@ async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
7636
7906
  model?: string | null;
7637
7907
  }>
7638
7908
  }
7639
- type AuthStatusResp = {
7640
- agents: Array<{
7641
- name: string; authenticated: boolean; auth_source: string | null;
7642
- subscription_type: string | null; expires_in: string | null;
7643
- }>
7644
- }
7645
7909
  const list = switchroomExecJson<AgentListResp>(['agent', 'list'])
7646
- const auth = switchroomExecJson<AuthStatusResp>(['auth', 'status'])
7910
+ const brokerState = switchroomExecJson<BrokerStateView>(['auth', 'show'])
7647
7911
  const a = list?.agents?.find(x => x.name === agentName) ?? null
7648
- const au = auth?.agents?.find(x => x.name === agentName) ?? null
7649
- const authSummary: AuthSummary | null = au
7650
- ? {
7651
- authenticated: au.authenticated,
7652
- subscription_type: au.subscription_type,
7653
- expires_in: au.expires_in,
7654
- auth_source: au.auth_source,
7655
- }
7656
- : null
7912
+ let claudeJson: ClaudeJsonView | null = null
7913
+ try {
7914
+ const agentDir = resolveAgentDirFromEnv()
7915
+ if (agentDir) {
7916
+ const raw = readFileSync(join(agentDir, '.claude', '.claude.json'), 'utf8')
7917
+ claudeJson = JSON.parse(raw) as ClaudeJsonView
7918
+ }
7919
+ } catch { /* leave null — billingType becomes null in the summary */ }
7920
+ const authSummary: AuthSummary | null = buildAuthSummaryFromBroker(
7921
+ brokerState,
7922
+ agentName,
7923
+ claudeJson,
7924
+ )
7657
7925
  return {
7658
7926
  agentName,
7659
7927
  model: a?.model ?? null,
@@ -7798,14 +8066,24 @@ bot.command('agentstart', async ctx => {
7798
8066
  if (!isAuthorizedSender(ctx)) return
7799
8067
  const name = ctx.match?.trim() || getMyAgentName()
7800
8068
  try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
7801
- await runSwitchroomCommand(ctx, ['agent', 'start', name], `start ${name}`)
8069
+ await dispatchShortVerbViaHostd(
8070
+ ctx,
8071
+ { v: 1, op: 'agent_start', request_id: hostdRequestId('gw-start'), args: { name } },
8072
+ `start ${name}`,
8073
+ ['agent', 'start', name],
8074
+ )
7802
8075
  })
7803
8076
 
7804
8077
  bot.command('stop', async ctx => {
7805
8078
  if (!isAuthorizedSender(ctx)) return
7806
8079
  const name = ctx.match?.trim() || getMyAgentName()
7807
8080
  try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
7808
- await runSwitchroomCommand(ctx, ['agent', 'stop', name], `stop ${name}`)
8081
+ await dispatchShortVerbViaHostd(
8082
+ ctx,
8083
+ { v: 1, op: 'agent_stop', request_id: hostdRequestId('gw-stop'), args: { name } },
8084
+ `stop ${name}`,
8085
+ ['agent', 'stop', name],
8086
+ )
7809
8087
  })
7810
8088
 
7811
8089
  bot.command('restart', async ctx => {
@@ -7852,6 +8130,7 @@ bot.command('restart', async ctx => {
7852
8130
  args: { name, force: true, reason: 'user: /restart from chat' },
7853
8131
  })
7854
8132
  if (hostdResp === 'not-configured') {
8133
+ warnLegacySpawnIfHostdDisabled('agent_restart')
7855
8134
  spawnSwitchroomDetached(
7856
8135
  ['agent', 'restart', name, '--force'],
7857
8136
  notifyDetachedFailure(chatId, threadId ?? null, `restart ${name}`),
@@ -7874,7 +8153,22 @@ bot.command('restart', async ctx => {
7874
8153
  )
7875
8154
  return
7876
8155
  }
7877
- await runSwitchroomCommand(ctx, ['agent', 'restart', name], `restart ${name}`)
8156
+ // Cross-agent /restart <other>. Same hostd-first shape as self-target,
8157
+ // but no restart marker / no self-kill: another agent's container is
8158
+ // about to bounce, not ours. The daemon spawns the work and returns
8159
+ // "started" (per handleAgentRestart at server.ts:466), so the user
8160
+ // sees a brief dispatch ack and the audit log carries the outcome.
8161
+ await dispatchShortVerbViaHostd(
8162
+ ctx,
8163
+ {
8164
+ v: 1,
8165
+ op: 'agent_restart',
8166
+ request_id: hostdRequestId('gw-restart-cross'),
8167
+ args: { name, force: true, reason: `user: /restart ${name} from chat` },
8168
+ },
8169
+ `restart ${name}`,
8170
+ ['agent', 'restart', name],
8171
+ )
7878
8172
  })
7879
8173
 
7880
8174
  // ─── /new and /reset ──────────────────────────────────────────────────────
@@ -7993,6 +8287,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
7993
8287
  args: { name, force: true, reason: `user: /${kind} from chat` },
7994
8288
  })
7995
8289
  if (hostdResp === 'not-configured') {
8290
+ warnLegacySpawnIfHostdDisabled('agent_restart')
7996
8291
  spawnSwitchroomDetached(
7997
8292
  ['agent', 'restart', name, '--force'],
7998
8293
  notifyDetachedFailure(chatId, threadId ?? null, `${kind} ${name}`),
@@ -8156,23 +8451,83 @@ bot.command('update', async ctx => {
8156
8451
  await sweepBeforeSelfRestart()
8157
8452
  const skipImages = passthrough.includes('--skip-images')
8158
8453
  const rebuild = passthrough.includes('--rebuild')
8454
+ const updateRequestId = hostdRequestId('gw-update')
8159
8455
  const hostdResp = await tryHostdDispatch(getMyAgentName(), {
8160
8456
  v: 1,
8161
8457
  op: 'update_apply',
8162
- request_id: hostdRequestId('gw-update'),
8458
+ request_id: updateRequestId,
8163
8459
  args: {
8164
8460
  ...(skipImages ? { skip_images: true } : {}),
8165
8461
  ...(rebuild ? { rebuild: true } : {}),
8166
8462
  },
8167
8463
  })
8168
8464
  if (hostdResp === 'not-configured') {
8465
+ warnLegacySpawnIfHostdDisabled('update_apply')
8169
8466
  spawnSwitchroomDetached(
8170
8467
  ['update', ...passthrough],
8171
8468
  notifyDetachedFailure(chatId, threadId ?? null, 'update'),
8172
8469
  )
8173
8470
  return
8174
8471
  }
8175
- if (hostdResp.result === 'started' || hostdResp.result === 'completed') {
8472
+ if (hostdResp.result === 'completed') {
8473
+ return
8474
+ }
8475
+ if (hostdResp.result === 'started') {
8476
+ // RFC C §5.3: long-running mutation. Poll get_status until terminal
8477
+ // or until the recreate kills this gateway (whichever happens first).
8478
+ // The success signal is the post-restart greeting card edited into
8479
+ // ackId via the restart marker. The poll is here so that
8480
+ // *fail-before-recreate* (image pull error, scaffold regen crash)
8481
+ // doesn't leave the operator staring at the orphan "🚀 update started"
8482
+ // ack indefinitely. Live repro: PR #1305.
8483
+ void (async () => {
8484
+ // 60s budget: RFC C §5.3 specs `apply` at 30s and `update_apply`
8485
+ // at 60s. Image pulls + scaffold regeneration dominate the wall
8486
+ // clock for update_apply, hence the larger budget. The poll
8487
+ // resolves earlier on any terminal state from the daemon.
8488
+ const terminal = await pollHostdStatus(getMyAgentName(), updateRequestId, {
8489
+ timeoutMs: 60_000,
8490
+ })
8491
+ if (terminal === 'not-configured') return
8492
+ // completed → recreate is about to run / has run; let the post-
8493
+ // restart greeting card handle the success message.
8494
+ if (terminal.result === 'completed') return
8495
+ // Anything else means the daemon's mutation failed before it could
8496
+ // kill us. Edit the ack to surface the tail and clear the marker
8497
+ // so the next gateway boot doesn't render a false success card.
8498
+ clearRestartMarker()
8499
+ const errBody =
8500
+ terminal.error ??
8501
+ terminal.stderr_tail ??
8502
+ terminal.stdout_tail ??
8503
+ '(no error tail returned)'
8504
+ const editedText =
8505
+ `🚀 <b>update started</b> — <b>FAILED</b> via hostd ` +
8506
+ `(result=${escapeHtmlForTg(terminal.result)}):\n` +
8507
+ preBlock(errBody)
8508
+ if (ackId != null) {
8509
+ try {
8510
+ await robustApiCall(
8511
+ () =>
8512
+ lockedBot.api.editMessageText(chatId, ackId!, editedText, {
8513
+ parse_mode: 'HTML',
8514
+ link_preview_options: { is_disabled: true },
8515
+ }),
8516
+ { verb: 'update.poll.editAck' },
8517
+ )
8518
+ } catch {
8519
+ // edit-failed (message deleted, parse error) — fall back to
8520
+ // a fresh reply so the failure isn't silent.
8521
+ try {
8522
+ await switchroomReply(ctx, editedText, { html: true })
8523
+ } catch {}
8524
+ }
8525
+ } else {
8526
+ try {
8527
+ await switchroomReply(ctx, editedText, { html: true })
8528
+ } catch {}
8529
+ }
8530
+ })()
8176
8531
  return
8177
8532
  }
8178
8533
  clearRestartMarker()
@@ -8209,6 +8564,81 @@ bot.command('upgrade', async ctx => {
8209
8564
  )
8210
8565
  })
8211
8566
 
8567
+ // /audit hostd — tail/filter the hostd audit log. Mirrors `/vault audit`
8568
+ // in spirit (operator observability over a privileged subsystem from any
8569
+ // admin DM). Admin-gated via ADMIN_COMMAND_NAMES. Reads the audit JSONL
8570
+ // at ~/.switchroom/host-control-audit.log directly — no hostd RPC needed
8571
+ // because the file is shared via the host bind mount on docker installs.
8572
+ bot.command('audit', async ctx => {
8573
+ if (!isAuthorizedSender(ctx)) return
8574
+ const arg = (ctx.match ?? '').trim()
8575
+ if (arg === '' || arg === 'help' || arg === '--help') {
8576
+ await switchroomReply(
8577
+ ctx,
8578
+ 'Usage: <code>/audit hostd [--tail N] [--agent &lt;name&gt;] [--op &lt;verb&gt;] [--error]</code>',
8579
+ { html: true },
8580
+ )
8581
+ return
8582
+ }
8583
+ const tokens = arg.split(/\s+/)
8584
+ const sub = tokens[0]
8585
+ if (sub !== 'hostd') {
8586
+ await switchroomReply(
8587
+ ctx,
8588
+ `Unknown audit target <code>${escapeHtmlForTg(sub ?? '')}</code>. ` +
8589
+ `Supported: <code>hostd</code>.`,
8590
+ { html: true },
8591
+ )
8592
+ return
8593
+ }
8594
+ // Build the CLI argv for switchroom hostd audit. Validate each
8595
+ // operator-supplied value to keep argv injection out of the picture.
8596
+ const ALLOWED_OPS = new Set([
8597
+ 'agent_start', 'agent_stop', 'agent_restart', 'apply',
8598
+ 'update_check', 'update_apply', 'update_status', 'upgrade_status',
8599
+ 'get_status', 'doctor', 'fleet_state',
8600
+ ])
8601
+ const argv: string[] = ['hostd', 'audit']
8602
+ for (let i = 1; i < tokens.length; i++) {
8603
+ const t = tokens[i]!
8604
+ if (t === '--error') { argv.push('--error'); continue }
8605
+ if (t === '--tail' || t === '--agent' || t === '--op') {
8606
+ const v = tokens[++i]
8607
+ if (v == null) {
8608
+ await switchroomReply(ctx, `Flag <code>${t}</code> requires a value.`, { html: true })
8609
+ return
8610
+ }
8611
+ if (t === '--tail' && !/^[0-9]{1,4}$/.test(v)) {
8612
+ await switchroomReply(ctx, `<code>--tail</code> must be an integer (1-9999).`, { html: true })
8613
+ return
8614
+ }
8615
+ if (t === '--agent' && !/^[a-z][a-z0-9-]{0,62}$/i.test(v)) {
8616
+ await switchroomReply(ctx, `<code>--agent</code> name has an invalid shape.`, { html: true })
8617
+ return
8618
+ }
8619
+ if (t === '--op' && !ALLOWED_OPS.has(v)) {
8620
+ await switchroomReply(
8621
+ ctx,
8622
+ `Unknown hostd verb <code>${escapeHtmlForTg(v)}</code>. ` +
8623
+ `Known: ${[...ALLOWED_OPS].sort().map(o => `<code>${o}</code>`).join(', ')}.`,
8624
+ { html: true },
8625
+ )
8626
+ return
8627
+ }
8628
+ argv.push(t, v)
8629
+ continue
8630
+ }
8631
+ await switchroomReply(
8632
+ ctx,
8633
+ `Unknown flag <code>${escapeHtmlForTg(t)}</code>. ` +
8634
+ `Allowed: <code>--tail</code>, <code>--agent</code>, <code>--op</code>, <code>--error</code>.`,
8635
+ { html: true },
8636
+ )
8637
+ return
8638
+ }
8639
+ await runSwitchroomCommand(ctx, argv, `hostd audit${argv.length > 2 ? ' …' : ''}`)
8640
+ })
8641
+
8212
8642
  // ─── /approve, /deny, /pending ────────────────────────────────────────────
8213
8643
  // Slash-command alternatives to the inline-button approval flow (useful for
8214
8644
  // desktop-only sessions and power-users). Share pendingPermissions state
@@ -8272,6 +8702,59 @@ async function handlePermissionSlash(ctx: Context, behavior: 'allow' | 'deny'):
8272
8702
  bot.command('approve', async ctx => handlePermissionSlash(ctx, 'allow'))
8273
8703
  bot.command('deny', async ctx => handlePermissionSlash(ctx, 'deny'))
8274
8704
 
8705
+ // ─── Drive folder picker (RFC E §4.1) ───────────────────────────────────
8706
+ // /folders — post a Telegram picker card listing this agent's top-level
8707
+ // Drive folders. Tap [Allow] on a folder to grant the agent
8708
+ // allow_always at doc:gdrive:folder/<id>/**; tap [Browse] to drill in.
8709
+ //
8710
+ // Authorisation: same dmCommandGate as the other operator slash
8711
+ // commands — only allowFrom users can post-trigger.
8712
+
8713
+ const folderPickerCache = new FolderListCache()
8714
+
8715
+ function buildFolderPickerDeps(): FolderPickerHandlerDeps {
8716
+ const agentName = getMyAgentName()
8717
+ return {
8718
+ agentName,
8719
+ cache: folderPickerCache,
8720
+ fetchPage: async ({ parent_id, page_token }) => {
8721
+ const handle = await loadFromAuthBroker()
8722
+ if (handle === null) {
8723
+ throw new Error(
8724
+ `auth-broker unreachable for agent ${agentName} — is the broker container running?`,
8725
+ )
8726
+ }
8727
+ return fetchFolderPage({
8728
+ access_token: handle.access_token,
8729
+ ...(parent_id !== undefined ? { parent_id } : {}),
8730
+ ...(page_token !== undefined ? { page_token } : {}),
8731
+ })
8732
+ },
8733
+ approvalRequest: async (args) => {
8734
+ const r = await kernelApprovalRequest({
8735
+ agent_unit: args.agent_unit,
8736
+ scope: args.scope,
8737
+ action: args.action,
8738
+ approver_set: args.approver_set,
8739
+ ...(args.why !== null && args.why !== undefined ? { why: args.why } : {}),
8740
+ ...(args.ttl_ms !== null && args.ttl_ms !== undefined ? { ttl_ms: args.ttl_ms } : {}),
8741
+ })
8742
+ if (r === null || r.state === 'rate_limited') return null
8743
+ return { request_id: r.request_id }
8744
+ },
8745
+ approvalConsume: async (id) => {
8746
+ const r = await kernelApprovalConsume(id)
8747
+ return r !== null && r.consumed
8748
+ },
8749
+ approvalRecord: async (args) => kernelApprovalRecord(args),
8750
+ }
8751
+ }
8752
+
8753
+ bot.command('folders', async ctx => {
8754
+ if (!isAuthorizedSender(ctx)) return
8755
+ await handleFoldersCommand(ctx, buildFolderPickerDeps())
8756
+ })
8757
+
8275
8758
  // /pending — list current pending permission prompts with their ids, so the
8276
8759
  // user can target a specific one via /approve <id> or /deny <id>.
8277
8760
  // Restricted to access.allowFrom DMs to match /approve and /deny — it
@@ -8303,16 +8786,12 @@ bot.command('interrupt', async ctx => {
8303
8786
  await runSwitchroomCommand(ctx, ['agent', 'interrupt', name], `interrupt ${name}`)
8304
8787
  })
8305
8788
 
8306
- // Shared auto-fallback state. `lockout` is a per-process in-memory
8307
- // guard against rapid re-fire between the scheduled poll and any
8308
- // manual trigger (see telegram-plugin/auto-fallback.ts).
8309
- //
8310
- // Pre-#417 fix this was always emptyLockout() at process start, so a
8311
- // gateway restart inside the cooldown window reset the timer and a
8312
- // quota-flap on the recovering slot could re-trigger fallback the
8313
- // moment the gateway came back. We now seed from disk on first use
8314
- // and persist on every transition. Errors are swallowed: losing the
8315
- // lockout file just degrades to in-memory-only behaviour.
8789
+ // Persist-ops bundle for the legacy auto-fallback lockout file. The
8790
+ // only remaining reader is `isAutoFallbackCooldownActive` (line ~2030)
8791
+ // used by the pending-restart drain cap to defer a forced restart
8792
+ // stacking on top of an in-flight slot rotation. The legacy poller
8793
+ // that USED to write this file was retired alongside this refactor;
8794
+ // existing on-disk lockouts age out via DEFAULT_FALLBACK_COOLDOWN_MS.
8316
8795
  const lockoutOps: LockoutPersistOps = {
8317
8796
  readFileSync: (p, enc) => readFileSync(p, enc),
8318
8797
  writeFileSync: (p, data, opts) => writeFileSync(p, data, opts),
@@ -8320,24 +8799,6 @@ const lockoutOps: LockoutPersistOps = {
8320
8799
  mkdirSync: (p, opts) => mkdirSync(p, opts),
8321
8800
  joinPath: (...parts) => join(...parts),
8322
8801
  }
8323
- let autoFallbackLockout: LockoutRecord = emptyLockout()
8324
- let autoFallbackLockoutSeeded = false
8325
- function seedAutoFallbackLockoutIfNeeded(agentDir: string): void {
8326
- if (autoFallbackLockoutSeeded) return
8327
- autoFallbackLockoutSeeded = true
8328
- try {
8329
- autoFallbackLockout = loadLockout(agentDir, lockoutOps)
8330
- } catch (err) {
8331
- process.stderr.write(`telegram gateway: auto-fallback lockout seed failed (using empty): ${(err as Error).message}\n`)
8332
- }
8333
- }
8334
- function persistLockout(agentDir: string): void {
8335
- try {
8336
- saveLockout(agentDir, autoFallbackLockout, lockoutOps)
8337
- } catch (err) {
8338
- process.stderr.write(`telegram gateway: auto-fallback lockout persist failed: ${(err as Error).message}\n`)
8339
- }
8340
- }
8341
8802
 
8342
8803
  // Pinned slot-banner state (#421). One banner per gateway process,
8343
8804
  // in the owner chat (access.allowFrom[0]). Per-topic forum support
@@ -8368,91 +8829,123 @@ async function refreshPinnedBanner(reason: string): Promise<void> {
8368
8829
  }
8369
8830
  }
8370
8831
 
8371
- type AutoFallbackCheckResult =
8372
- | { kind: 'no-action'; reason: string; decision: 'noop' | 'fallback-skipped' }
8373
- | { kind: 'executed'; previousSlot: string; newSlot: string }
8374
- | { kind: 'exhausted-all'; activeSlot: string }
8375
- | { kind: 'error'; message: string }
8832
+ /**
8833
+ * Re-entry guard + dedup window for `fireFleetAutoFallback`. The state
8834
+ * was lifted into `fleet-fallback-gate.ts` so it can be tested in
8835
+ * isolation (gateway.ts module state was unreachable from vitest). The
8836
+ * gate ALSO enforces the broker-reachability honesty contract: when the
8837
+ * broker is down, `wouldFire()` returns false so the model-unavailable
8838
+ * card stays honest instead of advertising a swap that would bail with
8839
+ * `reason=no-broker-client`.
8840
+ */
8841
+ const FLEET_FALLBACK_DEDUP_MS = 30_000
8842
+
8843
+ /** Synchronous reachability check for the auth-broker UDS. Used by the
8844
+ * fleet-fallback gate to keep the model-unavailable card honest: if the
8845
+ * broker socket isn't bound, the dispatcher would bail with
8846
+ * `reason=no-broker-client`, so `wouldFire()` should return false and
8847
+ * the card should fall back to the manual `/auth use <label>` hint. */
8848
+ function isAuthBrokerSocketReachable(): boolean {
8849
+ try {
8850
+ return existsSync(resolveAuthBrokerSocketPath())
8851
+ } catch {
8852
+ return false
8853
+ }
8854
+ }
8855
+
8856
+ const fleetFallbackGate = createFleetFallbackGate({
8857
+ dedupMs: FLEET_FALLBACK_DEDUP_MS,
8858
+ brokerReachable: isAuthBrokerSocketReachable,
8859
+ })
8860
+
8861
+ function wouldFireFleetAutoFallback(): boolean {
8862
+ return fleetFallbackGate.wouldFire()
8863
+ }
8864
+
8865
+ /**
8866
+ * Fleet-wide auto-fallback dispatcher (RFC H follow-up).
8867
+ *
8868
+ * Wired from the model-unavailable card render path so a quota-out
8869
+ * event on ANY agent immediately triggers a fleet-wide swap (via
8870
+ * broker.setActive — same path /auth use takes), not the per-agent
8871
+ * legacy `runAutoFallbackCheck`. Pre-fix, the card path never called
8872
+ * any fallback machinery; the scheduled poller (60-min interval, only
8873
+ * fires on utilization headers) was the only trigger and missed
8874
+ * hard-rejection events.
8875
+ *
8876
+ * Concurrency: collapses concurrent triggers via the in-flight
8877
+ * promise inside `fleetFallbackGate`. Subsequent calls within `FLEET_FALLBACK_DEDUP_MS` of
8878
+ * a recent fire are dropped silently — the broadcast announcement is
8879
+ * the user-visible signal that the swap happened, no need to repeat.
8880
+ *
8881
+ * Fire-and-forget: never throws into the caller's flow. Posts the
8882
+ * causal-shape announcement to every chat in `loadAccess().allowFrom`
8883
+ * so the user sees the outcome inline with the original "Model
8884
+ * unavailable" card.
8885
+ */
8886
+ async function fireFleetAutoFallback(triggerAgent: string): Promise<void> {
8887
+ return fleetFallbackGate.fire(
8888
+ () => doFireFleetAutoFallback(triggerAgent),
8889
+ (err) => {
8890
+ process.stderr.write(
8891
+ `telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
8892
+ )
8893
+ },
8894
+ )
8895
+ }
8376
8896
 
8377
- async function runAutoFallbackCheck(opts: { trigger: 'scheduled' | 'manual' }): Promise<AutoFallbackCheckResult> {
8378
- // All log lines in this path use the `[autofallback]` tag so a single
8379
- // grep against journalctl reconstructs the full decision history of
8380
- // a slot rotation: `journalctl -u switchroom-<agent>-gateway -g autofallback`.
8897
+ /** Returns true iff the dispatcher actually performed a swap (and the
8898
+ * user-visible announcement was broadcast). False on no-op /
8899
+ * error / idempotent-skip — the caller uses this to decide whether to
8900
+ * arm the post-fire suppression window. */
8901
+ async function doFireFleetAutoFallback(triggerAgent: string): Promise<boolean> {
8381
8902
  try {
8382
- const agentDir = resolveAgentDirFromEnv()
8383
- if (!agentDir) {
8384
- return { kind: 'no-action', reason: 'no agent dir', decision: 'noop' }
8385
- }
8386
- const agentName = getMyAgentName()
8387
- seedAutoFallbackLockoutIfNeeded(agentDir)
8388
- const active = currentActiveSlot(agentDir)
8389
- const quota = await fetchQuota({ claudeConfigDir: join(agentDir, '.claude') })
8390
- const decision = evaluateFallbackTrigger({
8391
- quota,
8392
- activeSlot: active,
8393
- now: Date.now(),
8394
- lockout: autoFallbackLockout,
8395
- })
8396
- if (decision.action !== 'fallback') {
8903
+ const client = await getAuthBrokerClient(triggerAgent)
8904
+ if (!client) {
8397
8905
  process.stderr.write(
8398
- `telegram gateway: [autofallback] noop trigger=${opts.trigger} agent=${agentName} active=${active ?? 'none'} reason=${decision.reason}\n`,
8906
+ `telegram gateway: [fleet-fallback] skipped agent=${triggerAgent} reason=no-broker-client\n`,
8399
8907
  )
8400
- return { kind: 'no-action', reason: decision.reason, decision: 'noop' }
8908
+ return false
8401
8909
  }
8402
- process.stderr.write(
8403
- `telegram gateway: [autofallback] decision=fallback trigger=${opts.trigger} agent=${agentName} active=${active ?? 'none'} reason=${decision.triggerReason} util=${decision.utilizationPct?.toFixed(1) ?? 'n/a'}%\n`,
8910
+ const state = await client.listState()
8911
+ // Probe live quota for every account in parallel. force:true
8912
+ // bypasses the 5-min in-process cache — we want the freshest data
8913
+ // for the swap decision, not a cached stale read.
8914
+ const quotas = await Promise.all(
8915
+ state.accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
8404
8916
  )
8405
- const plan = performAutoFallback({
8406
- agentDir,
8407
- agentName,
8408
- decision,
8409
- deps: { currentActiveSlot, markSlotQuotaExhausted, fallbackToNextSlot },
8917
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
8918
+ const outcome = await runFleetAutoFallback({
8919
+ state,
8920
+ quotas,
8921
+ setActive: (label) => client.setActive(label),
8922
+ triggerAgent,
8923
+ tz,
8410
8924
  })
8411
- const ownerChatId = loadAccess().allowFrom[0]
8412
- await dispatchFallbackNotification({
8413
- bot,
8414
- ownerChatId,
8415
- plan,
8416
- onError: (err) => {
8417
- process.stderr.write(`telegram gateway: [autofallback] notify failed trigger=${opts.trigger} agent=${agentName}: ${err}\n`)
8418
- },
8419
- })
8420
- if (plan.kind === 'executed') {
8421
- try { assertSafeAgentName(plan.agentName) }
8422
- catch {
8423
- process.stderr.write(`telegram gateway: [autofallback] invalid-agent-name agent=${plan.agentName}\n`)
8424
- return { kind: 'error', message: `invalid agent name: ${plan.agentName}` }
8425
- }
8426
- try {
8427
- // Preemptive failover (utilization-over-threshold / explicit) waits
8428
- // for the active turn to drain. Reactive failover (429-response)
8429
- // hard-restarts because the request that triggered it has already
8430
- // failed — there's no in-flight turn worth preserving. See #420.
8431
- const restartArgs = ['agent', 'restart', plan.agentName]
8432
- if (plan.triggerReason !== '429-response') {
8433
- restartArgs.push('--graceful-restart')
8434
- }
8435
- process.stderr.write(
8436
- `telegram gateway: [autofallback] executed agent=${plan.agentName} prev=${plan.previousSlot} next=${plan.newSlot} restart=${plan.triggerReason === '429-response' ? 'hard' : 'graceful'}\n`,
8437
- )
8438
- switchroomExec(restartArgs)
8439
- } catch (err) {
8440
- process.stderr.write(`telegram gateway: [autofallback] restart failed agent=${plan.agentName}: ${err}\n`)
8441
- }
8442
- autoFallbackLockout = nextLockout(plan.previousSlot, Date.now())
8443
- persistLockout(agentDir)
8444
- void refreshPinnedBanner('auto-fallback')
8445
- return { kind: 'executed', previousSlot: plan.previousSlot, newSlot: plan.newSlot }
8446
- }
8447
8925
  process.stderr.write(
8448
- `telegram gateway: [autofallback] exhausted-all agent=${agentName} active=${plan.activeSlot}\n`,
8926
+ `telegram gateway: [fleet-fallback] outcome=${outcome.kind} agent=${triggerAgent}` +
8927
+ (outcome.kind === 'switched' ? ` old=${outcome.oldLabel} new=${outcome.newLabel}` : '') +
8928
+ '\n',
8449
8929
  )
8450
- autoFallbackLockout = nextLockout(plan.activeSlot, Date.now())
8451
- persistLockout(agentDir)
8452
- return { kind: 'exhausted-all', activeSlot: plan.activeSlot }
8930
+ // Post the announcement to every authorized chat. Mirrors the
8931
+ // operator-event broadcast pattern (line ~2290) — DM-only opts
8932
+ // (no message_thread_id) so THREAD_NOT_FOUND can't fire here;
8933
+ // wrap in swallowingApiCall anyway per the codebase rule.
8934
+ const access = loadAccess()
8935
+ if (access.allowFrom.length === 0) return outcome.kind === 'switched'
8936
+ const opts = { parse_mode: 'HTML' as const }
8937
+ for (const chat_id of access.allowFrom) {
8938
+ void swallowingApiCall(
8939
+ () => bot.api.sendMessage(chat_id, outcome.announcement, opts),
8940
+ { chat_id, verb: 'fleet-fallback:notify' },
8941
+ )
8942
+ }
8943
+ return outcome.kind === 'switched'
8453
8944
  } catch (err) {
8454
- process.stderr.write(`telegram gateway: [autofallback] ${opts.trigger} poll error: ${err}\n`)
8455
- return { kind: 'error', message: String((err as Error).message ?? err) }
8945
+ process.stderr.write(
8946
+ `telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
8947
+ )
8948
+ return false
8456
8949
  }
8457
8950
  }
8458
8951
 
@@ -8512,15 +9005,6 @@ async function runCreditWatch(): Promise<void> {
8512
9005
  }
8513
9006
  }
8514
9007
 
8515
- // /authfallback was removed in v0.6.12 — it duplicated the work of
8516
- // the dashboard's Switch primary picker (operator-facing surface) and
8517
- // the auto-fallback poller (transparent on-quota-wall case).
8518
- // Operators who want to manually shuffle the active credential now
8519
- // use the picker. The `runAutoFallbackCheck` function and the
8520
- // `case 'fallback':` callback dispatch stay in the codebase: any
8521
- // pinned messages from earlier versions still work, and the
8522
- // auto-fallback poller still calls runAutoFallbackCheck directly.
8523
-
8524
9008
  bot.command("auth", async ctx => {
8525
9009
  if (!isAuthorizedSender(ctx)) return
8526
9010
  const text = ctx.message?.text ?? ""
@@ -8614,8 +9098,46 @@ bot.command("auth", async ctx => {
8614
9098
  isAdmin,
8615
9099
  client,
8616
9100
  chatId,
9101
+ // Format 2 enricher — probe live quota for every account in
9102
+ // parallel so the snapshot reflects current Anthropic-side
9103
+ // utilization, not the broker's potentially-days-stale
9104
+ // disk-cached `quota.json`. force:true bypasses the 5-min
9105
+ // in-process cache for this call. ~500-800ms per account
9106
+ // serial; in parallel ~800ms total for typical 3-account
9107
+ // fleets — acceptable for an interactive command.
9108
+ liveQuotas: async (accounts) =>
9109
+ Promise.all(
9110
+ accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
9111
+ ),
9112
+ tz: process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ,
8617
9113
  })
8618
- await switchroomReply(ctx, reply.text, { html: reply.html })
9114
+ // Translate the handler's optional keyboard shape into grammy's
9115
+ // `reply_markup`. Buttons with `callbackData` become callback_data;
9116
+ // buttons with `insertText` become switch_inline_query_current_chat
9117
+ // (taps paste the slash-command into the user's input). Keep a
9118
+ // safe default for buttons missing both (shouldn't happen).
9119
+ if (reply.keyboard && reply.keyboard.length > 0) {
9120
+ // Build via grammy's InlineKeyboard so the type is correct
9121
+ // for switchroomReply's reply_markup field — no `as never`
9122
+ // cast needed.
9123
+ const kb = new InlineKeyboard()
9124
+ for (let i = 0; i < reply.keyboard.length; i++) {
9125
+ const row = reply.keyboard[i]!
9126
+ for (const b of row) {
9127
+ if (b.callbackData) kb.text(b.text, b.callbackData)
9128
+ else if (b.insertText) kb.switchInlineCurrent(b.text, b.insertText)
9129
+ else kb.text(b.text, 'auth:noop')
9130
+ }
9131
+ // grammy's row terminator — except after the last row.
9132
+ if (i < reply.keyboard.length - 1) kb.row()
9133
+ }
9134
+ await switchroomReply(ctx, reply.text, {
9135
+ html: reply.html,
9136
+ reply_markup: kb,
9137
+ })
9138
+ } else {
9139
+ await switchroomReply(ctx, reply.text, { html: reply.html })
9140
+ }
8619
9141
  })
8620
9142
 
8621
9143
  // Boot-card auth-row loader (issue #708, RFC H rewire). Queries the
@@ -10243,12 +10765,149 @@ async function handleOperatorEventCallback(ctx: Context, data: string): Promise<
10243
10765
  // stub so any stale pinned message that fires an `auth:*` tap is
10244
10766
  // silently dismissed instead of crashing the gateway.
10245
10767
  async function handleAuthDashboardCallback(ctx: Context): Promise<void> {
10768
+ const data = ctx.callbackQuery?.data ?? ''
10769
+ const currentAgent = getMyAgentName()
10770
+
10771
+ // auth:use:<label> — fleet-wide swap via broker.setActive (same path
10772
+ // /auth use takes from chat). Admin-gated via the broker's own
10773
+ // per-agent admin flag.
10774
+ if (data.startsWith('auth:use:')) {
10775
+ const label = data.slice('auth:use:'.length)
10776
+ if (!label) {
10777
+ try { await ctx.answerCallbackQuery({ text: 'Missing account label.', show_alert: false }) } catch { /* */ }
10778
+ return
10779
+ }
10780
+ try {
10781
+ const client = await getAuthBrokerClient(currentAgent)
10782
+ if (!client) {
10783
+ try { await ctx.answerCallbackQuery({ text: 'Broker unreachable.', show_alert: true }) } catch { /* */ }
10784
+ return
10785
+ }
10786
+ const result = await client.setActive(label)
10787
+ try {
10788
+ await ctx.answerCallbackQuery({
10789
+ text: `Switched fleet → ${result.active} (${result.fanned.length} agents)`,
10790
+ show_alert: false,
10791
+ })
10792
+ } catch { /* toast may fail on stale tap */ }
10793
+ // Edit the source message to reflect the new active. Leaving
10794
+ // the old keyboard intact would tempt a double-tap; we replace
10795
+ // the text + drop the keyboard so the user has to /auth again
10796
+ // to see fresh state.
10797
+ const msg = ctx.callbackQuery?.message
10798
+ if (msg) {
10799
+ // Wrap in swallowingApiCall per #1075 — stale callback-source
10800
+ // messages (deleted topic, expired) shouldn't crash the swap.
10801
+ await swallowingApiCall(
10802
+ () =>
10803
+ bot.api.editMessageText(
10804
+ msg.chat.id,
10805
+ msg.message_id,
10806
+ `<b>Active account →</b> <code>${escapeHtmlForTg(result.active)}</code>\n` +
10807
+ `<i>Re-mirrored credentials for ${result.fanned.length} agent${result.fanned.length === 1 ? '' : 's'}.</i>\n\n` +
10808
+ `<i>Tap /auth to see updated quota for the new active account.</i>`,
10809
+ { parse_mode: 'HTML' },
10810
+ ),
10811
+ { chat_id: String(msg.chat.id), verb: 'auth:use:edit' },
10812
+ )
10813
+ }
10814
+ } catch (err) {
10815
+ const msg = (err as Error)?.message ?? String(err)
10816
+ try {
10817
+ await ctx.answerCallbackQuery({
10818
+ text: `Switch failed: ${msg.slice(0, 180)}`,
10819
+ show_alert: true,
10820
+ })
10821
+ } catch { /* */ }
10822
+ }
10823
+ return
10824
+ }
10825
+
10826
+ // auth:refresh — re-render the /auth snapshot in-place with a fresh
10827
+ // live probe. Replaces the message body; keyboard stays.
10828
+ if (data === 'auth:refresh') {
10829
+ // Freshness throttle: each refresh fan-fires N live api.anthropic.com
10830
+ // probes (one per account, force=true bypasses the 5-min cache).
10831
+ // Without this, a user double-tapping the ↻ button burns through
10832
+ // their account's RPM budget on duplicate work. Cap at one per
10833
+ // AUTH_REFRESH_THROTTLE_MS per (chat, message) pair.
10834
+ const refreshMsg = ctx.callbackQuery?.message
10835
+ if (refreshMsg) {
10836
+ const key = `${refreshMsg.chat.id}:${refreshMsg.message_id}`
10837
+ const lastAtMs = lastAuthRefreshAtMs.get(key) ?? 0
10838
+ const sinceLastMs = Date.now() - lastAtMs
10839
+ if (sinceLastMs < AUTH_REFRESH_THROTTLE_MS) {
10840
+ const waitS = Math.ceil((AUTH_REFRESH_THROTTLE_MS - sinceLastMs) / 1000)
10841
+ try {
10842
+ await ctx.answerCallbackQuery({
10843
+ text: `Just refreshed — try again in ${waitS}s`,
10844
+ show_alert: false,
10845
+ })
10846
+ } catch { /* */ }
10847
+ return
10848
+ }
10849
+ lastAuthRefreshAtMs.set(key, Date.now())
10850
+ }
10851
+ try {
10852
+ const client = await getAuthBrokerClient(currentAgent)
10853
+ if (!client) {
10854
+ try { await ctx.answerCallbackQuery({ text: 'Broker unreachable.', show_alert: true }) } catch { /* */ }
10855
+ return
10856
+ }
10857
+ const state = await client.listState()
10858
+ const quotas = await Promise.all(
10859
+ state.accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
10860
+ )
10861
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
10862
+ const { renderAuthSnapshotFormat2, buildSnapshotsFromState, buildSnapshotKeyboard } = await import(
10863
+ '../auth-snapshot-format.js'
10864
+ )
10865
+ const snapshots = buildSnapshotsFromState(state, quotas)
10866
+ const text = renderAuthSnapshotFormat2(snapshots, {
10867
+ tz,
10868
+ now: new Date(),
10869
+ liveProbedAtMs: Date.now(),
10870
+ })
10871
+ const kbRows = buildSnapshotKeyboard(snapshots)
10872
+ const inline_keyboard = kbRows.map((row) =>
10873
+ row.map((b) => {
10874
+ if (b.callbackData) return { text: b.text, callback_data: b.callbackData }
10875
+ if (b.insertText) return { text: b.text, switch_inline_query_current_chat: b.insertText }
10876
+ return { text: b.text, callback_data: 'auth:noop' }
10877
+ }),
10878
+ )
10879
+ const msg = ctx.callbackQuery?.message
10880
+ if (msg) {
10881
+ await swallowingApiCall(
10882
+ () =>
10883
+ bot.api.editMessageText(msg.chat.id, msg.message_id, text, {
10884
+ parse_mode: 'HTML',
10885
+ reply_markup: { inline_keyboard },
10886
+ }),
10887
+ { chat_id: String(msg.chat.id), verb: 'auth:refresh:edit' },
10888
+ )
10889
+ }
10890
+ try { await ctx.answerCallbackQuery({ text: 'Refreshed.', show_alert: false }) } catch { /* */ }
10891
+ } catch (err) {
10892
+ const msg = (err as Error)?.message ?? String(err)
10893
+ try {
10894
+ await ctx.answerCallbackQuery({
10895
+ text: `Refresh failed: ${msg.slice(0, 180)}`,
10896
+ show_alert: true,
10897
+ })
10898
+ } catch { /* */ }
10899
+ }
10900
+ return
10901
+ }
10902
+
10903
+ // Unknown auth:* — likely from a too-old message. Dismiss with a
10904
+ // hint pointing at the canonical re-render verb.
10246
10905
  try {
10247
10906
  await ctx.answerCallbackQuery({
10248
- text: "This button is from the old /auth dashboard (removed in RFC H). Send /auth show instead.",
10907
+ text: 'Unknown auth button. Send /auth for current state.',
10249
10908
  show_alert: false,
10250
10909
  })
10251
- } catch { /* tap from a too-old message — drop */ }
10910
+ } catch { /* */ }
10252
10911
  }
10253
10912
 
10254
10913
  // /reauth was removed in v0.6.13 — the `/auth` dashboard's
@@ -10659,6 +11318,44 @@ bot.command('issues', async ctx => {
10659
11318
 
10660
11319
  bot.command('usage', async ctx => {
10661
11320
  if (!isAuthorizedSender(ctx)) return
11321
+ // Format 2 path: enumerate every account in the broker's known set,
11322
+ // probe live quota in parallel, render the health-grouped snapshot.
11323
+ // Falls back to the legacy single-agent shape when the broker is
11324
+ // unreachable (or lists no accounts, or this path throws), since /usage was historically callable against any
11325
+ // agent regardless of fleet state.
11326
+ const currentAgent = getMyAgentName()
11327
+ try {
11328
+ const client = await getAuthBrokerClient(currentAgent)
11329
+ if (client) {
11330
+ const state = await client.listState()
11331
+ if (state.accounts.length > 0) {
11332
+ const quotas = await Promise.all(
11333
+ state.accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
11334
+ )
11335
+ const { renderAuthSnapshotFormat2, buildSnapshotsFromState } = await import(
11336
+ '../auth-snapshot-format.js'
11337
+ )
11338
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
11339
+ const snapshots = buildSnapshotsFromState(state, quotas)
11340
+ const text = renderAuthSnapshotFormat2(snapshots, {
11341
+ tz,
11342
+ now: new Date(),
11343
+ liveProbedAtMs: Date.now(),
11344
+ })
11345
+ await switchroomReply(ctx, text, { html: true })
11346
+ return
11347
+ }
11348
+ }
11349
+ } catch (err) {
11350
+ process.stderr.write(
11351
+ `telegram gateway: /usage Format 2 path failed agent=${currentAgent}: ${(err as Error)?.message ?? err}\n`,
11352
+ )
11353
+ // fall through to legacy single-agent path
11354
+ }
11355
+
11356
+ // Legacy single-agent path — kept as a graceful fallback when the
11357
+ // broker is unreachable (post-RFC-H rewire boot timing, broken
11358
+ // socket bind, etc.). Same shape /usage shipped with originally.
10662
11359
  const agentDir = resolveAgentDirFromEnv()
10663
11360
  if (!agentDir) {
10664
11361
  await switchroomReply(ctx, '<b>/usage:</b> cannot resolve agent dir.', { html: true })
@@ -10783,6 +11480,29 @@ bot.on('callback_query:data', async ctx => {
10783
11480
  return
10784
11481
  }
10785
11482
 
11483
+ // RFC E §4.1: drvpick:<verb>:<agent>[:<...>] — folder-picker card taps.
11484
+ // open / enter / back / refresh re-render the card in place;
11485
+ // grant writes an allow_always kernel decision at
11486
+ // doc:gdrive:folder/<id>/** and edits the card to a confirmation.
11487
+ //
11488
+ // Auth gate: the picker grant is an OPERATOR action (mirrors the
11489
+ // `op:`/`vd:`/`vg:` family, not the `apv:` agent-approval shape).
11490
+ // As with those families, refuse callbacks from anyone outside
11491
+ // `access.allowFrom`. Without this, a group member who isn't in
11492
+ // the operator allowlist could still tap [✅ Allow "<folder>"] on
11493
+ // a card that landed in the group and write an `allow_always`
11494
+ // decision attributed to themselves.
11495
+ if (data.startsWith('drvpick:')) {
11496
+ const access = loadAccess()
11497
+ const senderId = String(ctx.from?.id ?? '')
11498
+ if (!access.allowFrom.includes(senderId)) {
11499
+ await ctx.answerCallbackQuery({ text: 'Not authorized.' })
11500
+ return
11501
+ }
11502
+ await handleFolderPickerCallback(ctx, data, buildFolderPickerDeps())
11503
+ return
11504
+ }
11505
+
10786
11506
  // op:<action>:<encoded-agent> callbacks from operator-events.ts
10787
11507
  // renderOperatorEvent(). Agent name is URL-encoded at emit (issue #24).
10788
11508
  // Actions: dismiss, restart, reauth, swap-slot, add-slot, logs.
@@ -12723,23 +13443,6 @@ void (async () => {
12723
13443
  }
12724
13444
  } catch {}
12725
13445
 
12726
- // Auto-fallback on quota exhaustion. Periodically polls
12727
- // the active slot's rate-limit headers; when utilization >= 99.5%
12728
- // or a 429 is observed, marks the slot exhausted, swaps to the
12729
- // next healthy slot via src/auth, restarts the agent, and posts
12730
- // a notification to the owner chat. See telegram-plugin/auto-fallback.ts
12731
- // for the pure decision logic + notification builder.
12732
- //
12733
- // Default poll cadence: every 60 minutes. Set
12734
- // SWITCHROOM_AUTO_FALLBACK_POLL_MS=0 to disable the background
12735
- // poller. Pre-v0.6.12 a manual `/authfallback` typed command
12736
- // also ran the same check; that command was removed in favour
12737
- // of the `/auth` dashboard's Switch primary picker.
12738
- const AUTO_FALLBACK_POLL_MS = Number(process.env.SWITCHROOM_AUTO_FALLBACK_POLL_MS ?? 60 * 60_000)
12739
- if (AUTO_FALLBACK_POLL_MS > 0) {
12740
- setInterval(() => { void runAutoFallbackCheck({ trigger: 'scheduled' }) }, AUTO_FALLBACK_POLL_MS).unref()
12741
- }
12742
-
12743
13446
  // Credit-exhaustion watcher (#348). Reads `<agentDir>/.claude/.claude.json`
12744
13447
  // for `cachedExtraUsageDisabledReason`. Fires a Telegram notification
12745
13448
  // on transition into / out of fatal billing states (out_of_credits,