switchroom 0.8.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/README.md +54 -61
  2. package/bin/timezone-hook.sh +9 -7
  3. package/dist/agent-scheduler/index.js +285 -45
  4. package/dist/auth-broker/index.js +13932 -0
  5. package/dist/cli/drive-write-pretool.mjs +5418 -0
  6. package/dist/cli/switchroom.js +8890 -5560
  7. package/dist/host-control/main.js +582 -43
  8. package/dist/vault/approvals/kernel-server.js +276 -47
  9. package/dist/vault/broker/server.js +333 -69
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +6 -4
  16. package/profiles/_base/start.sh.hbs +3 -3
  17. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  18. package/profiles/default/CLAUDE.md +10 -0
  19. package/profiles/default/CLAUDE.md.hbs +16 -0
  20. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  21. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  22. package/skills/buildkite-api/SKILL.md +31 -8
  23. package/skills/buildkite-cli/SKILL.md +27 -9
  24. package/skills/buildkite-migration/SKILL.md +22 -9
  25. package/skills/buildkite-pipelines/SKILL.md +26 -9
  26. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  27. package/skills/buildkite-test-engine/SKILL.md +25 -8
  28. package/skills/docx/SKILL.md +1 -1
  29. package/skills/file-bug/SKILL.md +34 -6
  30. package/skills/humanizer/SKILL.md +15 -0
  31. package/skills/humanizer-calibrate/SKILL.md +7 -1
  32. package/skills/mcp-builder/SKILL.md +1 -1
  33. package/skills/pdf/SKILL.md +1 -1
  34. package/skills/pptx/SKILL.md +1 -1
  35. package/skills/skill-creator/SKILL.md +21 -1
  36. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  37. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  38. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  39. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  40. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  41. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  42. package/skills/switchroom-cli/SKILL.md +63 -64
  43. package/skills/switchroom-health/SKILL.md +23 -10
  44. package/skills/switchroom-install/SKILL.md +3 -3
  45. package/skills/switchroom-manage/SKILL.md +26 -19
  46. package/skills/switchroom-runtime/SKILL.md +67 -15
  47. package/skills/switchroom-status/SKILL.md +26 -1
  48. package/skills/telegram-test-harness/SKILL.md +3 -0
  49. package/skills/webapp-testing/SKILL.md +31 -1
  50. package/skills/xlsx/SKILL.md +1 -1
  51. package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
  52. package/telegram-plugin/admin-commands/index.ts +9 -5
  53. package/telegram-plugin/auth-snapshot-format.ts +612 -0
  54. package/telegram-plugin/auto-fallback-fleet.ts +215 -0
  55. package/telegram-plugin/auto-fallback.ts +28 -301
  56. package/telegram-plugin/dist/gateway/gateway.js +17453 -15100
  57. package/telegram-plugin/fleet-fallback-gate.ts +105 -0
  58. package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
  59. package/telegram-plugin/gateway/approval-callback.ts +31 -3
  60. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  61. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  62. package/telegram-plugin/gateway/auth-command.ts +905 -0
  63. package/telegram-plugin/gateway/auth-line.ts +123 -0
  64. package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
  65. package/telegram-plugin/gateway/boot-card.ts +23 -37
  66. package/telegram-plugin/gateway/boot-probes.ts +9 -12
  67. package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
  68. package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
  69. package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
  70. package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
  71. package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
  72. package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
  73. package/telegram-plugin/gateway/gateway.ts +1156 -938
  74. package/telegram-plugin/gateway/hostd-dispatch.ts +244 -0
  75. package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
  76. package/telegram-plugin/gateway/ipc-server.ts +69 -0
  77. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
  78. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  79. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  80. package/telegram-plugin/model-unavailable.ts +28 -12
  81. package/telegram-plugin/permission-title.ts +56 -0
  82. package/telegram-plugin/quota-check.ts +19 -41
  83. package/telegram-plugin/scripts/build.mjs +0 -1
  84. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  85. package/telegram-plugin/silence-poke.ts +153 -1
  86. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  87. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  88. package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
  89. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  90. package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
  91. package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
  92. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
  93. package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
  94. package/telegram-plugin/tests/boot-probes.test.ts +27 -22
  95. package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
  96. package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
  97. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  98. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  99. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
  100. package/telegram-plugin/tests/silence-poke.test.ts +237 -0
  101. package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
  102. package/telegram-plugin/turn-flush-safety.ts +55 -1
  103. package/telegram-plugin/uat/SETUP.md +35 -1
  104. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  105. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  106. package/telegram-plugin/uat/runners/report.ts +150 -0
  107. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  108. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  109. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  110. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  111. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  112. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
  113. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
  114. package/telegram-plugin/auth-dashboard.ts +0 -1104
  115. package/telegram-plugin/auth-slot-parser.ts +0 -497
  116. package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
  117. package/telegram-plugin/dist/foreman/foreman.js +0 -31358
  118. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  119. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  120. package/telegram-plugin/foreman/foreman.ts +0 -1165
  121. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  122. package/telegram-plugin/foreman/setup-state.ts +0 -239
  123. package/telegram-plugin/foreman/state.ts +0 -203
  124. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  125. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  126. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  127. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  128. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  129. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  130. package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
  131. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  132. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  133. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  134. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  135. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  136. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  137. package/telegram-plugin/tests/setup-state.test.ts +0 -146
@@ -52,6 +52,7 @@ import { OutboundDedupCache } from '../recent-outbound-dedup.js'
52
52
  import { createInboundCoalescer, inboundCoalesceKey } from './inbound-coalesce.js'
53
53
  import { StatusReactionController } from '../status-reactions.js'
54
54
  import { isTelegramReplyTool, isTelegramSurfaceTool } from '../tool-names.js'
55
+ import { toolLabel } from '../tool-labels.js'
55
56
  import { createTypingWrapper } from '../typing-wrap.js'
56
57
  import { type DraftStreamHandle } from '../draft-stream.js'
57
58
  import { handlePtyPartialPure, type PtyHandlerState } from '../pty-partial-handler.js'
@@ -82,32 +83,27 @@ import {
82
83
  import { clearStaleTelegramPollingState } from '../startup-reset.js'
83
84
  import { gatewayStartupRetry } from './startup-network-retry.js'
84
85
  import { writeQuarantineMarker } from './quarantine.js'
86
+ // RFC H §7.3: auth-dashboard + auth-slot-parser deleted. Three chat
87
+ // verbs (/auth show | use | rotate) talk to switchroom-auth-broker
88
+ // via the thin client in src/auth/broker/client.ts.
85
89
  import {
86
- parseAuthSubCommand,
87
- checkRemoveSafety,
88
- formatSlotList,
89
- type SlotListingFromCli,
90
- } from '../auth-slot-parser.js'
90
+ parseAuthCommand,
91
+ handleAuthCommand,
92
+ isAuthAdmin,
93
+ pendingAuthRmFlows,
94
+ } from './auth-command.js'
95
+ import type { AuthBrokerClient } from './auth-command.js'
96
+ import type { ListStateData } from './auth-line.js'
97
+ import { getAuthBrokerClient, addAccountViaBroker } from './auth-broker-client.js'
98
+ import { resolveAuthBrokerSocketPath } from '../../src/auth/broker/client.js'
99
+ import { createFleetFallbackGate } from '../fleet-fallback-gate.js'
91
100
  import {
92
- buildDashboard,
93
- buildRemoveConfirmKeyboard,
94
- buildAccountConfirmKeyboard,
95
- buildAccountPromoteConfirmKeyboard,
96
- buildSwitchPrimaryKeyboard,
97
- buildAccountSubViewText,
98
- buildAccountSubViewKeyboard,
99
- buildAccountRemoveConfirmKeyboard,
100
- parseCallbackData,
101
- encodeCallbackData,
102
- isQuotaHot,
103
- isAccountQuotaHot,
104
- ACCOUNTS_DISPLAY_CAP,
105
- type DashboardState,
106
- type DashboardSlot,
107
- type SlotHealth,
108
- type AccountSummary,
109
- type AccountHealth,
110
- } from '../auth-dashboard.js'
101
+ pendingAuthAddFlows,
102
+ startAccountAuthSession,
103
+ submitAccountAuthCode,
104
+ cancelAccountAuthSession,
105
+ cleanScratchDir as cleanAuthAddScratchDir,
106
+ } from './auth-add-flow.js'
111
107
  import {
112
108
  initHistory, recordInbound, recordOutbound, recordEdit,
113
109
  deleteFromHistory, query as queryHistory, getLatestInboundMessageId,
@@ -131,6 +127,8 @@ import {
131
127
  formatModelUnavailableCard,
132
128
  resolveModelUnavailableFromOperatorEvent,
133
129
  } from '../model-unavailable.js'
130
+ import { runFleetAutoFallback } from '../auto-fallback-fleet.js'
131
+ import { fetchAccountQuota } from '../quota-check.js'
134
132
  import { startRestartWatchdog } from './restart-watchdog.js'
135
133
  import { validateStringArray } from './access-validator.js'
136
134
 
@@ -167,6 +165,11 @@ import {
167
165
  TELEGRAM_SWITCHROOM_COMMANDS,
168
166
  type AgentMetadata, type AuthSummary, type StatusProbeRow,
169
167
  } from '../welcome-text.js'
168
+ import {
169
+ type BrokerStateView,
170
+ type ClaudeJsonView,
171
+ buildAuthSummaryFromBroker,
172
+ } from './auth-status-adapter.js'
170
173
  import {
171
174
  isContextExhaustionText,
172
175
  shouldArmOrphanedReplyTimeout,
@@ -193,37 +196,52 @@ import {
193
196
  import { sweepActiveReactions } from '../active-reactions-sweep.js'
194
197
  import { flushOnAgentDisconnect } from './disconnect-flush.js'
195
198
  import { PreambleSuppressor } from './preamble-suppressor.js'
199
+ import {
200
+ fetchFolderPage,
201
+ FolderListCache,
202
+ } from '../../src/drive/folder-list.js'
203
+ import { loadFromAuthBroker } from '../../src/drive/wrapper-broker.js'
204
+ import {
205
+ handleFoldersCommand,
206
+ handleFolderPickerCallback,
207
+ type FolderPickerHandlerDeps,
208
+ } from './folder-picker-handler.js'
209
+ import {
210
+ approvalConsume as kernelApprovalConsume,
211
+ approvalRecord as kernelApprovalRecord,
212
+ approvalRequest as kernelApprovalRequest,
213
+ } from '../../src/vault/approvals/client.js'
196
214
  import {
197
215
  fetchQuota,
198
216
  formatQuotaBlock,
199
- getCachedAccountQuota,
200
- prefetchAccountQuotaIfStale,
201
- hydrateAccountQuotaCacheFromDisk,
202
- clearAccountQuotaCache,
203
217
  } from '../quota-check.js'
204
218
  import {
205
- evaluateFallbackTrigger,
206
- performAutoFallback,
207
- emptyLockout,
208
219
  loadLockout,
209
- nextLockout,
210
- saveLockout,
211
220
  DEFAULT_FALLBACK_COOLDOWN_MS,
212
- type LockoutRecord,
213
221
  type LockoutPersistOps,
214
222
  } from '../auto-fallback.js'
215
- import { markSlotQuotaExhausted, DEFAULT_SLOT } from '../../src/auth/accounts.js'
216
- import { fallbackToNextSlot, currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
223
+ import { DEFAULT_SLOT } from '../../src/auth/accounts.js'
224
+ import { currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
217
225
  import { injectSlashCommand as injectSlashCommandImpl } from '../../src/agents/inject.js'
218
226
  import { handleInjectCommand } from './inject-handler.js'
219
227
  import { type BannerState } from '../slot-banner.js'
220
228
  import { refreshBanner } from '../slot-banner-driver.js'
221
- import { dispatchFallbackNotification } from '../auto-fallback-dispatcher.js'
222
229
  import { loadConfig as loadSwitchroomConfig } from '../../src/config/loader.js'; import { resolveAgentConfig } from '../../src/config/merge.js'
230
+ import {
231
+ tryHostdDispatch,
232
+ hostdRequestId,
233
+ hostdWillBeUsed,
234
+ pollHostdStatus,
235
+ warnLegacySpawnIfHostdDisabled,
236
+ _resetHostdEnabledCache,
237
+ } from './hostd-dispatch.js'
238
+ import type { HostdRequest } from '../../src/host-control/protocol.js'
223
239
  import type { AgentAudit } from '../welcome-text.js'
224
240
  import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
225
241
 
226
242
  import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
243
+ import { handleRequestDriveApproval } from './drive-write-approval.js'
244
+ import { buildDiffPreviewCard } from './diff-preview-card.js'
227
245
  import { createPendingInboundBuffer } from './pending-inbound-buffer.js'
228
246
  import {
229
247
  buildVaultGrantApprovedInbound,
@@ -373,9 +391,14 @@ const INBOX_DIR = join(STATE_DIR, 'inbox')
373
391
  * gateway plugin (we're a child of claude inside the same container).
374
392
  * `targetAgent` is informational only here — we can't restart a
375
393
  * different agent's container from inside our own (no docker.sock).
376
- * - else (legacy systemd): detached `systemctl --user restart` of the
377
- * two units. The detach is required so the systemctl job survives
378
- * us being SIGTERM'd by systemd itself.
394
+ * - else (v0.6 legacy non-docker path, scheduled for removal in
395
+ * Phase 3 of the host-control daemon rollout — see
396
+ * `docs/rfcs/host-control-daemon.md`): detached `systemctl --user
397
+ * restart` of the two units. This branch is never reached on
398
+ * v0.7+ docker installs (the `isDocker` guard above takes the
399
+ * docker branch); only callable on legacy systemd hosts that
400
+ * ran the gateway as a user unit. Don't add new dependencies
401
+ * on this path.
379
402
  *
380
403
  * `targetAgent` defaults to `SWITCHROOM_AGENT_NAME`; pass a different
381
404
  * value only for the inline restart-button callback handler. Under
@@ -1086,6 +1109,14 @@ type CurrentTurn = {
1086
1109
  gatewayReceiveAt: number
1087
1110
  replyCalled: boolean
1088
1111
  capturedText: string[]
1112
+ // #1291: snapshot of capturedText.length at the moment of the most
1113
+ // recent reply / stream_reply tool call. Used by decideTurnFlush to
1114
+ // isolate the post-reply tail (e.g. a soft-commit reply followed by
1115
+ // the real substantive answer in terminal text only) and flush it as
1116
+ // a follow-up message. Pre-#1291 the existence of ANY reply call
1117
+ // suppressed flush entirely — that lost long terminal-only answers
1118
+ // after a "let me check" interim reply.
1119
+ capturedTextLenAtLastReply: number
1089
1120
  orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
1090
1121
  registryKey: string | null
1091
1122
  // Last assistant outbound message id for the current turn — populated
@@ -1974,6 +2005,13 @@ const awaitingAuthCodeAt = new Map<string, number>()
1974
2005
  const AUTH_CODE_CONTEXT_TTL_MS = 5 * 60_000 // 5 min — OAuth code lifetime
1975
2006
  const DEFERRED_SECRET_TTL_MS = 24 * 60 * 60_000 // 24 h — ignored one-tap cards
1976
2007
 
2008
+ // Freshness throttle for `auth:refresh` taps. Keyed by `<chat_id>:<message_id>`
2009
+ // so two different snapshot messages throttle independently. Each refresh
2010
+ // fans out N live api.anthropic.com probes (one per account), so we cap
2011
+ // rapid re-taps to one per AUTH_REFRESH_THROTTLE_MS.
2012
+ const lastAuthRefreshAtMs = new Map<string, number>()
2013
+ const AUTH_REFRESH_THROTTLE_MS = 5_000
2014
+
1977
2015
  // ─── TTL reaper ───────────────────────────────────────────────────────────
1978
2016
  // Pending state maps above all grow whenever a flow starts and only shrink
1979
2017
  // when the flow completes. Users abandoning a flow (closing Telegram, losing
@@ -2024,9 +2062,29 @@ function isAutoFallbackCooldownActive(_agentName: string, now: number): boolean
2024
2062
  // 60-second sweep drops anything past its documented TTL.
2025
2063
  const pendingStateReaper = setInterval(() => {
2026
2064
  const now = Date.now()
2065
+ // OAuth-code state grouped first (pinned by secret-detect-oauth-code.test.ts).
2027
2066
  for (const [k, v] of pendingReauthFlows) {
2028
2067
  if (now - v.startedAt > REAUTH_INTERCEPT_TTL_MS) pendingReauthFlows.delete(k)
2029
2068
  }
2069
+ for (const [k, v] of pendingAuthAddFlows) {
2070
+ if (now - v.startedAt > REAUTH_INTERCEPT_TTL_MS) {
2071
+ cancelAccountAuthSession(v)
2072
+ pendingAuthAddFlows.delete(k)
2073
+ }
2074
+ }
2075
+ for (const [k, v] of awaitingAuthCodeAt) {
2076
+ if (now - v > AUTH_CODE_CONTEXT_TTL_MS) awaitingAuthCodeAt.delete(k)
2077
+ }
2078
+ // Auth-refresh throttle entries decay quickly (5s window); sweep
2079
+ // anything older than 60s so abandoned snapshot messages don't pin
2080
+ // their key forever.
2081
+ for (const [k, v] of lastAuthRefreshAtMs) {
2082
+ if (now - v > 60_000) lastAuthRefreshAtMs.delete(k)
2083
+ }
2084
+ // /auth rm two-step confirm window — self-expires at `expiresAt`.
2085
+ for (const [k, v] of pendingAuthRmFlows) {
2086
+ if (now >= v.expiresAt) pendingAuthRmFlows.delete(k)
2087
+ }
2030
2088
  for (const [k, v] of pendingVaultOps) {
2031
2089
  if (now - v.startedAt > VAULT_INPUT_TTL_MS) pendingVaultOps.delete(k)
2032
2090
  }
@@ -2036,9 +2094,6 @@ const pendingStateReaper = setInterval(() => {
2036
2094
  for (const [k, v] of vaultPassphraseCache) {
2037
2095
  if (now > v.expiresAt) vaultPassphraseCache.delete(k)
2038
2096
  }
2039
- for (const [k, v] of awaitingAuthCodeAt) {
2040
- if (now - v > AUTH_CODE_CONTEXT_TTL_MS) awaitingAuthCodeAt.delete(k)
2041
- }
2042
2097
  for (const [k, v] of deferredSecrets) {
2043
2098
  if (now - v.staged_at > DEFERRED_SECRET_TTL_MS) deferredSecrets.delete(k)
2044
2099
  }
@@ -2230,11 +2285,33 @@ function emitGatewayOperatorEvent(event: OperatorEvent): void {
2230
2285
  let renderedText: string
2231
2286
  let renderedKeyboard: ReturnType<typeof renderOperatorEvent>['keyboard'] | undefined
2232
2287
  if (modelUnavailable) {
2288
+ // Two questions, asked synchronously to avoid the "card promises
2289
+ // an announcement that never arrives" trap:
2290
+ // 1. Is this a kind that AUTO-fallback can address?
2291
+ // 2. Will the dispatcher actually fire (vs. dedup-drop)?
2292
+ // Card text branches on the AND. wouldFireFleetAutoFallback is a
2293
+ // pure read of the dedup state; calling fireFleetAutoFallback only
2294
+ // when both are true keeps the card honest.
2295
+ const isAutoKind =
2296
+ modelUnavailable.kind === 'quota_exhausted' || modelUnavailable.kind === 'overload'
2297
+ const willActuallyFire = isAutoKind && wouldFireFleetAutoFallback()
2233
2298
  process.stderr.write(
2234
- `telegram gateway: operator-event suppressing-raw-stderr-for-model-unavailable agent=${agent} kind=${kind} detected=${modelUnavailable.kind}\n`,
2299
+ `telegram gateway: operator-event suppressing-raw-stderr-for-model-unavailable agent=${agent} kind=${kind} detected=${modelUnavailable.kind} autoKind=${isAutoKind} willFire=${willActuallyFire}\n`,
2235
2300
  )
2236
- renderedText = formatModelUnavailableCard(modelUnavailable, agent)
2301
+ renderedText = formatModelUnavailableCard(modelUnavailable, agent, {
2302
+ autoFallbackInFlight: willActuallyFire,
2303
+ })
2237
2304
  renderedKeyboard = undefined
2305
+ // Trigger fleet-wide auto-fallback. Pre-fix this branch only
2306
+ // rendered the card; the fallback machinery was unreachable from
2307
+ // here. We fire-and-forget so card delivery is never blocked on
2308
+ // broker / API latency. The fallback's own announcement is sent
2309
+ // separately with the causal-shape headline ("5-hour limit on
2310
+ // ken" instead of generic "quota exhausted") — see
2311
+ // auth-snapshot-format.ts → renderFallbackAnnouncement.
2312
+ if (willActuallyFire) {
2313
+ void fireFleetAutoFallback(agent)
2314
+ }
2238
2315
  } else {
2239
2316
  try {
2240
2317
  const r = renderOperatorEvent(event)
@@ -2502,6 +2579,7 @@ silencePoke.startTimer({
2502
2579
  const text = silencePoke.formatFrameworkFallbackText(
2503
2580
  ctx.fallbackKind,
2504
2581
  ctx.silenceMs,
2582
+ ctx.inFlightTools,
2505
2583
  )
2506
2584
  try {
2507
2585
  await robustApiCall(
@@ -2809,9 +2887,46 @@ const ipcServer: IpcServer = createIpcServer({
2809
2887
  const key = statusKey(currentTurn.sessionChatId, currentTurn.sessionThreadId)
2810
2888
  if (ev.kind === 'thinking') {
2811
2889
  silencePoke.noteThinking(key, Date.now())
2812
- } else if (ev.kind === 'tool_use' && (ev.toolName === 'Task' || ev.toolName === 'Agent')) {
2813
- // Built-in claude sub-agent dispatch extends soft threshold to 5min.
2814
- silencePoke.noteSubagentDispatch(key)
2890
+ } else if (ev.kind === 'tool_use') {
2891
+ if (ev.toolName === 'Task' || ev.toolName === 'Agent') {
2892
+ // Built-in claude sub-agent dispatch — extends soft threshold to 5min.
2893
+ silencePoke.noteSubagentDispatch(key)
2894
+ }
2895
+ // #1292: track in-flight tool calls so the 300s framework
2896
+ // fallback message can name the actual observable (e.g.
2897
+ // "running Grep \"foo\" for 4m") instead of the dishonest
2898
+ // generic "still working… no update in 5 min" when the agent
2899
+ // is clearly busy on tool calls. Telegram-surface tools are
2900
+ // excluded — their job IS the outbound message, the silence
2901
+ // clock resets via noteOutbound when they fire. Sub-agent
2902
+ // tool_use events (kind='sub_agent_tool_use') intentionally
2903
+ // NOT tracked: the parent's Task tool_use is already on the
2904
+ // map and represents the user-observable wait.
2905
+ if (
2906
+ ev.toolUseId != null
2907
+ && ev.toolUseId.length > 0
2908
+ && !isTelegramSurfaceTool(ev.toolName)
2909
+ ) {
2910
+ const label = toolLabel(
2911
+ ev.toolName,
2912
+ ev.input,
2913
+ /*preamble*/ undefined,
2914
+ ev.precomputedLabel,
2915
+ )
2916
+ silencePoke.noteToolStart(
2917
+ key,
2918
+ ev.toolUseId,
2919
+ ev.toolName,
2920
+ label.length > 0 ? label : null,
2921
+ Date.now(),
2922
+ )
2923
+ }
2924
+ } else if (ev.kind === 'tool_result') {
2925
+ // #1292: drain the in-flight entry. Idempotent on unknown ids
2926
+ // (covers Telegram-surface tools we skipped at start time).
2927
+ if (ev.toolUseId != null && ev.toolUseId.length > 0) {
2928
+ silencePoke.noteToolEnd(key, ev.toolUseId, Date.now())
2929
+ }
2815
2930
  }
2816
2931
  }
2817
2932
  },
@@ -2948,6 +3063,69 @@ const ipcServer: IpcServer = createIpcServer({
2948
3063
  * Logs every fire so an operator can correlate the agent's
2949
3064
  * transcript turn against the scheduler's audit row by `prompt_key`.
2950
3065
  */
3066
+ async onRequestDriveApproval(client: IpcClient, msg) {
3067
+ // RFC E §4.2 Cut 2 — Drive-write PreToolUse hook is asking the
3068
+ // gateway to post a diff-preview card so the user can decide.
3069
+ await handleRequestDriveApproval(client, msg, {
3070
+ agentName: getMyAgentName(),
3071
+ loadAllowFrom: () => loadAccess().allowFrom,
3072
+ loadTargetChat: () => {
3073
+ const access = loadAccess()
3074
+ const operator = access.allowFrom[0]
3075
+ if (operator === undefined) return null
3076
+ // For DM-paired setups the target chat IS the operator's
3077
+ // user id. For group setups the gateway already has a topic
3078
+ // routing surface (see how /folders posts) — this picks the
3079
+ // DM path which is the common case; group-routing follow-up
3080
+ // can extend this.
3081
+ return { chatId: operator }
3082
+ },
3083
+ registerApproval: async (args) => {
3084
+ const r = await kernelApprovalRequest({
3085
+ agent_unit: args.agent_unit,
3086
+ scope: args.scope,
3087
+ action: args.action,
3088
+ approver_set: args.approver_set,
3089
+ why: args.why,
3090
+ ttl_ms: args.ttl_ms,
3091
+ })
3092
+ if (r === null || r.state === 'rate_limited') return null
3093
+ return {
3094
+ request_id: r.request_id,
3095
+ expires_at_ms: r.expires_at,
3096
+ }
3097
+ },
3098
+ postCard: async (args) => {
3099
+ try {
3100
+ const sent = await robustApiCall(
3101
+ () =>
3102
+ bot.api.sendMessage(args.chatId, args.text, {
3103
+ parse_mode: 'HTML',
3104
+ ...(args.threadId !== undefined
3105
+ ? { message_thread_id: args.threadId }
3106
+ : {}),
3107
+ reply_markup: args.replyMarkup as never,
3108
+ }),
3109
+ {
3110
+ chat_id: String(args.chatId),
3111
+ verb: 'drive-approval-card',
3112
+ ...(args.threadId !== undefined ? { threadId: args.threadId } : {}),
3113
+ },
3114
+ )
3115
+ return { messageId: (sent as { message_id: number }).message_id }
3116
+ } catch (err) {
3117
+ process.stderr.write(
3118
+ `telegram gateway: drive-approval postCard failed: ${(err as Error).message}\n`,
3119
+ )
3120
+ return null
3121
+ }
3122
+ },
3123
+ buildCard: ({ preview, suggestRequestId }) =>
3124
+ buildDiffPreviewCard({ preview, suggestRequestId }),
3125
+ log: (m) => process.stderr.write(`telegram gateway: drive-approval — ${m}\n`),
3126
+ })
3127
+ },
3128
+
2951
3129
  onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
2952
3130
  const promptKey = typeof msg.inbound.meta?.prompt_key === 'string'
2953
3131
  ? msg.inbound.meta.prompt_key
@@ -4627,6 +4805,7 @@ function handleSessionEvent(ev: SessionEvent): void {
4627
4805
  gatewayReceiveAt: startedAt,
4628
4806
  replyCalled: false,
4629
4807
  capturedText: [],
4808
+ capturedTextLenAtLastReply: 0,
4630
4809
  orphanedReplyTimeoutId: null,
4631
4810
  registryKey: null,
4632
4811
  lastAssistantMsgId: null,
@@ -4723,6 +4902,12 @@ function handleSessionEvent(ev: SessionEvent): void {
4723
4902
  // placeholder-heartbeat label, which has been retired.
4724
4903
  if (isTelegramReplyTool(name)) {
4725
4904
  turn.replyCalled = true
4905
+ // #1291: pin the captured-text index at the moment of this reply
4906
+ // tool call. Anything pushed into capturedText after this point
4907
+ // is the post-reply tail (e.g. the substantive answer composed
4908
+ // in terminal text after a soft-commit "on it, back in a few").
4909
+ // decideTurnFlush slices from this index to flush the tail.
4910
+ turn.capturedTextLenAtLastReply = turn.capturedText.length
4726
4911
  if (turn.orphanedReplyTimeoutId != null) {
4727
4912
  clearTimeout(turn.orphanedReplyTimeoutId)
4728
4913
  turn.orphanedReplyTimeoutId = null
@@ -4982,8 +5167,20 @@ function handleSessionEvent(ev: SessionEvent): void {
4982
5167
  chatId: turn.sessionChatId,
4983
5168
  replyCalled: turn.replyCalled,
4984
5169
  capturedText: turn.capturedText,
5170
+ capturedTextLenAtLastReply: turn.capturedTextLenAtLastReply,
4985
5171
  flushEnabled: TURN_FLUSH_SAFETY_ENABLED,
4986
5172
  })
5173
+ // #1291: when the model emitted a soft-commit reply followed by a
5174
+ // substantive terminal-only answer, decideTurnFlush returns
5175
+ // kind:'flush' with the post-reply tail. Log WARN so this case is
5176
+ // auditable — the model SHOULD have called reply for the tail, but
5177
+ // didn't, and the framework is covering for it.
5178
+ if (flushDecision.kind === 'flush' && turn.replyCalled) {
5179
+ process.stderr.write(
5180
+ `telegram gateway: WARN post-reply-tail flush (#1291) — model emitted ${flushDecision.text.length} chars after a prior reply call without a follow-up reply tool` +
5181
+ ` chat=${chatId} turnStartedAt=${turn.startedAt}\n`,
5182
+ )
5183
+ }
4987
5184
  if (flushDecision.kind === 'skip' && flushDecision.reason !== 'reply-called') {
4988
5185
  process.stderr.write(
4989
5186
  `telegram gateway: turn-flush skipped — reason=${flushDecision.reason}\n`,
@@ -5133,6 +5330,21 @@ function handleSessionEvent(ev: SessionEvent): void {
5133
5330
  // backup; reset the preamble buffer (its content is already in
5134
5331
  // the captured `capturedText`, which turn-flush is about to send).
5135
5332
  preambleSuppressor.dropNow()
5333
+ // #1289 fix — drain silence-poke + signal-tracker state for this
5334
+ // turn. The three sibling turn_end exit branches (context-exhaust
5335
+ // at ~5098, silent-marker at ~5097-5098, default reply-called tail
5336
+ // at ~5348-5349) all call signalTracker.clear + silencePoke.endTurn.
5337
+ // The flush-backstop branch was retrofitted in #1067 to null
5338
+ // currentTurn early but never had this cleanup added — leaving the
5339
+ // silence-poke state in the Map, so 300s after the original turn
5340
+ // start the framework fallback fires and the user sees
5341
+ // "still working… (no update from agent in 5 min)" on a turn the
5342
+ // gateway already considers over.
5343
+ {
5344
+ const tKey = statusKey(chatId, threadId)
5345
+ signalTracker.clear(tKey)
5346
+ silencePoke.endTurn(tKey)
5347
+ }
5136
5348
 
5137
5349
  void (async () => {
5138
5350
  await new Promise<void>(resolve => setTimeout(resolve, 500))
@@ -5942,6 +6154,60 @@ async function handleInbound(
5942
6154
  return
5943
6155
  }
5944
6156
 
6157
+ // `/auth add` paste-back intercept — sibling to pendingReauthFlows.
6158
+ // Both intercepts are deliberate so the LLM never sees the OAuth
6159
+ // code (it doesn't need to, and plaintext OAuth in chat history is bad
6160
+ // hygiene). The add-flow intercept comes first because /auth add
6161
+ // creates fresh credentials at the broker layer, vs /reauth which
6162
+ // mutates an existing agent's slot — different success paths.
6163
+ const pendingAdd = pendingAuthAddFlows.get(chat_id)
6164
+ if (pendingAdd && looksLikeAuthCode(text)) {
6165
+ const elapsed = Date.now() - pendingAdd.startedAt
6166
+ if (elapsed < REAUTH_INTERCEPT_TTL_MS) {
6167
+ pendingAuthAddFlows.delete(chat_id)
6168
+ try {
6169
+ const credentials = await submitAccountAuthCode(pendingAdd, text.trim())
6170
+ try {
6171
+ await addAccountViaBroker(pendingAdd.label, credentials, { replace: false })
6172
+ // success — wipe scratch dir now that the broker owns the creds
6173
+ cleanAuthAddScratchDir(pendingAdd.scratchDir)
6174
+ await switchroomReply(
6175
+ ctx,
6176
+ `✓ Account <code>${escapeHtmlForTg(pendingAdd.label)}</code> added.\n` +
6177
+ `The fleet's active account hasn't changed. Send ` +
6178
+ `<code>/auth use ${escapeHtmlForTg(pendingAdd.label)}</code> to switch to it.`,
6179
+ { html: true },
6180
+ )
6181
+ } catch (brokerErr) {
6182
+ // Broker rejected (e.g. label already exists). Wipe scratch
6183
+ // either way — the credentials are useless without broker
6184
+ // bookkeeping.
6185
+ cleanAuthAddScratchDir(pendingAdd.scratchDir)
6186
+ await switchroomReply(
6187
+ ctx,
6188
+ `<b>/auth add failed at broker:</b> ${escapeHtmlForTg((brokerErr as Error)?.message ?? String(brokerErr))}`,
6189
+ { html: true },
6190
+ )
6191
+ }
6192
+ } catch (err) {
6193
+ // submitAccountAuthCode wiped the scratch dir on its own
6194
+ // failure paths (timeout, child exit, stdin broken).
6195
+ await switchroomReply(
6196
+ ctx,
6197
+ `<b>/auth add code failed:</b> ${escapeHtmlForTg((err as Error)?.message ?? String(err))}`,
6198
+ { html: true },
6199
+ )
6200
+ }
6201
+ // Redact the OAuth code paste from chat history (#488).
6202
+ redactAuthCodeMessage(bot.api as never, chat_id, msgId ?? null, line => process.stderr.write(line))
6203
+ return
6204
+ }
6205
+ // Stale — drop the pending entry but let the message fall through
6206
+ // to other intercepts (defensively wipe scratch).
6207
+ cancelAccountAuthSession(pendingAdd)
6208
+ pendingAuthAddFlows.delete(chat_id)
6209
+ }
6210
+
5945
6211
  // Auth-code intercept
5946
6212
  const pendingReauth = pendingReauthFlows.get(chat_id)
5947
6213
  if (pendingReauth && looksLikeAuthCode(text)) {
@@ -6982,6 +7248,11 @@ export function _resetDockerReachableCache(): void {
6982
7248
  _dockerReachable = undefined
6983
7249
  }
6984
7250
 
7251
+ // hostd dispatch lives in `hostd-dispatch.ts` (extracted for testability).
7252
+ // Re-export the cache-reset so existing test patterns that reach into
7253
+ // gateway.ts for `_resetDockerReachableCache` find a parallel hook.
7254
+ export { _resetHostdEnabledCache }
7255
+
6985
7256
  function spawnSwitchroomDetached(
6986
7257
  args: string[],
6987
7258
  onFailure?: (info: { code: number; tail: string }) => void,
@@ -7318,6 +7589,75 @@ async function executeVaultOp(ctx: Context, chatId: string, op: 'list' | 'get' |
7318
7589
  }
7319
7590
  }
7320
7591
 
7592
+ /**
7593
+ * Dispatch a short-running verb (agent_start, agent_stop, cross-agent
7594
+ * agent_restart) through hostd when available, else fall back to the
7595
+ * legacy in-container CLI shell-out.
7596
+ *
7597
+ * Why: on docker-mode hosts the agent container has no docker binary,
7598
+ * so the legacy `runSwitchroomCommand` path silently exits 127 for any
7599
+ * verb that touches compose (RFC C §1, #926). Hostd runs on the host
7600
+ * with the docker socket mounted, so the verb actually works.
7601
+ *
7602
+ * Result handling:
7603
+ * - `not-configured` → fall back to {@link runSwitchroomCommand}.
7604
+ * (Operator opted out; let the legacy path's existing error
7605
+ * surfacing handle the exit-127 case.)
7606
+ * - `completed` → reply with the stdout tail (mirrors the legacy
7607
+ * path's formatted-output reply).
7608
+ * - `started` → reply with a brief "🔄 dispatched" ack. Verbs that
7609
+ * return `started` (agent_restart) finish asynchronously on the
7610
+ * daemon; the audit log is the canonical record.
7611
+ * - `error` / `denied` → reply with the error tail inline. No
7612
+ * fallback (RFC §7 hard-fail contract — operator opted in).
7613
+ */
7614
+ async function dispatchShortVerbViaHostd(
7615
+ ctx: Context,
7616
+ req: HostdRequest,
7617
+ label: string,
7618
+ legacyArgs: string[],
7619
+ ): Promise<void> {
7620
+ const hostdResp = await tryHostdDispatch(getMyAgentName(), req)
7621
+ if (hostdResp === 'not-configured') {
7622
+ warnLegacySpawnIfHostdDisabled(req.op)
7623
+ await runSwitchroomCommand(ctx, legacyArgs, label)
7624
+ return
7625
+ }
7626
+ if (hostdResp.result === 'completed') {
7627
+ const body = hostdResp.stdout_tail?.trim() || `${label}: done (exit ${hostdResp.exit_code})`
7628
+ const formatted = formatSwitchroomOutput(stripAnsi(body))
7629
+ if (formatted) {
7630
+ await switchroomReply(ctx, preBlock(formatted), { html: true })
7631
+ } else {
7632
+ await switchroomReply(ctx, `${label}: done (no output)`)
7633
+ }
7634
+ return
7635
+ }
7636
+ if (hostdResp.result === 'started') {
7637
+ await switchroomReply(
7638
+ ctx,
7639
+ `🔄 <b>${escapeHtmlForTg(label)}</b> dispatched via hostd ` +
7640
+ `(request_id=<code>${escapeHtmlForTg(hostdResp.request_id)}</code>). ` +
7641
+ `Check audit log for completion.`,
7642
+ { html: true },
7643
+ )
7644
+ return
7645
+ }
7646
+ // error / denied — surface inline. RFC §7 hard-fail: no spawn fallback.
7647
+ const errBody =
7648
+ hostdResp.error ??
7649
+ hostdResp.stderr_tail ??
7650
+ hostdResp.stdout_tail ??
7651
+ '(no error tail returned)'
7652
+ await switchroomReply(
7653
+ ctx,
7654
+ `❌ <b>${escapeHtmlForTg(label)} failed via hostd</b> ` +
7655
+ `(result=${escapeHtmlForTg(hostdResp.result)}):\n` +
7656
+ preBlock(stripAnsi(errBody)),
7657
+ { html: true },
7658
+ )
7659
+ }
7660
+
7321
7661
  async function runSwitchroomCommand(ctx: Context, args: string[], label: string): Promise<void> {
7322
7662
  try {
7323
7663
  const output = stripAnsi(switchroomExec(args))
@@ -7364,7 +7704,7 @@ function renderAuthCodeOutcome(outcome: AuthCodeOutcome | null | undefined): str
7364
7704
  case 'pane-not-ready':
7365
7705
  return `Auth pane not ready — tap <b>Retry</b>.`
7366
7706
  case 'timeout':
7367
- return `Still waiting after 2 min — tap <b>Retry</b> or check <code>switchroom auth status</code>.${tail}`
7707
+ return `Still waiting after 2 min — tap <b>Retry</b> or check <code>switchroom auth list</code>.${tail}`
7368
7708
  }
7369
7709
  }
7370
7710
 
@@ -7550,9 +7890,13 @@ function buildAgentAudit(agentName: string): AgentAudit | undefined {
7550
7890
  }
7551
7891
 
7552
7892
  // Build an AgentMetadata snapshot for the current agent by shelling out
7553
- // to `switchroom agent list --json` and `switchroom auth status --json`.
7893
+ // to `switchroom agent list --json` and `switchroom auth show --json`.
7554
7894
  // Best-effort — any missing piece renders as a placeholder in the text
7555
- // templates rather than blocking the reply.
7895
+ // templates rather than blocking the reply. RFC H retired the per-agent
7896
+ // `auth status --json` shape; auth state is now derived from the
7897
+ // broker's fleet-wide `ListStateData` payload via
7898
+ // `buildAuthSummaryFromBroker`, with billingType pulled from the
7899
+ // agent's `.claude.json` (the broker doesn't track plan tier).
7556
7900
  async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
7557
7901
  type AgentListResp = {
7558
7902
  agents: Array<{
@@ -7562,24 +7906,22 @@ async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
7562
7906
  model?: string | null;
7563
7907
  }>
7564
7908
  }
7565
- type AuthStatusResp = {
7566
- agents: Array<{
7567
- name: string; authenticated: boolean; auth_source: string | null;
7568
- subscription_type: string | null; expires_in: string | null;
7569
- }>
7570
- }
7571
7909
  const list = switchroomExecJson<AgentListResp>(['agent', 'list'])
7572
- const auth = switchroomExecJson<AuthStatusResp>(['auth', 'status'])
7910
+ const brokerState = switchroomExecJson<BrokerStateView>(['auth', 'show'])
7573
7911
  const a = list?.agents?.find(x => x.name === agentName) ?? null
7574
- const au = auth?.agents?.find(x => x.name === agentName) ?? null
7575
- const authSummary: AuthSummary | null = au
7576
- ? {
7577
- authenticated: au.authenticated,
7578
- subscription_type: au.subscription_type,
7579
- expires_in: au.expires_in,
7580
- auth_source: au.auth_source,
7581
- }
7582
- : null
7912
+ let claudeJson: ClaudeJsonView | null = null
7913
+ try {
7914
+ const agentDir = resolveAgentDirFromEnv()
7915
+ if (agentDir) {
7916
+ const raw = readFileSync(join(agentDir, '.claude', '.claude.json'), 'utf8')
7917
+ claudeJson = JSON.parse(raw) as ClaudeJsonView
7918
+ }
7919
+ } catch { /* leave null — billingType becomes null in the summary */ }
7920
+ const authSummary: AuthSummary | null = buildAuthSummaryFromBroker(
7921
+ brokerState,
7922
+ agentName,
7923
+ claudeJson,
7924
+ )
7583
7925
  return {
7584
7926
  agentName,
7585
7927
  model: a?.model ?? null,
@@ -7724,14 +8066,24 @@ bot.command('agentstart', async ctx => {
7724
8066
  if (!isAuthorizedSender(ctx)) return
7725
8067
  const name = ctx.match?.trim() || getMyAgentName()
7726
8068
  try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
7727
- await runSwitchroomCommand(ctx, ['agent', 'start', name], `start ${name}`)
8069
+ await dispatchShortVerbViaHostd(
8070
+ ctx,
8071
+ { v: 1, op: 'agent_start', request_id: hostdRequestId('gw-start'), args: { name } },
8072
+ `start ${name}`,
8073
+ ['agent', 'start', name],
8074
+ )
7728
8075
  })
7729
8076
 
7730
8077
  bot.command('stop', async ctx => {
7731
8078
  if (!isAuthorizedSender(ctx)) return
7732
8079
  const name = ctx.match?.trim() || getMyAgentName()
7733
8080
  try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
7734
- await runSwitchroomCommand(ctx, ['agent', 'stop', name], `stop ${name}`)
8081
+ await dispatchShortVerbViaHostd(
8082
+ ctx,
8083
+ { v: 1, op: 'agent_stop', request_id: hostdRequestId('gw-stop'), args: { name } },
8084
+ `stop ${name}`,
8085
+ ['agent', 'stop', name],
8086
+ )
7735
8087
  })
7736
8088
 
7737
8089
  bot.command('restart', async ctx => {
@@ -7771,13 +8123,52 @@ bot.command('restart', async ctx => {
7771
8123
  // of whatever reason the downstream CLI would default to.
7772
8124
  stampUserRestartReason('user: /restart from chat')
7773
8125
  await sweepBeforeSelfRestart()
7774
- spawnSwitchroomDetached(
7775
- ['agent', 'restart', name, '--force'],
7776
- notifyDetachedFailure(chatId, threadId ?? null, `restart ${name}`),
8126
+ const hostdResp = await tryHostdDispatch(getMyAgentName(), {
8127
+ v: 1,
8128
+ op: 'agent_restart',
8129
+ request_id: hostdRequestId('gw-restart'),
8130
+ args: { name, force: true, reason: 'user: /restart from chat' },
8131
+ })
8132
+ if (hostdResp === 'not-configured') {
8133
+ warnLegacySpawnIfHostdDisabled('agent_restart')
8134
+ spawnSwitchroomDetached(
8135
+ ['agent', 'restart', name, '--force'],
8136
+ notifyDetachedFailure(chatId, threadId ?? null, `restart ${name}`),
8137
+ )
8138
+ return
8139
+ }
8140
+ if (hostdResp.result === 'started' || hostdResp.result === 'completed') {
8141
+ // Dispatched via hostd. The recreate will kill this gateway
8142
+ // shortly; the new gateway reads the marker and edits the ack.
8143
+ return
8144
+ }
8145
+ // hostd was attempted but errored/denied — clear marker and surface.
8146
+ clearRestartMarker()
8147
+ await switchroomReply(
8148
+ ctx,
8149
+ `❌ <b>restart ${escapeHtmlForTg(name)} failed via hostd</b> ` +
8150
+ `(result=${escapeHtmlForTg(hostdResp.result)}):\n` +
8151
+ preBlock(hostdResp.error ?? '(no error message)'),
8152
+ { html: true },
7777
8153
  )
7778
8154
  return
7779
8155
  }
7780
- await runSwitchroomCommand(ctx, ['agent', 'restart', name], `restart ${name}`)
8156
+ // Cross-agent /restart <other>. Same hostd-first shape as self-target,
8157
+ // but no restart marker / no self-kill: another agent's container is
8158
+ // about to bounce, not ours. The daemon spawns the work and returns
8159
+ // "started" (per handleAgentRestart at server.ts:466), so the user
8160
+ // sees a brief dispatch ack and the audit log carries the outcome.
8161
+ await dispatchShortVerbViaHostd(
8162
+ ctx,
8163
+ {
8164
+ v: 1,
8165
+ op: 'agent_restart',
8166
+ request_id: hostdRequestId('gw-restart-cross'),
8167
+ args: { name, force: true, reason: `user: /restart ${name} from chat` },
8168
+ },
8169
+ `restart ${name}`,
8170
+ ['agent', 'restart', name],
8171
+ )
7781
8172
  })
7782
8173
 
7783
8174
  // ─── /new and /reset ──────────────────────────────────────────────────────
@@ -7889,9 +8280,30 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
7889
8280
  // /new" / "user: /reset" rather than the downstream CLI default.
7890
8281
  stampUserRestartReason(`user: /${kind} from chat`)
7891
8282
  await sweepBeforeSelfRestart()
7892
- spawnSwitchroomDetached(
7893
- ['agent', 'restart', name, '--force'],
7894
- notifyDetachedFailure(chatId, threadId ?? null, `${kind} ${name}`),
8283
+ const hostdResp = await tryHostdDispatch(getMyAgentName(), {
8284
+ v: 1,
8285
+ op: 'agent_restart',
8286
+ request_id: hostdRequestId(`gw-${kind}`),
8287
+ args: { name, force: true, reason: `user: /${kind} from chat` },
8288
+ })
8289
+ if (hostdResp === 'not-configured') {
8290
+ warnLegacySpawnIfHostdDisabled('agent_restart')
8291
+ spawnSwitchroomDetached(
8292
+ ['agent', 'restart', name, '--force'],
8293
+ notifyDetachedFailure(chatId, threadId ?? null, `${kind} ${name}`),
8294
+ )
8295
+ return
8296
+ }
8297
+ if (hostdResp.result === 'started' || hostdResp.result === 'completed') {
8298
+ return
8299
+ }
8300
+ clearRestartMarker()
8301
+ await switchroomReply(
8302
+ ctx,
8303
+ `❌ <b>${escapeHtmlForTg(kind)} ${escapeHtmlForTg(name)} failed via hostd</b> ` +
8304
+ `(result=${escapeHtmlForTg(hostdResp.result)}):\n` +
8305
+ preBlock(hostdResp.error ?? '(no error message)'),
8306
+ { html: true },
7895
8307
  )
7896
8308
  }
7897
8309
 
@@ -7947,22 +8359,23 @@ bot.command('update', async ctx => {
7947
8359
  // container, which has the switchroom CLI baked in but no docker
7948
8360
  // binary and no /var/run/docker.sock mount. So `switchroom update`'s
7949
8361
  // pull-images and recreate-containers steps would fail with
7950
- // "docker: command not found". Without this guard, the operator
7951
- // sees an opaque "❌ update failed (exit 127)" via
7952
- // notifyDetachedFailure ~5s after the ack.
8362
+ // "docker: command not found".
7953
8363
  //
7954
- // Surface a clean explanation instead, pointing them at the host
7955
- // CLI as the working path. /update (dry-run) does NOT need docker
7956
- // and is unaffected only /update apply.
7957
- if (!isDockerReachable()) {
8364
+ // BYPASSED when hostd is on (#1175 Phase 2 RFC C): hostd runs on the
8365
+ // host with the docker socket mounted, so the in-container docker
8366
+ // dependency goes away. Skip the guard so /update apply can dispatch
8367
+ // through hostd. When hostd is NOT in play, keep the guard so the
8368
+ // operator gets a clean explanation instead of an opaque exit-127.
8369
+ if (!hostdWillBeUsed(getMyAgentName()) && !isDockerReachable()) {
7958
8370
  await switchroomReply(
7959
8371
  ctx,
7960
8372
  `❌ <b>/update apply</b> needs docker access from inside the agent ` +
7961
8373
  `container, but it's not available (no <code>docker</code> binary on ` +
7962
8374
  `PATH, no <code>/var/run/docker.sock</code> mount).\n\n` +
7963
- `On docker installs, run <code>switchroom update</code> from the ` +
7964
- `host shell instead.\n\n` +
7965
- `<i>Tracked as #926 host-side update daemon would close this gap.</i>`,
8375
+ `On docker installs, either run <code>switchroom update</code> from ` +
8376
+ `the host shell, or enable <code>host_control.enabled</code> in ` +
8377
+ `<code>switchroom.yaml</code> and <code>switchroom hostd install</code> ` +
8378
+ `so this verb dispatches through the host-side daemon.`,
7966
8379
  { html: true },
7967
8380
  )
7968
8381
  return
@@ -8036,9 +8449,94 @@ bot.command('update', async ctx => {
8036
8449
  // pinned-progress-card surface is the headline feature per CLAUDE.md;
8037
8450
  // leaving one pinned across the recreate would surprise the operator.
8038
8451
  await sweepBeforeSelfRestart()
8039
- spawnSwitchroomDetached(
8040
- ['update', ...passthrough],
8041
- notifyDetachedFailure(chatId, threadId ?? null, 'update'),
8452
+ const skipImages = passthrough.includes('--skip-images')
8453
+ const rebuild = passthrough.includes('--rebuild')
8454
+ const updateRequestId = hostdRequestId('gw-update')
8455
+ const hostdResp = await tryHostdDispatch(getMyAgentName(), {
8456
+ v: 1,
8457
+ op: 'update_apply',
8458
+ request_id: updateRequestId,
8459
+ args: {
8460
+ ...(skipImages ? { skip_images: true } : {}),
8461
+ ...(rebuild ? { rebuild: true } : {}),
8462
+ },
8463
+ })
8464
+ if (hostdResp === 'not-configured') {
8465
+ warnLegacySpawnIfHostdDisabled('update_apply')
8466
+ spawnSwitchroomDetached(
8467
+ ['update', ...passthrough],
8468
+ notifyDetachedFailure(chatId, threadId ?? null, 'update'),
8469
+ )
8470
+ return
8471
+ }
8472
+ if (hostdResp.result === 'completed') {
8473
+ return
8474
+ }
8475
+ if (hostdResp.result === 'started') {
8476
+ // RFC C §5.3: long-running mutation. Poll get_status until terminal
8477
+ // or until the recreate kills this gateway (whichever happens first).
8478
+ // The success signal is the post-restart greeting card edited into
8479
+ // ackId via the restart marker. The poll is here so that
8480
+ // *fail-before-recreate* (image pull error, scaffold regen crash)
8481
+ // doesn't leave the operator staring at the orphan "🚀 update started"
8482
+ // ack indefinitely. Live repro: PR #1305.
8483
+ void (async () => {
8484
+ // 60s budget: RFC C §5.3 specs `apply` at 30s and `update_apply`
8485
+ // at 60s. Image pulls + scaffold regeneration dominate the wall
8486
+ // clock for update_apply, hence the larger budget. The poll
8487
+ // resolves earlier on any terminal state from the daemon.
8488
+ const terminal = await pollHostdStatus(getMyAgentName(), updateRequestId, {
8489
+ timeoutMs: 60_000,
8490
+ })
8491
+ if (terminal === 'not-configured') return
8492
+ // completed → recreate is about to run / has run; let the post-
8493
+ // restart greeting card handle the success message.
8494
+ if (terminal.result === 'completed') return
8495
+ // Anything else means the daemon's mutation failed before it could
8496
+ // kill us. Edit the ack to surface the tail and clear the marker
8497
+ // so the next gateway boot doesn't render a false success card.
8498
+ clearRestartMarker()
8499
+ const errBody =
8500
+ terminal.error ??
8501
+ terminal.stderr_tail ??
8502
+ terminal.stdout_tail ??
8503
+ '(no error tail returned)'
8504
+ const editedText =
8505
+ `🚀 <b>update started</b> — <b>FAILED</b> via hostd ` +
8506
+ `(result=${escapeHtmlForTg(terminal.result)}):\n` +
8507
+ preBlock(errBody)
8508
+ if (ackId != null) {
8509
+ try {
8510
+ await robustApiCall(
8511
+ () =>
8512
+ lockedBot.api.editMessageText(chatId, ackId!, editedText, {
8513
+ parse_mode: 'HTML',
8514
+ link_preview_options: { is_disabled: true },
8515
+ }),
8516
+ { verb: 'update.poll.editAck' },
8517
+ )
8518
+ } catch {
8519
+ // edit-failed (message deleted, parse error) — fall back to
8520
+ // a fresh reply so the failure isn't silent.
8521
+ try {
8522
+ await switchroomReply(ctx, editedText, { html: true })
8523
+ } catch {}
8524
+ }
8525
+ } else {
8526
+ try {
8527
+ await switchroomReply(ctx, editedText, { html: true })
8528
+ } catch {}
8529
+ }
8530
+ })()
8531
+ return
8532
+ }
8533
+ clearRestartMarker()
8534
+ await switchroomReply(
8535
+ ctx,
8536
+ `❌ <b>/update apply failed via hostd</b> ` +
8537
+ `(result=${escapeHtmlForTg(hostdResp.result)}):\n` +
8538
+ preBlock(hostdResp.error ?? '(no error message)'),
8539
+ { html: true },
8042
8540
  )
8043
8541
  })
8044
8542
 
@@ -8066,6 +8564,81 @@ bot.command('upgrade', async ctx => {
8066
8564
  )
8067
8565
  })
8068
8566
 
8567
+ // /audit hostd — tail/filter the hostd audit log. Mirrors `/vault audit`
8568
+ // in spirit (operator observability over a privileged subsystem from any
8569
+ // admin DM). Admin-gated via ADMIN_COMMAND_NAMES. Shells out to the
8570
+ // `switchroom hostd audit` CLI to read the audit JSONL at
8571
+ // ~/.switchroom/host-control-audit.log — no hostd RPC needed because the file is shared via the host bind mount on docker installs.
8572
+ bot.command('audit', async ctx => {
8573
+ if (!isAuthorizedSender(ctx)) return
8574
+ const arg = (ctx.match ?? '').trim()
8575
+ if (arg === '' || arg === 'help' || arg === '--help') {
8576
+ await switchroomReply(
8577
+ ctx,
8578
+ 'Usage: <code>/audit hostd [--tail N] [--agent &lt;name&gt;] [--op &lt;verb&gt;] [--error]</code>',
8579
+ { html: true },
8580
+ )
8581
+ return
8582
+ }
8583
+ const tokens = arg.split(/\s+/)
8584
+ const sub = tokens[0]
8585
+ if (sub !== 'hostd') {
8586
+ await switchroomReply(
8587
+ ctx,
8588
+ `Unknown audit target <code>${escapeHtmlForTg(sub ?? '')}</code>. ` +
8589
+ `Supported: <code>hostd</code>.`,
8590
+ { html: true },
8591
+ )
8592
+ return
8593
+ }
8594
+ // Build the CLI argv for switchroom hostd audit. Validate each
8595
+ // operator-supplied value to keep argv injection out of the picture.
8596
+ const ALLOWED_OPS = new Set([
8597
+ 'agent_start', 'agent_stop', 'agent_restart', 'apply',
8598
+ 'update_check', 'update_apply', 'update_status', 'upgrade_status',
8599
+ 'get_status', 'doctor', 'fleet_state',
8600
+ ])
8601
+ const argv: string[] = ['hostd', 'audit']
8602
+ for (let i = 1; i < tokens.length; i++) {
8603
+ const t = tokens[i]!
8604
+ if (t === '--error') { argv.push('--error'); continue }
8605
+ if (t === '--tail' || t === '--agent' || t === '--op') {
8606
+ const v = tokens[++i]
8607
+ if (v == null) {
8608
+ await switchroomReply(ctx, `Flag <code>${t}</code> requires a value.`, { html: true })
8609
+ return
8610
+ }
8611
+ if (t === '--tail' && !/^[0-9]{1,4}$/.test(v)) {
8612
+ await switchroomReply(ctx, `<code>--tail</code> must be an integer (1-9999).`, { html: true })
8613
+ return
8614
+ }
8615
+ if (t === '--agent' && !/^[a-z][a-z0-9-]{0,62}$/i.test(v)) {
8616
+ await switchroomReply(ctx, `<code>--agent</code> name has an invalid shape.`, { html: true })
8617
+ return
8618
+ }
8619
+ if (t === '--op' && !ALLOWED_OPS.has(v)) {
8620
+ await switchroomReply(
8621
+ ctx,
8622
+ `Unknown hostd verb <code>${escapeHtmlForTg(v)}</code>. ` +
8623
+ `Known: ${[...ALLOWED_OPS].sort().map(o => `<code>${o}</code>`).join(', ')}.`,
8624
+ { html: true },
8625
+ )
8626
+ return
8627
+ }
8628
+ argv.push(t, v)
8629
+ continue
8630
+ }
8631
+ await switchroomReply(
8632
+ ctx,
8633
+ `Unknown flag <code>${escapeHtmlForTg(t)}</code>. ` +
8634
+ `Allowed: <code>--tail</code>, <code>--agent</code>, <code>--op</code>, <code>--error</code>.`,
8635
+ { html: true },
8636
+ )
8637
+ return
8638
+ }
8639
+ await runSwitchroomCommand(ctx, argv, `hostd audit${argv.length > 2 ? ' …' : ''}`)
8640
+ })
8641
+
8069
8642
  // ─── /approve, /deny, /pending ────────────────────────────────────────────
8070
8643
  // Slash-command alternatives to the inline-button approval flow (useful for
8071
8644
  // desktop-only sessions and power-users). Share pendingPermissions state
@@ -8129,6 +8702,59 @@ async function handlePermissionSlash(ctx: Context, behavior: 'allow' | 'deny'):
8129
8702
  bot.command('approve', async ctx => handlePermissionSlash(ctx, 'allow'))
8130
8703
  bot.command('deny', async ctx => handlePermissionSlash(ctx, 'deny'))
8131
8704
 
8705
+ // ─── Drive folder picker (RFC E §4.1) ───────────────────────────────────
8706
+ // /folders — post a Telegram picker card listing this agent's top-level
8707
+ // Drive folders. Tap [Allow] on a folder to grant the agent
8708
+ // allow_always at doc:gdrive:folder/<id>/**; tap [Browse] to drill in.
8709
+ //
8710
+ // Authorisation: same dmCommandGate as the other operator slash
8711
+ // commands — only allowFrom users can trigger it.
8712
+
8713
+ const folderPickerCache = new FolderListCache()
8714
+
8715
+ function buildFolderPickerDeps(): FolderPickerHandlerDeps {
8716
+ const agentName = getMyAgentName()
8717
+ return {
8718
+ agentName,
8719
+ cache: folderPickerCache,
8720
+ fetchPage: async ({ parent_id, page_token }) => {
8721
+ const handle = await loadFromAuthBroker()
8722
+ if (handle === null) {
8723
+ throw new Error(
8724
+ `auth-broker unreachable for agent ${agentName} — is the broker container running?`,
8725
+ )
8726
+ }
8727
+ return fetchFolderPage({
8728
+ access_token: handle.access_token,
8729
+ ...(parent_id !== undefined ? { parent_id } : {}),
8730
+ ...(page_token !== undefined ? { page_token } : {}),
8731
+ })
8732
+ },
8733
+ approvalRequest: async (args) => {
8734
+ const r = await kernelApprovalRequest({
8735
+ agent_unit: args.agent_unit,
8736
+ scope: args.scope,
8737
+ action: args.action,
8738
+ approver_set: args.approver_set,
8739
+ ...(args.why !== null && args.why !== undefined ? { why: args.why } : {}),
8740
+ ...(args.ttl_ms !== null && args.ttl_ms !== undefined ? { ttl_ms: args.ttl_ms } : {}),
8741
+ })
8742
+ if (r === null || r.state === 'rate_limited') return null
8743
+ return { request_id: r.request_id }
8744
+ },
8745
+ approvalConsume: async (id) => {
8746
+ const r = await kernelApprovalConsume(id)
8747
+ return r !== null && r.consumed
8748
+ },
8749
+ approvalRecord: async (args) => kernelApprovalRecord(args),
8750
+ }
8751
+ }
8752
+
8753
+ bot.command('folders', async ctx => {
8754
+ if (!isAuthorizedSender(ctx)) return
8755
+ await handleFoldersCommand(ctx, buildFolderPickerDeps())
8756
+ })
8757
+
8132
8758
  // /pending — list current pending permission prompts with their ids, so the
8133
8759
  // user can target a specific one via /approve <id> or /deny <id>.
8134
8760
  // Restricted to access.allowFrom DMs to match /approve and /deny — it
@@ -8160,16 +8786,12 @@ bot.command('interrupt', async ctx => {
8160
8786
  await runSwitchroomCommand(ctx, ['agent', 'interrupt', name], `interrupt ${name}`)
8161
8787
  })
8162
8788
 
8163
- // Shared auto-fallback state. `lockout` is a per-process in-memory
8164
- // guard against rapid re-fire between the scheduled poll and any
8165
- // manual trigger (see telegram-plugin/auto-fallback.ts).
8166
- //
8167
- // Pre-#417 fix this was always emptyLockout() at process start, so a
8168
- // gateway restart inside the cooldown window reset the timer and a
8169
- // quota-flap on the recovering slot could re-trigger fallback the
8170
- // moment the gateway came back. We now seed from disk on first use
8171
- // and persist on every transition. Errors are swallowed: losing the
8172
- // lockout file just degrades to in-memory-only behaviour.
8789
+ // Persist-ops bundle for the legacy auto-fallback lockout file. The
8790
+ // only remaining reader is `isAutoFallbackCooldownActive` (line ~2030)
8791
+ // used by the pending-restart drain cap to defer a forced restart
8792
+ // stacking on top of an in-flight slot rotation. The legacy poller
8793
+ // that USED to write this file was retired alongside this refactor;
8794
+ // existing on-disk lockouts age out via DEFAULT_FALLBACK_COOLDOWN_MS.
8173
8795
  const lockoutOps: LockoutPersistOps = {
8174
8796
  readFileSync: (p, enc) => readFileSync(p, enc),
8175
8797
  writeFileSync: (p, data, opts) => writeFileSync(p, data, opts),
@@ -8177,24 +8799,6 @@ const lockoutOps: LockoutPersistOps = {
8177
8799
  mkdirSync: (p, opts) => mkdirSync(p, opts),
8178
8800
  joinPath: (...parts) => join(...parts),
8179
8801
  }
8180
- let autoFallbackLockout: LockoutRecord = emptyLockout()
8181
- let autoFallbackLockoutSeeded = false
8182
- function seedAutoFallbackLockoutIfNeeded(agentDir: string): void {
8183
- if (autoFallbackLockoutSeeded) return
8184
- autoFallbackLockoutSeeded = true
8185
- try {
8186
- autoFallbackLockout = loadLockout(agentDir, lockoutOps)
8187
- } catch (err) {
8188
- process.stderr.write(`telegram gateway: auto-fallback lockout seed failed (using empty): ${(err as Error).message}\n`)
8189
- }
8190
- }
8191
- function persistLockout(agentDir: string): void {
8192
- try {
8193
- saveLockout(agentDir, autoFallbackLockout, lockoutOps)
8194
- } catch (err) {
8195
- process.stderr.write(`telegram gateway: auto-fallback lockout persist failed: ${(err as Error).message}\n`)
8196
- }
8197
- }
8198
8802
 
8199
8803
  // Pinned slot-banner state (#421). One banner per gateway process,
8200
8804
  // in the owner chat (access.allowFrom[0]). Per-topic forum support
@@ -8225,91 +8829,123 @@ async function refreshPinnedBanner(reason: string): Promise<void> {
8225
8829
  }
8226
8830
  }
8227
8831
 
8228
- type AutoFallbackCheckResult =
8229
- | { kind: 'no-action'; reason: string; decision: 'noop' | 'fallback-skipped' }
8230
- | { kind: 'executed'; previousSlot: string; newSlot: string }
8231
- | { kind: 'exhausted-all'; activeSlot: string }
8232
- | { kind: 'error'; message: string }
8832
+ /**
8833
+ * Re-entry guard + dedup window for `fireFleetAutoFallback`. The state
8834
+ * was lifted into `fleet-fallback-gate.ts` so it can be tested in
8835
+ * isolation (gateway.ts module state was unreachable from vitest). The
8836
+ * gate ALSO enforces the broker-reachability honesty contract: when the
8837
+ * broker is down, `wouldFire()` returns false so the model-unavailable
8838
+ * card stays honest instead of advertising a swap that would bail with
8839
+ * `reason=no-broker-client`.
8840
+ */
8841
+ const FLEET_FALLBACK_DEDUP_MS = 30_000
8842
+
8843
+ /** Synchronous reachability check for the auth-broker UDS. Used by the
8844
+ * fleet-fallback gate to keep the model-unavailable card honest: if the
8845
+ * broker socket isn't bound, the dispatcher would bail with
8846
+ * `reason=no-broker-client`, so `wouldFire()` should return false and
8847
+ * the card should fall back to the manual `/auth use <label>` hint. */
8848
+ function isAuthBrokerSocketReachable(): boolean {
8849
+ try {
8850
+ return existsSync(resolveAuthBrokerSocketPath())
8851
+ } catch {
8852
+ return false
8853
+ }
8854
+ }
8855
+
8856
+ const fleetFallbackGate = createFleetFallbackGate({
8857
+ dedupMs: FLEET_FALLBACK_DEDUP_MS,
8858
+ brokerReachable: isAuthBrokerSocketReachable,
8859
+ })
8860
+
8861
+ function wouldFireFleetAutoFallback(): boolean {
8862
+ return fleetFallbackGate.wouldFire()
8863
+ }
8864
+
8865
+ /**
8866
+ * Fleet-wide auto-fallback dispatcher (RFC H follow-up).
8867
+ *
8868
+ * Wired from the model-unavailable card render path so a quota-out
8869
+ * event on ANY agent immediately triggers a fleet-wide swap (via
8870
+ * broker.setActive — same path /auth use takes), not the per-agent
8871
+ * legacy `runAutoFallbackCheck`. Pre-fix, the card path never called
8872
+ * any fallback machinery; the scheduled poller (60-min interval, only
8873
+ * fires on utilization headers) was the only trigger and missed
8874
+ * hard-rejection events.
8875
+ *
8876
+ * Concurrency: collapses concurrent triggers via the in-flight
8877
+ * promise tracked by `fleetFallbackGate`. Subsequent calls within `FLEET_FALLBACK_DEDUP_MS` of
8878
+ * a recent fire are dropped silently — the broadcast announcement is
8879
+ * the user-visible signal that the swap happened, no need to repeat.
8880
+ *
8881
+ * Fire-and-forget: never throws into the caller's flow. Posts the
8882
+ * causal-shape announcement to every chat in `loadAccess().allowFrom`
8883
+ * so the user sees the outcome inline with the original "Model
8884
+ * unavailable" card.
8885
+ */
8886
+ async function fireFleetAutoFallback(triggerAgent: string): Promise<void> {
8887
+ return fleetFallbackGate.fire(
8888
+ () => doFireFleetAutoFallback(triggerAgent),
8889
+ (err) => {
8890
+ process.stderr.write(
8891
+ `telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
8892
+ )
8893
+ },
8894
+ )
8895
+ }
8233
8896
 
8234
- async function runAutoFallbackCheck(opts: { trigger: 'scheduled' | 'manual' }): Promise<AutoFallbackCheckResult> {
8235
- // All log lines in this path use the `[autofallback]` tag so a single
8236
- // grep against journalctl reconstructs the full decision history of
8237
- // a slot rotation: `journalctl -u switchroom-<agent>-gateway -g autofallback`.
8897
+ /** Returns true iff the dispatcher actually performed a swap (and the
8898
+ * user-visible announcement was broadcast). False on no-op /
8899
+ * error / idempotent-skip caller uses this to decide whether to
8900
+ * arm the post-fire suppression window. */
8901
+ async function doFireFleetAutoFallback(triggerAgent: string): Promise<boolean> {
8238
8902
  try {
8239
- const agentDir = resolveAgentDirFromEnv()
8240
- if (!agentDir) {
8241
- return { kind: 'no-action', reason: 'no agent dir', decision: 'noop' }
8242
- }
8243
- const agentName = getMyAgentName()
8244
- seedAutoFallbackLockoutIfNeeded(agentDir)
8245
- const active = currentActiveSlot(agentDir)
8246
- const quota = await fetchQuota({ claudeConfigDir: join(agentDir, '.claude') })
8247
- const decision = evaluateFallbackTrigger({
8248
- quota,
8249
- activeSlot: active,
8250
- now: Date.now(),
8251
- lockout: autoFallbackLockout,
8252
- })
8253
- if (decision.action !== 'fallback') {
8903
+ const client = await getAuthBrokerClient(triggerAgent)
8904
+ if (!client) {
8254
8905
  process.stderr.write(
8255
- `telegram gateway: [autofallback] noop trigger=${opts.trigger} agent=${agentName} active=${active ?? 'none'} reason=${decision.reason}\n`,
8906
+ `telegram gateway: [fleet-fallback] skipped agent=${triggerAgent} reason=no-broker-client\n`,
8256
8907
  )
8257
- return { kind: 'no-action', reason: decision.reason, decision: 'noop' }
8908
+ return false
8258
8909
  }
8259
- process.stderr.write(
8260
- `telegram gateway: [autofallback] decision=fallback trigger=${opts.trigger} agent=${agentName} active=${active ?? 'none'} reason=${decision.triggerReason} util=${decision.utilizationPct?.toFixed(1) ?? 'n/a'}%\n`,
8910
+ const state = await client.listState()
8911
+ // Probe live quota for every account in parallel. force:true
8912
+ // bypasses the 5-min in-process cache — we want the freshest data
8913
+ // for the swap decision, not a cached stale read.
8914
+ const quotas = await Promise.all(
8915
+ state.accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
8261
8916
  )
8262
- const plan = performAutoFallback({
8263
- agentDir,
8264
- agentName,
8265
- decision,
8266
- deps: { currentActiveSlot, markSlotQuotaExhausted, fallbackToNextSlot },
8917
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
8918
+ const outcome = await runFleetAutoFallback({
8919
+ state,
8920
+ quotas,
8921
+ setActive: (label) => client.setActive(label),
8922
+ triggerAgent,
8923
+ tz,
8267
8924
  })
8268
- const ownerChatId = loadAccess().allowFrom[0]
8269
- await dispatchFallbackNotification({
8270
- bot,
8271
- ownerChatId,
8272
- plan,
8273
- onError: (err) => {
8274
- process.stderr.write(`telegram gateway: [autofallback] notify failed trigger=${opts.trigger} agent=${agentName}: ${err}\n`)
8275
- },
8276
- })
8277
- if (plan.kind === 'executed') {
8278
- try { assertSafeAgentName(plan.agentName) }
8279
- catch {
8280
- process.stderr.write(`telegram gateway: [autofallback] invalid-agent-name agent=${plan.agentName}\n`)
8281
- return { kind: 'error', message: `invalid agent name: ${plan.agentName}` }
8282
- }
8283
- try {
8284
- // Preemptive failover (utilization-over-threshold / explicit) waits
8285
- // for the active turn to drain. Reactive failover (429-response)
8286
- // hard-restarts because the request that triggered it has already
8287
- // failed — there's no in-flight turn worth preserving. See #420.
8288
- const restartArgs = ['agent', 'restart', plan.agentName]
8289
- if (plan.triggerReason !== '429-response') {
8290
- restartArgs.push('--graceful-restart')
8291
- }
8292
- process.stderr.write(
8293
- `telegram gateway: [autofallback] executed agent=${plan.agentName} prev=${plan.previousSlot} next=${plan.newSlot} restart=${plan.triggerReason === '429-response' ? 'hard' : 'graceful'}\n`,
8294
- )
8295
- switchroomExec(restartArgs)
8296
- } catch (err) {
8297
- process.stderr.write(`telegram gateway: [autofallback] restart failed agent=${plan.agentName}: ${err}\n`)
8298
- }
8299
- autoFallbackLockout = nextLockout(plan.previousSlot, Date.now())
8300
- persistLockout(agentDir)
8301
- void refreshPinnedBanner('auto-fallback')
8302
- return { kind: 'executed', previousSlot: plan.previousSlot, newSlot: plan.newSlot }
8303
- }
8304
8925
  process.stderr.write(
8305
- `telegram gateway: [autofallback] exhausted-all agent=${agentName} active=${plan.activeSlot}\n`,
8926
+ `telegram gateway: [fleet-fallback] outcome=${outcome.kind} agent=${triggerAgent}` +
8927
+ (outcome.kind === 'switched' ? ` old=${outcome.oldLabel} new=${outcome.newLabel}` : '') +
8928
+ '\n',
8306
8929
  )
8307
- autoFallbackLockout = nextLockout(plan.activeSlot, Date.now())
8308
- persistLockout(agentDir)
8309
- return { kind: 'exhausted-all', activeSlot: plan.activeSlot }
8930
+ // Post the announcement to every authorized chat. Mirrors the
8931
+ // operator-event broadcast pattern (line ~2290) — DM-only opts
8932
+ // (no message_thread_id) so THREAD_NOT_FOUND can't fire here;
8933
+ // wrap in swallowingApiCall anyway per the codebase rule.
8934
+ const access = loadAccess()
8935
+ if (access.allowFrom.length === 0) return outcome.kind === 'switched'
8936
+ const opts = { parse_mode: 'HTML' as const }
8937
+ for (const chat_id of access.allowFrom) {
8938
+ void swallowingApiCall(
8939
+ () => bot.api.sendMessage(chat_id, outcome.announcement, opts),
8940
+ { chat_id, verb: 'fleet-fallback:notify' },
8941
+ )
8942
+ }
8943
+ return outcome.kind === 'switched'
8310
8944
  } catch (err) {
8311
- process.stderr.write(`telegram gateway: [autofallback] ${opts.trigger} poll error: ${err}\n`)
8312
- return { kind: 'error', message: String((err as Error).message ?? err) }
8945
+ process.stderr.write(
8946
+ `telegram gateway: [fleet-fallback] error agent=${triggerAgent}: ${(err as Error)?.message ?? err}\n`,
8947
+ )
8948
+ return false
8313
8949
  }
8314
8950
  }
8315
8951
 
@@ -8346,14 +8982,19 @@ async function runCreditWatch(): Promise<void> {
8346
8982
  // assumption mirrors auto-fallback's notification routing.
8347
8983
  const access = loadAccess()
8348
8984
  for (const chat_id of access.allowFrom) {
8349
- try {
8350
- await bot.api.sendMessage(chat_id, decision.message, {
8351
- parse_mode: 'HTML',
8352
- link_preview_options: { is_disabled: true },
8353
- })
8354
- } catch (err) {
8355
- process.stderr.write(`telegram gateway: credit-watch notify chat=${chat_id} failed: ${err}\n`)
8356
- }
8985
+ // Credit-watch notify — best-effort. Wrap via swallowingApiCall so
8986
+ // flood-wait / deleted-chat / not-found surface as a stderr log
8987
+ // rather than a thrown exception that aborts the loop and leaves
8988
+ // half the allowFrom chats unnotified. Matches the wrapping
8989
+ // contract enforced by scripts/check-bot-api-wrapping.sh (#1075).
8990
+ await swallowingApiCall(
8991
+ () =>
8992
+ bot.api.sendMessage(chat_id, decision.message, {
8993
+ parse_mode: 'HTML',
8994
+ link_preview_options: { is_disabled: true },
8995
+ }),
8996
+ { chat_id, verb: 'credit-watch.notify' },
8997
+ )
8357
8998
  }
8358
8999
  // Persist state regardless of whether send succeeded — losing a
8359
9000
  // notify is bad, but re-spamming on every poll tick is worse.
@@ -8364,438 +9005,152 @@ async function runCreditWatch(): Promise<void> {
8364
9005
  }
8365
9006
  }
8366
9007
 
8367
- // /authfallback was removed in v0.6.12 — it duplicated the work of
8368
- // the dashboard's Switch primary picker (operator-facing surface) and
8369
- // the auto-fallback poller (transparent on-quota-wall case).
8370
- // Operators who want to manually shuffle the active credential now
8371
- // use the picker. The `runAutoFallbackCheck` function and the
8372
- // `case 'fallback':` callback dispatch stay in the codebase: any
8373
- // pinned messages from earlier versions still work, and the
8374
- // auto-fallback poller still calls runAutoFallbackCheck directly.
8375
-
8376
- bot.command('auth', async ctx => {
9008
+ bot.command("auth", async ctx => {
8377
9009
  if (!isAuthorizedSender(ctx)) return
8378
- const parts = getCommandArgs(ctx).split(/\s+/).filter(Boolean)
9010
+ const text = ctx.message?.text ?? ""
9011
+ const parsed = parseAuthCommand(text)
9012
+ if (!parsed) return
8379
9013
  const currentAgent = getMyAgentName()
8380
- const intent = parseAuthSubCommand(parts, currentAgent)
8381
-
8382
- if (intent.kind === 'error' || intent.kind === 'usage') {
8383
- await switchroomReply(ctx, intent.message)
8384
- return
8385
- }
8386
-
8387
- if (intent.kind === 'login' || intent.kind === 'reauth' || intent.kind === 'link') {
8388
- await runSwitchroomAuthCommand(ctx, intent.cliArgs, intent.label)
8389
- if (intent.registerReauth) pendingReauthFlows.set(String(ctx.chat!.id), { agent: intent.agent, startedAt: Date.now() })
8390
- return
8391
- }
8392
- if (intent.kind === 'code') {
8393
- // Use structured JSON path so we can render typed outcome messages.
8394
- const { result, errorText } = execAuthCode(intent.agent, intent.code)
8395
- if (errorText) {
8396
- await switchroomReply(ctx, `<b>${escapeHtmlForTg(intent.label)} failed:</b>\n${preBlock(formatSwitchroomOutput(errorText))}`, { html: true })
8397
- } else if (result) {
8398
- const outcomeMsg = renderAuthCodeOutcome(result.outcome)
8399
- if (outcomeMsg) {
8400
- await switchroomReply(ctx, outcomeMsg, { html: true })
8401
- } else {
8402
- const output = result.instructions.join('\n')
8403
- const formatted = formatAuthOutputForTelegram(output)
8404
- await switchroomReply(ctx, formatted.text, { html: true })
8405
- }
8406
- }
8407
- pendingReauthFlows.delete(String(ctx.chat!.id))
8408
- // Redact the OAuth code from chat history (#488).
8409
- redactAuthCodeMessage(bot.api as never, String(ctx.chat!.id), ctx.message?.message_id ?? null, line => process.stderr.write(line))
8410
- return
8411
- }
8412
- if (intent.kind === 'cancel') {
8413
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8414
- pendingReauthFlows.delete(String(ctx.chat!.id))
8415
- return
8416
- }
8417
-
8418
- // --- Slot management verbs ---
8419
-
8420
- if (intent.kind === 'add') {
8421
- await runSwitchroomAuthCommand(ctx, intent.cliArgs, intent.label)
8422
- pendingReauthFlows.set(String(ctx.chat!.id), { agent: intent.agent, startedAt: Date.now() })
8423
- void refreshPinnedBanner('auth-add')
8424
- return
8425
- }
8426
-
8427
- if (intent.kind === 'use') {
8428
- // Soft-confirm: a slot swap restarts the agent process, killing any
8429
- // in-flight turn. If a turn is currently running, refuse without
8430
- // --force so a fat-finger tap doesn't quietly destroy the user's
8431
- // work-in-progress. Idle-state swaps proceed with no friction. (#421B)
8432
- if (!intent.force && activeTurnStartedAt.size > 0) {
9014
+ // Post-unification admin gating: admin authority is sourced from each
9015
+ // agent's own `admin: true` flag (the same flag that gates /agents,
9016
+ // /restart, /update etc. per PR #1258). The gateway looks itself up
9017
+ // and passes a boolean through — handler-side code does not consult
9018
+ // any list. The broker enforces server-side from the same source.
9019
+ let isAdmin = false
9020
+ try {
9021
+ const cfg = loadSwitchroomConfig()
9022
+ const me = (cfg as unknown as { agents?: Record<string, { admin?: boolean }> })?.agents?.[currentAgent]
9023
+ isAdmin = me?.admin === true
9024
+ } catch { /* best-effort — non-admin is the safe default */ }
9025
+
9026
+ // `/auth add` and `/auth cancel` are gateway-routed (drive a
9027
+ // scratch-dir-backed `claude setup-token` lifecycle the broker
9028
+ // client can't model). Everything else delegates to
9029
+ // handleAuthCommand which only needs the narrow broker surface.
9030
+ const chatId = String(ctx.chat?.id ?? '')
9031
+ if (parsed.kind === 'add' || parsed.kind === 'cancel') {
9032
+ if (!isAuthAdmin({ isAdmin })) {
8433
9033
  await switchroomReply(
8434
9034
  ctx,
8435
- `⚠️ A turn is in flight. Swapping to <code>${escapeHtmlForTg(intent.slot)}</code> will abort it.\n` +
8436
- `Resend as <code>/auth use ${escapeHtmlForTg(intent.agent)} ${escapeHtmlForTg(intent.slot)} --force</code> to proceed.`,
9035
+ `<b>Not authorized.</b> <code>/auth ${parsed.kind}</code> is admin-only.\n` +
9036
+ `Set <code>admin: true</code> on this agent in switchroom.yaml to unlock ` +
9037
+ `(the same flag that gates <code>/agents</code>, <code>/restart</code>, ` +
9038
+ `<code>/update</code> etc.).`,
8437
9039
  { html: true },
8438
9040
  )
8439
9041
  return
8440
9042
  }
8441
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8442
- // Restart the agent so the new OAuth token is picked up.
8443
- try { assertSafeAgentName(intent.agent) } catch { return }
8444
- await runSwitchroomCommand(ctx, ['agent', 'restart', intent.agent], `restart ${intent.agent}`)
8445
- void refreshPinnedBanner('auth-use')
8446
- return
8447
- }
8448
-
8449
- if (intent.kind === 'list') {
8450
- await runSwitchroomCommandFormatted(ctx, intent.cliArgs, intent.label, () => {
8451
- const data = switchroomExecJson<SlotListingFromCli>(intent.cliArgs)
8452
- if (!data) return null
8453
- return formatSlotList({ ...data, agent: data.agent ?? intent.agent })
8454
- })
8455
- return
8456
- }
8457
-
8458
- if (intent.kind === 'rm') {
8459
- // Safety check against current slot listing unless --force.
8460
- if (!intent.force) {
8461
- const listing = switchroomExecJson<SlotListingFromCli>(['auth', 'list', intent.agent, '--json'])
8462
- if (listing) {
8463
- const err = checkRemoveSafety({ ...listing, agent: listing.agent ?? intent.agent }, intent.slot, intent.force)
8464
- if (err) { await switchroomReply(ctx, err); return }
9043
+ if (parsed.kind === 'cancel') {
9044
+ const existing = pendingAuthAddFlows.get(chatId)
9045
+ if (!existing) {
9046
+ await switchroomReply(ctx, "<i>No pending <code>/auth add</code> flow in this chat.</i>", { html: true })
9047
+ return
8465
9048
  }
9049
+ cancelAccountAuthSession(existing)
9050
+ pendingAuthAddFlows.delete(chatId)
9051
+ await switchroomReply(ctx, "Cancelled.", { html: true })
9052
+ return
8466
9053
  }
8467
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8468
- return
8469
- }
8470
-
8471
- // --- New account-shaped verbs (see reference/share-auth-across-the-fleet.md) ---
8472
-
8473
- if (intent.kind === 'account-add') {
8474
- // /auth account add <label> [--from-agent <name>]
8475
- // Lifts an already-authenticated agent's credentials into a global
8476
- // account so other agents can share the same Anthropic subscription
8477
- // without each running its own OAuth flow.
8478
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8479
- return
8480
- }
8481
-
8482
- if (intent.kind === 'account-list') {
8483
- // /auth account list — table of accounts + which agents use each.
8484
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8485
- return
8486
- }
8487
-
8488
- if (intent.kind === 'account-rm') {
8489
- // /auth account rm <label> — refused if any agent is still enabled.
8490
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8491
- return
8492
- }
8493
-
8494
- if (intent.kind === 'account-rename') {
8495
- // /auth account rename <old> <new> — atomic dir rename + YAML
8496
- // rewrite of every agents.<name>.auth.accounts list. No agent
8497
- // restart required: per-agent credentials.json content is
8498
- // unchanged (only the source-of-truth label moved).
8499
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8500
- return
8501
- }
8502
-
8503
- if (intent.kind === 'enable') {
8504
- // /auth enable <label> [agents...|all] — wires the account to those agents
8505
- // (defaults to the current agent), then restarts each so claude picks
8506
- // up the freshly fanned-out credentials. The CLI accepts the `all`
8507
- // keyword verbatim and expands it itself; we expand here too so the
8508
- // restart loop knows the real agent names.
8509
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8510
- if (intent.restartAgentsAfter) {
8511
- const restartTargets = await resolveAgentsForRestart(intent.agents)
8512
- for (const a of restartTargets) {
8513
- try { assertSafeAgentName(a) } catch { continue }
8514
- await runSwitchroomCommand(ctx, ['agent', 'restart', a], `restart ${a}`)
8515
- }
9054
+ // parsed.kind === 'add'
9055
+ if (pendingAuthAddFlows.has(chatId)) {
9056
+ await switchroomReply(
9057
+ ctx,
9058
+ "<i>An <code>/auth add</code> flow is already in progress for this chat. " +
9059
+ "Finish the paste, or send <code>/auth cancel</code> to abort.</i>",
9060
+ { html: true },
9061
+ )
9062
+ return
8516
9063
  }
8517
- void refreshPinnedBanner('auth-enable')
8518
- return
8519
- }
8520
-
8521
- if (intent.kind === 'disable') {
8522
- // /auth disable <label> [agents...|all] — unwires the account from those
8523
- // agents. Doesn't auto-restart: the operator may want to drain the
8524
- // current credential first. The CLI hint already says "restart now".
8525
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8526
- return
8527
- }
8528
-
8529
- if (intent.kind === 'share') {
8530
- // /auth share <label> [--from-agent <name>] — one-shot account-add +
8531
- // enable on every claude-enabled agent. The CLI does the merged YAML
8532
- // write; we restart every agent it touched so credentials load.
8533
- await runSwitchroomCommand(ctx, intent.cliArgs, intent.label)
8534
- const restartTargets = await resolveAgentsForRestart(['all'])
8535
- for (const a of restartTargets) {
8536
- try { assertSafeAgentName(a) } catch { continue }
8537
- await runSwitchroomCommand(ctx, ['agent', 'restart', a], `restart ${a}`)
9064
+ try {
9065
+ const { loginUrl, scratchDir, child } = await startAccountAuthSession(parsed.label)
9066
+ pendingAuthAddFlows.set(chatId, {
9067
+ label: parsed.label,
9068
+ scratchDir,
9069
+ child,
9070
+ startedAt: Date.now(),
9071
+ })
9072
+ await switchroomReply(
9073
+ ctx,
9074
+ `<b>Adding account</b> <code>${escapeHtmlForTg(parsed.label)}</code>\n\n` +
9075
+ `1. Open this URL on your phone:\n${loginUrl}\n\n` +
9076
+ `2. Log into Anthropic, copy the code Claude shows.\n` +
9077
+ `3. Paste it back here.\n\n` +
9078
+ `Send <code>/auth cancel</code> to abort.`,
9079
+ { html: true },
9080
+ )
9081
+ } catch (err) {
9082
+ await switchroomReply(
9083
+ ctx,
9084
+ `<b>/auth add failed:</b> ${escapeHtmlForTg((err as Error)?.message ?? String(err))}`,
9085
+ { html: true },
9086
+ )
8538
9087
  }
8539
- void refreshPinnedBanner('auth-share')
8540
9088
  return
8541
9089
  }
8542
9090
 
8543
- // intent.kind === 'status' — render the inline-keyboard dashboard.
8544
- // For the dashboard we're the bot-bound agent: we don't list every
8545
- // agent in the switchroom config; we show THIS bot's agent with its
8546
- // slots and actions. The 'status' branch of AuthIntent has no
8547
- // `agent` field; use currentAgent as the dashboard target.
8548
- await sendAuthDashboard(ctx, currentAgent)
8549
- })
8550
-
8551
- /**
8552
- * Gather DashboardState for an agent and send the dashboard as a fresh
8553
- * message (on `/auth` command) or editMessageText (on callback refresh).
8554
- *
8555
- * Implementation note: we could poll fetchQuota here to populate the
8556
- * fiveHour/sevenDay utilization per slot. Skipping for the initial
8557
- * landing — quota-check is expensive (one Anthropic API call per poll)
8558
- * and the background auto-fallback already surfaces quota-exhausted
8559
- * state. Dashboard renders the CLI-side health badges and omits
8560
- * utilization numbers when they're absent; a future PR can wire
8561
- * quota-check in.
8562
- */
8563
- async function sendAuthDashboard(
8564
- ctx: Context,
8565
- agent: string,
8566
- opts: { edit?: boolean } = {},
8567
- ): Promise<void> {
8568
- const state = fetchDashboardState(agent)
8569
- if (!state) {
8570
- await switchroomReply(
8571
- ctx,
8572
- `<b>/auth</b> — no data (agent "${escapeHtmlForTg(agent)}" missing from switchroom.yaml or CLI unreachable)`,
8573
- { html: true },
8574
- )
9091
+ const client = await getAuthBrokerClient(currentAgent)
9092
+ if (!client) {
9093
+ await switchroomReply(ctx, "<b>/auth unavailable:</b> auth-broker client is not loaded (post-RFC-H rewire in progress?).", { html: true })
8575
9094
  return
8576
9095
  }
8577
- const { text, keyboard } = buildDashboard(state)
8578
- if (opts.edit && ctx.callbackQuery) {
8579
- try {
8580
- await ctx.editMessageText(text, { parse_mode: 'HTML', reply_markup: keyboard, link_preview_options: { is_disabled: true } })
8581
- return
8582
- } catch {
8583
- // Message may have been deleted or identical content
8584
- // (editMessageText throws MESSAGE_NOT_MODIFIED) — fall through
8585
- // to sending a new one.
8586
- }
8587
- }
8588
- await switchroomReply(ctx, text, { html: true, reply_markup: keyboard })
8589
- }
8590
-
8591
- /**
8592
- * Drop the cached per-account quota and immediately schedule a
8593
- * background re-probe for every known account. Used after auth-mutating
8594
- * dashboard taps (enable/disable/promote/share/account-rm-confirm) so
8595
- * the next `/auth` render shows fresh quota rather than a 30s-stale
8596
- * snapshot from before the change.
8597
- *
8598
- * Fire-and-forget: probes complete in ~hundreds of ms; the user's
8599
- * follow-up dashboard render reads whatever's cached at that moment,
8600
- * usually the freshly-warmed value. Errors (network, rate limit) are
8601
- * absorbed by `prefetchAccountQuotaIfStale`.
8602
- */
8603
- function clearAndRewarmAccountQuotas(): void {
8604
- clearAccountQuotaCache()
8605
- try {
8606
- const accounts = switchroomExecJson<Array<{ label: string }>>([
8607
- 'auth', 'account', 'list', '--json',
8608
- ])
8609
- if (Array.isArray(accounts)) {
8610
- for (const a of accounts) {
8611
- if (typeof a?.label === 'string') prefetchAccountQuotaIfStale(a.label)
9096
+ const reply = await handleAuthCommand(parsed, {
9097
+ agentName: currentAgent,
9098
+ isAdmin,
9099
+ client,
9100
+ chatId,
9101
+ // Format 2 enricher — probe live quota for every account in
9102
+ // parallel so the snapshot reflects current Anthropic-side
9103
+ // utilization, not the broker's potentially-days-stale
9104
+ // disk-cached `quota.json`. force:true bypasses the 5-min
9105
+ // in-process cache for this call. ~500-800ms per account
9106
+ // serial; in parallel ~800ms total for typical 3-account
9107
+ // fleets — acceptable for an interactive command.
9108
+ liveQuotas: async (accounts) =>
9109
+ Promise.all(
9110
+ accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
9111
+ ),
9112
+ tz: process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ,
9113
+ })
9114
+ // Translate the handler's optional keyboard shape into grammy's
9115
+ // `reply_markup`. Buttons with `callbackData` become callback_data;
9116
+ // buttons with `insertText` become switch_inline_query_current_chat
9117
+ // (taps paste the slash-command into the user's input). Keep a
9118
+ // safe default for buttons missing both (shouldn't happen).
9119
+ if (reply.keyboard && reply.keyboard.length > 0) {
9120
+ // Build via grammy's InlineKeyboard so the type is correct
9121
+ // for switchroomReply's reply_markup field — no `as never`
9122
+ // cast needed.
9123
+ const kb = new InlineKeyboard()
9124
+ for (let i = 0; i < reply.keyboard.length; i++) {
9125
+ const row = reply.keyboard[i]!
9126
+ for (const b of row) {
9127
+ if (b.callbackData) kb.text(b.text, b.callbackData)
9128
+ else if (b.insertText) kb.switchInlineCurrent(b.text, b.insertText)
9129
+ else kb.text(b.text, 'auth:noop')
8612
9130
  }
9131
+ // grammy's row terminator — except after the last row.
9132
+ if (i < reply.keyboard.length - 1) kb.row()
8613
9133
  }
8614
- } catch {
8615
- /* clear-only fallback — next dashboard render's lazy prefetch will warm */
8616
- }
8617
- }
8618
-
8619
- function fetchDashboardState(agent: string): DashboardState | null {
8620
- // Slots come from switchroom auth list --json.
8621
- let slots: DashboardSlot[] = []
8622
- try {
8623
- const listing = switchroomExecJson<SlotListingFromCli>(['auth', 'list', agent, '--json'])
8624
- if (listing && Array.isArray(listing.slots)) {
8625
- slots = listing.slots.map((s) => ({
8626
- slot: s.slot,
8627
- active: s.active,
8628
- health: (s.health as SlotHealth) ?? 'missing',
8629
- quotaExhaustedUntil: s.quota_exhausted_until ?? null,
8630
- fiveHourPct: null,
8631
- sevenDayPct: null,
8632
- }))
8633
- }
8634
- } catch {
8635
- return null
8636
- }
8637
-
8638
- // Plan + bank + rateLimitTier come from switchroom auth status for
8639
- // THIS agent. rateLimitTier is the signal users need to verify the
8640
- // correct Anthropic account got authorized during reauth (e.g.
8641
- // max_5x vs max_20x). See 2026-04-22 account-mismatch discussion.
8642
- let plan: string | null = null
8643
- let rateLimitTier: string | null = null
8644
- const bankId = agent
8645
- try {
8646
- type AuthStatusResp = { agents: Array<{ name: string; subscription_type: string | null; rate_limit_tier?: string | null }> }
8647
- const statusData = switchroomExecJson<AuthStatusResp>(['auth', 'status'])
8648
- const thisAgent = statusData?.agents?.find((a) => a.name === agent)
8649
- if (thisAgent?.subscription_type) plan = thisAgent.subscription_type
8650
- if (thisAgent?.rate_limit_tier) rateLimitTier = thisAgent.rate_limit_tier
8651
- } catch {
8652
- /* best-effort */
8653
- }
8654
-
8655
- // Check for a pending auth session on disk. When present, surface it
8656
- // on the dashboard so the user can tap [♻️ Restart flow] without
8657
- // waiting for the automatic stale-session detection to fire (which
8658
- // only fires on actual PKCE challenge drift).
8659
- const pendingSessionSlot = readPendingSessionSlot(agent)
8660
-
8661
- // Account-level state for the dashboard's accounts section. The CLI
8662
- // emits a sorted, JSON array via `auth account list --json` (added
8663
- // in v0.6.x). We map it to the dashboard's `AccountSummary` shape,
8664
- // computing `enabledHere` from the per-account `agents` list.
8665
- //
8666
- // Wrapped in try/catch so an older CLI without --json (or any other
8667
- // failure) leaves `accounts` undefined — the renderer hides the
8668
- // section gracefully.
8669
- type AccountListItem = {
8670
- label: string
8671
- health: AccountHealth
8672
- subscriptionType?: string
8673
- expiresAt?: number
8674
- quotaExhaustedUntil?: number
8675
- email?: string
8676
- agents: string[]
8677
- /** v0.6.9+: agents for which this label is at index 0 of
8678
- * auth.accounts: (i.e. the post-fanout active for that agent).
8679
- * Optional — older CLIs without the field cause the dashboard to
8680
- * fall back to the v3a unmarked render. */
8681
- primaryForAgents?: string[]
8682
- }
8683
- let accounts: AccountSummary[] | undefined
8684
- let accountsTruncated = false
8685
- try {
8686
- const raw = switchroomExecJson<AccountListItem[]>([
8687
- 'auth', 'account', 'list', '--json',
8688
- ])
8689
- if (Array.isArray(raw)) {
8690
- accounts = raw.map((a) => {
8691
- // Layer per-account quota onto the summary from the in-process
8692
- // cache (warmed by `prefetchAccountQuotaIfStale` below). Sync
8693
- // read keeps `fetchDashboardState` itself sync; the cache TTL
8694
- // (30s) keeps the API-call rate bounded.
8695
- const cached = getCachedAccountQuota(a.label)
8696
- const summary: AccountSummary = {
8697
- label: a.label,
8698
- health: a.health,
8699
- enabledHere: Array.isArray(a.agents) && a.agents.includes(agent),
8700
- ...(Array.isArray(a.primaryForAgents)
8701
- ? { activeForThisAgent: a.primaryForAgents.includes(agent) }
8702
- : {}),
8703
- ...(a.subscriptionType ? { subscriptionType: a.subscriptionType } : {}),
8704
- ...(a.expiresAt != null ? { expiresAt: a.expiresAt } : {}),
8705
- ...(a.quotaExhaustedUntil != null
8706
- ? { quotaExhaustedUntil: a.quotaExhaustedUntil }
8707
- : {}),
8708
- ...(cached?.ok
8709
- ? {
8710
- fiveHourPct: cached.data.fiveHourUtilizationPct,
8711
- sevenDayPct: cached.data.sevenDayUtilizationPct,
8712
- ...(cached.data.fiveHourResetAt
8713
- ? { fiveHourResetAt: cached.data.fiveHourResetAt.getTime() }
8714
- : {}),
8715
- ...(cached.data.sevenDayResetAt
8716
- ? { sevenDayResetAt: cached.data.sevenDayResetAt.getTime() }
8717
- : {}),
8718
- }
8719
- : {}),
8720
- }
8721
- // Fire a background probe if the cache is cold/stale. The
8722
- // current render uses whatever's already cached; the *next*
8723
- // tap of /auth (after the probe completes) sees the fresh
8724
- // numbers. Swallowed errors keep the dashboard responsive even
8725
- // when Anthropic is slow or returns a transient 5xx.
8726
- prefetchAccountQuotaIfStale(a.label)
8727
- return summary
8728
- })
8729
- accountsTruncated = accounts.length > ACCOUNTS_DISPLAY_CAP
8730
- }
8731
- } catch {
8732
- /* leave accounts undefined */
8733
- }
8734
-
8735
- // `canBootstrapShare` decides whether to surface the "🌐 Share to
8736
- // fleet" button when zero accounts exist. We only show it when this
8737
- // agent has slot creds we could promote — otherwise the share verb
8738
- // would fail at the credentials lookup.
8739
- const canBootstrapShare = slots.some(
8740
- (s) => s.health === 'healthy' || s.health === 'active',
8741
- )
8742
-
8743
- // `quotaHot` now considers BOTH per-slot percentages (legacy slot
8744
- // model) and per-account percentages (new account model). Either
8745
- // path lighting up flips the [Fall back now] affordance, so the
8746
- // operator sees the warning whether they're on the legacy or new
8747
- // framework.
8748
- const slotQuotaHot = isQuotaHot(slots)
8749
- const accountQuotaHot = isAccountQuotaHot(accounts)
8750
-
8751
- return {
8752
- agent,
8753
- bankId,
8754
- plan,
8755
- rateLimitTier,
8756
- slots,
8757
- quotaHot: slotQuotaHot || accountQuotaHot,
8758
- generatedAt: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'),
8759
- pendingSessionSlot,
8760
- accounts,
8761
- accountsTruncated,
8762
- canBootstrapShare,
9134
+ await switchroomReply(ctx, reply.text, {
9135
+ html: reply.html,
9136
+ reply_markup: kb,
9137
+ })
9138
+ } else {
9139
+ await switchroomReply(ctx, reply.text, { html: reply.html })
8763
9140
  }
8764
- }
9141
+ })
8765
9142
 
8766
- /**
8767
- * Build the per-account list rendered on the boot/health card (issue
8768
- * #708). Reuses `fetchDashboardState` so the data source matches
8769
- * `/auth` exactly — same cache, same shape. Returns null on any
8770
- * failure so the boot card silently omits the section.
8771
- */
8772
- function loadAccountsForBootCard(agent: string): ReadonlyArray<AccountSummary> | null {
9143
+ // Boot-card auth-row loader (issue #708, RFC H rewire). Queries the
9144
+ // broker for `list-state` and hands the raw shape to the boot card,
9145
+ // which delegates rendering to `renderAuthLine`. Returns null on any
9146
+ // failure so the boot card silently omits the section.
9147
+ async function loadAccountsForBootCard(agent: string): Promise<ListStateData | null> {
8773
9148
  try {
8774
- // Re-hydrate the in-process cache from on-disk snapshots
8775
- // captured by previous gateway lifetimes. Without this, a fresh
8776
- // boot would render the accounts section with empty quota rows
8777
- // until the background prefetch ticks. Best-effort.
8778
- try {
8779
- const labels = switchroomExecJson<Array<{ label?: string }>>([
8780
- 'auth', 'account', 'list', '--json',
8781
- ])
8782
- if (Array.isArray(labels)) {
8783
- hydrateAccountQuotaCacheFromDisk(
8784
- labels.map((l) => l?.label).filter((s): s is string => typeof s === 'string'),
8785
- )
8786
- }
8787
- } catch {
8788
- /* hydrate is best-effort; fall through to live state */
8789
- }
8790
-
8791
- const state = fetchDashboardState(agent)
8792
- if (!state || !state.accounts) return null
8793
- // Show only accounts enabled on this agent — fallback rows on the
8794
- // dashboard are useful, but on the boot card "accounts I'm using"
8795
- // is the right scope.
8796
- const enabled = state.accounts.filter((a) => a.enabledHere)
8797
- return enabled.length > 0 ? enabled : null
8798
- } catch {
9149
+ const client = await getAuthBrokerClient(agent)
9150
+ if (!client) return null
9151
+ return await client.listState()
9152
+ } catch (err) {
9153
+ process.stderr.write(`telegram gateway: boot-card auth probe failed: ${(err as Error)?.message ?? String(err)}\n`)
8799
9154
  return null
8800
9155
  }
8801
9156
  }
@@ -10405,310 +10760,154 @@ async function handleOperatorEventCallback(ctx: Context, data: string): Promise<
10405
10760
  }
10406
10761
  }
10407
10762
 
10763
+ // RFC H §7.3: the legacy dashboard callback dispatcher is gone. This
10764
+ // handler now serves only the `auth:use:<label>` and `auth:refresh`
10765
+ // buttons; any other `auth:*` tap (e.g. from a stale pinned message)
10766
+ // is dismissed with a hint instead of crashing the gateway.
10408
10767
  async function handleAuthDashboardCallback(ctx: Context): Promise<void> {
10409
10768
  const data = ctx.callbackQuery?.data ?? ''
10410
- const senderId = String(ctx.from?.id ?? '')
10411
- const access = loadAccess()
10412
- if (!access.allowFrom.includes(senderId)) {
10413
- await ctx.answerCallbackQuery({ text: 'Not authorized.' }).catch(() => {})
10414
- return
10415
- }
10416
- const action = parseCallbackData(data)
10769
+ const currentAgent = getMyAgentName()
10417
10770
 
10418
- switch (action.kind) {
10419
- case 'refresh': {
10420
- await ctx.answerCallbackQuery({ text: 'Refreshed' }).catch(() => {})
10421
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10422
- return
10423
- }
10424
- case 'reauth': {
10425
- await ctx.answerCallbackQuery({ text: 'Starting reauth…' }).catch(() => {})
10426
- await runSwitchroomAuthCommand(
10427
- ctx,
10428
- action.slot ? ['auth', 'reauth', action.agent, '--slot', action.slot] : ['auth', 'reauth', action.agent],
10429
- `auth reauth ${action.agent}`,
10430
- )
10431
- pendingReauthFlows.set(String(ctx.chat!.id), { agent: action.agent, startedAt: Date.now() })
10432
- return
10433
- }
10434
- case 'add': {
10435
- await ctx.answerCallbackQuery({ text: 'Adding slot…' }).catch(() => {})
10436
- await runSwitchroomAuthCommand(ctx, ['auth', 'add', action.agent], `auth add ${action.agent}`)
10437
- pendingReauthFlows.set(String(ctx.chat!.id), { agent: action.agent, startedAt: Date.now() })
10438
- return
10439
- }
10440
- case 'use': {
10441
- await ctx.answerCallbackQuery({ text: `Switching to ${action.slot}…` }).catch(() => {})
10442
- await runSwitchroomCommand(ctx, ['auth', 'use', action.agent, action.slot], `auth use ${action.agent} ${action.slot}`)
10443
- try { assertSafeAgentName(action.agent) } catch { return }
10444
- await runSwitchroomCommand(ctx, ['agent', 'restart', action.agent], `restart ${action.agent}`)
10445
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10446
- return
10447
- }
10448
- case 'rm': {
10449
- // Two-step confirm — swap the dashboard keyboard for a
10450
- // confirmation keyboard before doing anything destructive.
10451
- await ctx.answerCallbackQuery({ text: `Confirm remove ${action.slot}?` }).catch(() => {})
10452
- try {
10453
- await ctx.editMessageReplyMarkup({ reply_markup: buildRemoveConfirmKeyboard(action.agent, action.slot) })
10454
- } catch { /* ignore */ }
10455
- return
10456
- }
10457
- case 'confirm-rm': {
10458
- await ctx.answerCallbackQuery({ text: `Removing ${action.slot}…` }).catch(() => {})
10459
- const listing = switchroomExecJson<SlotListingFromCli>(['auth', 'list', action.agent, '--json'])
10460
- if (listing) {
10461
- const err = checkRemoveSafety({ ...listing, agent: listing.agent ?? action.agent }, action.slot, false)
10462
- if (err) {
10463
- await switchroomReply(ctx, err)
10464
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10465
- return
10466
- }
10467
- }
10468
- await runSwitchroomCommand(ctx, ['auth', 'rm', action.agent, action.slot], `auth rm ${action.agent} ${action.slot}`)
10469
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10470
- return
10471
- }
10472
- case 'fallback': {
10473
- await ctx.answerCallbackQuery({ text: 'Triggering fallback…' }).catch(() => {})
10474
- const result = await runAutoFallbackCheck({ trigger: 'manual' })
10475
- if (result.kind === 'executed') {
10476
- await switchroomReply(ctx, `✅ Switched <code>${escapeHtmlForTg(result.previousSlot)}</code> → <code>${escapeHtmlForTg(result.newSlot)}</code>.`, { html: true })
10477
- } else if (result.kind === 'exhausted-all') {
10478
- await switchroomReply(ctx, `🚨 All slots quota-exhausted. Tap ➕ Add slot.`, { html: true })
10479
- } else if (result.kind === 'error') {
10480
- await switchroomReply(ctx, `❌ Fallback error: ${escapeHtmlForTg(result.message)}`, { html: true })
10481
- } else {
10482
- await switchroomReply(ctx, `No action: ${escapeHtmlForTg(result.reason)}`, { html: true })
10483
- }
10484
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10485
- return
10486
- }
10487
- case 'restart-flow': {
10488
- // Kill any pending session + restart the same flow (reauth or
10489
- // add-slot) fresh. Exists for the case where the user wants to
10490
- // start over BEFORE the automatic stale-session detection fires
10491
- // (e.g. closed the browser tab, 2FA failed, waited too long).
10492
- await ctx.answerCallbackQuery({ text: `Restarting ${action.slot} flow…` }).catch(() => {})
10493
- // Step 1: cancel any pending session for this agent.
10494
- try {
10495
- await runSwitchroomCommand(ctx, ['auth', 'cancel', action.agent], `auth cancel ${action.agent}`)
10496
- } catch { /* cancel is best-effort */ }
10497
- // Step 2: re-initiate. Slot == 'default' → reauth; else → add-slot.
10498
- // Both paths print the fresh URL + button + ForceReply prompt via
10499
- // runSwitchroomAuthCommand.
10500
- if (action.slot === 'default') {
10501
- await runSwitchroomAuthCommand(ctx, ['auth', 'reauth', action.agent], `auth reauth ${action.agent}`)
10502
- } else {
10503
- await runSwitchroomAuthCommand(ctx, ['auth', 'add', action.agent, '--slot', action.slot], `auth add ${action.agent} --slot ${action.slot}`)
10504
- }
10505
- pendingReauthFlows.set(String(ctx.chat!.id), { agent: action.agent, startedAt: Date.now() })
10771
+ // auth:use:<label> — fleet-wide swap via broker.setActive (same path
10772
+ // /auth use takes from chat). Admin-gated via the broker's own
10773
+ // per-agent admin flag.
10774
+ if (data.startsWith('auth:use:')) {
10775
+ const label = data.slice('auth:use:'.length)
10776
+ if (!label) {
10777
+ try { await ctx.answerCallbackQuery({ text: 'Missing account label.', show_alert: false }) } catch { /* */ }
10506
10778
  return
10507
10779
  }
10508
- case 'usage': {
10509
- await ctx.answerCallbackQuery({ text: 'Fetching quota…' }).catch(() => {})
10510
- const agentDir = resolveAgentDirFromEnv()
10511
- if (!agentDir) {
10512
- await switchroomReply(ctx, 'Quota lookup unavailable: no agent directory.')
10780
+ try {
10781
+ const client = await getAuthBrokerClient(currentAgent)
10782
+ if (!client) {
10783
+ try { await ctx.answerCallbackQuery({ text: 'Broker unreachable.', show_alert: true }) } catch { /* */ }
10513
10784
  return
10514
10785
  }
10786
+ const result = await client.setActive(label)
10515
10787
  try {
10516
- const quota = await fetchQuota({ claudeConfigDir: join(agentDir, '.claude') })
10517
- if (!quota.ok) {
10518
- await switchroomReply(ctx, `<b>Quota:</b> ${escapeHtmlForTg(quota.reason)}`, { html: true })
10519
- } else {
10520
- await switchroomReply(ctx, formatQuotaBlock(quota.data), { html: true })
10521
- }
10522
- } catch (err) {
10523
- await switchroomReply(ctx, `Quota fetch failed: ${escapeHtmlForTg(String(err))}`, { html: true })
10524
- }
10525
- return
10526
- }
10527
- // Account-level toggles (#share-auth-across-the-fleet). Two-stage
10528
- // confirm pattern mirrors `rm`/`confirm-rm` so a stray tap doesn't
10529
- // re-shuffle credentials. The CLI verb is the one source of truth
10530
- // for the YAML mutation + fanout; we only translate the tap into
10531
- // a `runSwitchroomCommand` call and refresh the dashboard.
10532
- case 'account-enable': {
10533
- await ctx.answerCallbackQuery({ text: `Confirm enable ${action.label}?` }).catch(() => {})
10534
- try {
10535
- await ctx.editMessageReplyMarkup({
10536
- reply_markup: buildAccountConfirmKeyboard(action.agent, action.label, 'enable'),
10788
+ await ctx.answerCallbackQuery({
10789
+ text: `Switched fleet → ${result.active} (${result.fanned.length} agents)`,
10790
+ show_alert: false,
10537
10791
  })
10538
- } catch { /* ignore */ }
10539
- return
10540
- }
10541
- case 'account-disable': {
10542
- await ctx.answerCallbackQuery({ text: `Confirm disable ${action.label}?` }).catch(() => {})
10543
- try {
10544
- await ctx.editMessageReplyMarkup({
10545
- reply_markup: buildAccountConfirmKeyboard(action.agent, action.label, 'disable'),
10546
- })
10547
- } catch { /* ignore */ }
10548
- return
10549
- }
10550
- case 'confirm-account-enable': {
10551
- await ctx.answerCallbackQuery({ text: `Enabling ${action.label}…` }).catch(() => {})
10552
- try { assertSafeAgentName(action.agent) } catch { return }
10553
- // CLI does the YAML mutation + per-agent credential fanout. The
10554
- // restart afterwards is what actually loads the new credentials
10555
- // into the running claude process.
10556
- await runSwitchroomCommand(
10557
- ctx,
10558
- ['auth', 'enable', action.label, action.agent],
10559
- `auth enable ${action.label} ${action.agent}`,
10560
- )
10561
- await runSwitchroomCommand(ctx, ['agent', 'restart', action.agent], `restart ${action.agent}`)
10562
- // Account roster changed — drop cached quota so the next
10563
- // dashboard render kicks a fresh probe instead of showing
10564
- // stale numbers (or a zero row for a label that just got added).
10565
- clearAndRewarmAccountQuotas()
10566
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10567
- return
10568
- }
10569
- case 'confirm-account-disable': {
10570
- await ctx.answerCallbackQuery({ text: `Disabling ${action.label}…` }).catch(() => {})
10571
- try { assertSafeAgentName(action.agent) } catch { return }
10572
- await runSwitchroomCommand(
10573
- ctx,
10574
- ['auth', 'disable', action.label, action.agent],
10575
- `auth disable ${action.label} ${action.agent}`,
10576
- )
10577
- // Force restart so claude drops the stale credentials immediately.
10578
- // The CLI's `disable` doesn't auto-restart (it expects the operator
10579
- // to drain manually); the dashboard tap is implicit "I'm done with
10580
- // this account on this agent now," so we restart on their behalf.
10581
- await runSwitchroomCommand(ctx, ['agent', 'restart', action.agent], `restart ${action.agent}`)
10582
- clearAndRewarmAccountQuotas()
10583
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10584
- return
10585
- }
10586
- case 'account-promote': {
10587
- // Two-stage confirm — same UX as enable/disable, just a different
10588
- // verb on the confirm row's callback. The CLI verb does the
10589
- // YAML reorder + fanout. Reachable via the legacy v3b per-row
10590
- // `⤴ Promote` button (callback verb `apr`) — kept for any
10591
- // already-pinned messages that still have it.
10592
- await ctx.answerCallbackQuery({ text: `Confirm promote ${action.label}?` }).catch(() => {})
10792
+ } catch { /* toast may fail on stale tap */ }
10793
+ // Edit the source message to reflect the new active. Leaving
10794
+ // the old keyboard intact would tempt a double-tap; we replace
10795
+ // the text + drop the keyboard so the user has to /auth again
10796
+ // to see fresh state.
10797
+ const msg = ctx.callbackQuery?.message
10798
+ if (msg) {
10799
+ // Wrap in swallowingApiCall per #1075 — stale callback-source
10800
+ // messages (deleted topic, expired) shouldn't crash the swap.
10801
+ await swallowingApiCall(
10802
+ () =>
10803
+ bot.api.editMessageText(
10804
+ msg.chat.id,
10805
+ msg.message_id,
10806
+ `<b>Active account →</b> <code>${escapeHtmlForTg(result.active)}</code>\n` +
10807
+ `<i>Re-mirrored credentials for ${result.fanned.length} agent${result.fanned.length === 1 ? '' : 's'}.</i>\n\n` +
10808
+ `<i>Tap /auth to see updated quota for the new active account.</i>`,
10809
+ { parse_mode: 'HTML' },
10810
+ ),
10811
+ { chat_id: String(msg.chat.id), verb: 'auth:use:edit' },
10812
+ )
10813
+ }
10814
+ } catch (err) {
10815
+ const msg = (err as Error)?.message ?? String(err)
10593
10816
  try {
10594
- await ctx.editMessageReplyMarkup({
10595
- reply_markup: buildAccountPromoteConfirmKeyboard(action.agent, action.label),
10817
+ await ctx.answerCallbackQuery({
10818
+ text: `Switch failed: ${msg.slice(0, 180)}`,
10819
+ show_alert: true,
10596
10820
  })
10597
- } catch { /* ignore */ }
10598
- return
10821
+ } catch { /* */ }
10599
10822
  }
10600
- case 'switch-primary-view': {
10601
- // v3c picker: open a sub-keyboard that lists every non-active
10602
- // account as a one-tap promote target. Direct
10603
- // `confirm-account-promote` callbacks (no second confirm) the
10604
- // picker IS the confirmation surface.
10605
- await ctx.answerCallbackQuery().catch(() => {})
10606
- const state = fetchDashboardState(action.agent)
10607
- const candidates = (state?.accounts ?? [])
10608
- .filter((a) => a.activeForThisAgent !== true)
10609
- .map((a) => ({ label: a.label, health: a.health }))
10610
- if (candidates.length === 0) {
10611
- // No fallbacks to switch to — return to the main board with a
10612
- // toast explaining why.
10613
- await ctx.answerCallbackQuery({ text: 'No fallback accounts to switch to.' }).catch(() => {})
10614
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10823
+ return
10824
+ }
10825
+
10826
+ // auth:refresh — re-render the /auth snapshot in-place with a fresh
10827
+ // live probe. Replaces the message body and rebuilds the keyboard.
10828
+ if (data === 'auth:refresh') {
10829
+ // Freshness throttle: each refresh fan-fires N live api.anthropic.com
10830
+ // probes (one per account, force=true bypasses the 5-min cache).
10831
+ // Without this, a user double-tapping the ↻ button burns through
10832
+ // their account's RPM budget on duplicate work. Cap at one per
10833
+ // AUTH_REFRESH_THROTTLE_MS per (chat, message) pair.
10834
+ const refreshMsg = ctx.callbackQuery?.message
10835
+ if (refreshMsg) {
10836
+ const key = `${refreshMsg.chat.id}:${refreshMsg.message_id}`
10837
+ const lastAtMs = lastAuthRefreshAtMs.get(key) ?? 0
10838
+ const sinceLastMs = Date.now() - lastAtMs
10839
+ if (sinceLastMs < AUTH_REFRESH_THROTTLE_MS) {
10840
+ const waitS = Math.ceil((AUTH_REFRESH_THROTTLE_MS - sinceLastMs) / 1000)
10841
+ try {
10842
+ await ctx.answerCallbackQuery({
10843
+ text: `Just refreshed — try again in ${waitS}s`,
10844
+ show_alert: false,
10845
+ })
10846
+ } catch { /* */ }
10615
10847
  return
10616
10848
  }
10617
- try {
10618
- await ctx.editMessageReplyMarkup({
10619
- reply_markup: buildSwitchPrimaryKeyboard(action.agent, candidates),
10620
- })
10621
- } catch { /* ignore MESSAGE_NOT_MODIFIED */ }
10622
- return
10849
+ lastAuthRefreshAtMs.set(key, Date.now())
10623
10850
  }
10624
- case 'confirm-account-promote': {
10625
- await ctx.answerCallbackQuery({ text: `Promoting ${action.label}…` }).catch(() => {})
10626
- try { assertSafeAgentName(action.agent) } catch { return }
10627
- await runSwitchroomCommand(
10628
- ctx,
10629
- ['auth', 'promote', action.label, action.agent],
10630
- `auth promote ${action.label} ${action.agent}`,
10851
+ try {
10852
+ const client = await getAuthBrokerClient(currentAgent)
10853
+ if (!client) {
10854
+ try { await ctx.answerCallbackQuery({ text: 'Broker unreachable.', show_alert: true }) } catch { /* */ }
10855
+ return
10856
+ }
10857
+ const state = await client.listState()
10858
+ const quotas = await Promise.all(
10859
+ state.accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
10631
10860
  )
10632
- // Promotion changes the active credential — must restart so
10633
- // claude reloads the new primary's tokens.
10634
- await runSwitchroomCommand(ctx, ['agent', 'restart', action.agent], `restart ${action.agent}`)
10635
- clearAndRewarmAccountQuotas()
10636
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10637
- return
10638
- }
10639
- case 'share-fleet': {
10640
- // Bootstrap one-tap: zero accounts exist, this agent has healthy
10641
- // slot creds. Synthesise label="default" so the user gets a
10642
- // sensible starting state in one tap; rename via CLI later.
10643
- await ctx.answerCallbackQuery({ text: 'Sharing to fleet…' }).catch(() => {})
10644
- try { assertSafeAgentName(action.agent) } catch { return }
10645
- await runSwitchroomCommand(
10646
- ctx,
10647
- ['auth', 'share', 'default', '--from-agent', action.agent],
10648
- `auth share default --from-agent ${action.agent}`,
10861
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
10862
+ const { renderAuthSnapshotFormat2, buildSnapshotsFromState, buildSnapshotKeyboard } = await import(
10863
+ '../auth-snapshot-format.js'
10649
10864
  )
10650
- await runSwitchroomCommand(ctx, ['agent', 'restart', action.agent], `restart ${action.agent}`)
10651
- clearAndRewarmAccountQuotas()
10652
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10653
- return
10654
- }
10655
- // v3a: per-account drill-down sub-view handlers.
10656
- case 'account-view': {
10657
- // Drill into the per-account sub-view. Fetch current account state
10658
- // so the sub-view reflects live health, then edit-in-place.
10659
- await ctx.answerCallbackQuery().catch(() => {})
10660
- const state = fetchDashboardState(action.agent)
10661
- const acc = state?.accounts?.find((a) => a.label === action.label)
10662
- if (!acc || !state) {
10663
- await ctx.answerCallbackQuery({ text: `Account "${action.label}" not found.` }).catch(() => {})
10664
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10665
- return
10865
+ const snapshots = buildSnapshotsFromState(state, quotas)
10866
+ const text = renderAuthSnapshotFormat2(snapshots, {
10867
+ tz,
10868
+ now: new Date(),
10869
+ liveProbedAtMs: Date.now(),
10870
+ })
10871
+ const kbRows = buildSnapshotKeyboard(snapshots)
10872
+ const inline_keyboard = kbRows.map((row) =>
10873
+ row.map((b) => {
10874
+ if (b.callbackData) return { text: b.text, callback_data: b.callbackData }
10875
+ if (b.insertText) return { text: b.text, switch_inline_query_current_chat: b.insertText }
10876
+ return { text: b.text, callback_data: 'auth:noop' }
10877
+ }),
10878
+ )
10879
+ const msg = ctx.callbackQuery?.message
10880
+ if (msg) {
10881
+ await swallowingApiCall(
10882
+ () =>
10883
+ bot.api.editMessageText(msg.chat.id, msg.message_id, text, {
10884
+ parse_mode: 'HTML',
10885
+ reply_markup: { inline_keyboard },
10886
+ }),
10887
+ { chat_id: String(msg.chat.id), verb: 'auth:refresh:edit' },
10888
+ )
10666
10889
  }
10667
- const text = buildAccountSubViewText(action.agent, acc)
10668
- const keyboard = buildAccountSubViewKeyboard(action.agent, action.label)
10669
- try {
10670
- await ctx.editMessageText(text, { parse_mode: 'HTML', reply_markup: keyboard, link_preview_options: { is_disabled: true } })
10671
- } catch { /* ignore MESSAGE_NOT_MODIFIED */ }
10672
- return
10673
- }
10674
- case 'account-reauth': {
10675
- // Reauth by account is not wired to a CLI verb in v3a.
10676
- // Surface a toast so the button is visible-but-inert; the full
10677
- // flow lands in v3b when `auth account reauth <label>` exists.
10678
- await ctx.answerCallbackQuery({ text: 'Reauth not yet wired — coming in v3b' }).catch(() => {})
10679
- return
10680
- }
10681
- case 'account-rm': {
10682
- // Two-step confirm — swap sub-view keyboard for remove confirm.
10683
- await ctx.answerCallbackQuery({ text: `Remove ${action.label}?` }).catch(() => {})
10890
+ try { await ctx.answerCallbackQuery({ text: 'Refreshed.', show_alert: false }) } catch { /* */ }
10891
+ } catch (err) {
10892
+ const msg = (err as Error)?.message ?? String(err)
10684
10893
  try {
10685
- await ctx.editMessageReplyMarkup({
10686
- reply_markup: buildAccountRemoveConfirmKeyboard(action.agent, action.label),
10894
+ await ctx.answerCallbackQuery({
10895
+ text: `Refresh failed: ${msg.slice(0, 180)}`,
10896
+ show_alert: true,
10687
10897
  })
10688
- } catch { /* ignore */ }
10689
- return
10690
- }
10691
- case 'account-rm-confirm': {
10692
- await ctx.answerCallbackQuery({ text: `Removing ${action.label}…` }).catch(() => {})
10693
- try { assertSafeAgentName(action.agent) } catch { return }
10694
- await runSwitchroomCommand(
10695
- ctx,
10696
- ['auth', 'account', 'rm', action.label],
10697
- `auth account rm ${action.label}`,
10698
- )
10699
- // Removed account label is gone — drop its cache entry (and any
10700
- // siblings, since `enabledHere` shifts when an agent's account
10701
- // list changes).
10702
- clearAndRewarmAccountQuotas()
10703
- await sendAuthDashboard(ctx, action.agent, { edit: true })
10704
- return
10705
- }
10706
- case 'noop':
10707
- default: {
10708
- await ctx.answerCallbackQuery().catch(() => {})
10709
- return
10898
+ } catch { /* */ }
10710
10899
  }
10900
+ return
10711
10901
  }
10902
+
10903
+ // Unknown auth:* — likely from a too-old message. Dismiss with a
10904
+ // hint pointing at the canonical re-render verb.
10905
+ try {
10906
+ await ctx.answerCallbackQuery({
10907
+ text: 'Unknown auth button. Send /auth for current state.',
10908
+ show_alert: false,
10909
+ })
10910
+ } catch { /* */ }
10712
10911
  }
10713
10912
 
10714
10913
  // /reauth was removed in v0.6.13 — the `/auth` dashboard's
@@ -11119,6 +11318,44 @@ bot.command('issues', async ctx => {
11119
11318
 
11120
11319
  bot.command('usage', async ctx => {
11121
11320
  if (!isAuthorizedSender(ctx)) return
11321
+ // Format 2 path: enumerate every account in the broker's known set,
11322
+ // probe live quota in parallel, render the health-grouped snapshot.
11323
+ // Falls back to the legacy single-agent shape when the broker is
11324
+ // unreachable, since /usage was historically callable against any
11325
+ // agent regardless of fleet state.
11326
+ const currentAgent = getMyAgentName()
11327
+ try {
11328
+ const client = await getAuthBrokerClient(currentAgent)
11329
+ if (client) {
11330
+ const state = await client.listState()
11331
+ if (state.accounts.length > 0) {
11332
+ const quotas = await Promise.all(
11333
+ state.accounts.map((a) => fetchAccountQuota(a.label, { force: true })),
11334
+ )
11335
+ const { renderAuthSnapshotFormat2, buildSnapshotsFromState } = await import(
11336
+ '../auth-snapshot-format.js'
11337
+ )
11338
+ const tz = process.env.SWITCHROOM_TIMEZONE ?? process.env.TZ ?? 'UTC'
11339
+ const snapshots = buildSnapshotsFromState(state, quotas)
11340
+ const text = renderAuthSnapshotFormat2(snapshots, {
11341
+ tz,
11342
+ now: new Date(),
11343
+ liveProbedAtMs: Date.now(),
11344
+ })
11345
+ await switchroomReply(ctx, text, { html: true })
11346
+ return
11347
+ }
11348
+ }
11349
+ } catch (err) {
11350
+ process.stderr.write(
11351
+ `telegram gateway: /usage Format 2 path failed agent=${currentAgent}: ${(err as Error)?.message ?? err}\n`,
11352
+ )
11353
+ // fall through to legacy single-agent path
11354
+ }
11355
+
11356
+ // Legacy single-agent path — kept as a graceful fallback when the
11357
+ // broker is unreachable (post-RFC-H rewire boot timing, broken
11358
+ // socket bind, etc.). Same shape /usage shipped with originally.
11122
11359
  const agentDir = resolveAgentDirFromEnv()
11123
11360
  if (!agentDir) {
11124
11361
  await switchroomReply(ctx, '<b>/usage:</b> cannot resolve agent dir.', { html: true })
@@ -11243,6 +11480,29 @@ bot.on('callback_query:data', async ctx => {
11243
11480
  return
11244
11481
  }
11245
11482
 
11483
+ // RFC E §4.1: drvpick:<verb>:<agent>[:<...>] — folder-picker card taps.
11484
+ // open / enter / back / refresh re-render the card in place;
11485
+ // grant writes an allow_always kernel decision at
11486
+ // doc:gdrive:folder/<id>/** and edits the card to a confirmation.
11487
+ //
11488
+ // Auth gate: the picker grant is an OPERATOR action (mirrors the
11489
+ // `op:`/`vd:`/`vg:` family, not the `apv:` agent-approval shape).
11490
+ // Mirror those patterns — refuse callbacks from anyone outside
11491
+ // `access.allowFrom`. Without this, a group member who isn't in
11492
+ // the operator allowlist could still tap [✅ Allow "<folder>"] on
11493
+ // a card that landed in the group and write an `allow_always`
11494
+ // decision attributed to themselves.
11495
+ if (data.startsWith('drvpick:')) {
11496
+ const access = loadAccess()
11497
+ const senderId = String(ctx.from?.id ?? '')
11498
+ if (!access.allowFrom.includes(senderId)) {
11499
+ await ctx.answerCallbackQuery({ text: 'Not authorized.' })
11500
+ return
11501
+ }
11502
+ await handleFolderPickerCallback(ctx, data, buildFolderPickerDeps())
11503
+ return
11504
+ }
11505
+
11246
11506
  // op:<action>:<encoded-agent> callbacks from operator-events.ts
11247
11507
  // renderOperatorEvent(). Agent name is URL-encoded at emit (issue #24).
11248
11508
  // Actions: dismiss, restart, reauth, swap-slot, add-slot, logs.
@@ -12993,35 +13253,10 @@ void (async () => {
12993
13253
  )
12994
13254
  }
12995
13255
 
12996
- // v0.6.10 boot-warm: kick off a background per-account quota
12997
- // probe for every account in the new auth framework. Without
12998
- // this, the FIRST `/auth` after a restart shows no mini-bars
12999
- // because the in-process cache is cold — the dashboard's lazy
13000
- // prefetch fires the probe but the operator's already-rendered
13001
- // message has empty quota rows. Pre-warming fills the cache
13002
- // before the user can tap.
13003
- //
13004
- // Fire-and-forget per label. Failures (rate limit, network,
13005
- // expired token) leave the cache unset so the dashboard's lazy
13006
- // path retries on the next render — same safety-net contract
13007
- // as available_reactions above.
13008
- try {
13009
- const accountsAtBoot = switchroomExecJson<Array<{ label: string }>>([
13010
- 'auth', 'account', 'list', '--json',
13011
- ])
13012
- if (Array.isArray(accountsAtBoot) && accountsAtBoot.length > 0) {
13013
- for (const a of accountsAtBoot) {
13014
- if (typeof a?.label === 'string') prefetchAccountQuotaIfStale(a.label)
13015
- }
13016
- process.stderr.write(
13017
- `telegram gateway: boot-warmed quota cache for ${accountsAtBoot.length} account(s)\n`,
13018
- )
13019
- }
13020
- } catch (err) {
13021
- process.stderr.write(
13022
- `telegram gateway: boot-warm of account quota cache failed (continuing): ${(err as Error).message}\n`,
13023
- )
13024
- }
13256
+ // RFC H removes the per-account-quota-cache boot-warm: the
13257
+ // auth-broker owns quota state now; the gateway reads it via
13258
+ // `list-state` on demand and renders directly. No in-process
13259
+ // cache to warm.
13025
13260
 
13026
13261
  // #412 boot-cleanup: clear any pre-existing turn-active marker.
13027
13262
  // By definition no turn can be in flight when the gateway just
@@ -13208,23 +13443,6 @@ void (async () => {
13208
13443
  }
13209
13444
  } catch {}
13210
13445
 
13211
- // Auto-fallback on quota exhaustion. Periodically polls
13212
- // the active slot's rate-limit headers; when utilization >= 99.5%
13213
- // or a 429 is observed, marks the slot exhausted, swaps to the
13214
- // next healthy slot via src/auth, restarts the agent, and posts
13215
- // a notification to the owner chat. See telegram-plugin/auto-fallback.ts
13216
- // for the pure decision logic + notification builder.
13217
- //
13218
- // Default poll cadence: every 60 minutes. Set
13219
- // SWITCHROOM_AUTO_FALLBACK_POLL_MS=0 to disable the background
13220
- // poller. Pre-v0.6.12 a manual `/authfallback` typed command
13221
- // also ran the same check; that command was removed in favour
13222
- // of the `/auth` dashboard's Switch primary picker.
13223
- const AUTO_FALLBACK_POLL_MS = Number(process.env.SWITCHROOM_AUTO_FALLBACK_POLL_MS ?? 60 * 60_000)
13224
- if (AUTO_FALLBACK_POLL_MS > 0) {
13225
- setInterval(() => { void runAutoFallbackCheck({ trigger: 'scheduled' }) }, AUTO_FALLBACK_POLL_MS).unref()
13226
- }
13227
-
13228
13446
  // Credit-exhaustion watcher (#348). Reads `<agentDir>/.claude/.claude.json`
13229
13447
  // for `cachedExtraUsageDisabledReason`. Fires a Telegram notification
13230
13448
  // on transition into / out of fatal billing states (out_of_credits,