switchroom 0.5.0 → 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +142 -121
  2. package/bin/autoaccept.exp +29 -6
  3. package/dist/agent-scheduler/index.js +12261 -0
  4. package/dist/cli/autoaccept-poll.js +10 -0
  5. package/dist/cli/switchroom.js +27250 -25324
  6. package/dist/vault/approvals/kernel-server.js +12709 -0
  7. package/dist/vault/broker/server.js +15724 -0
  8. package/package.json +4 -3
  9. package/profiles/_base/start.sh.hbs +133 -0
  10. package/profiles/_shared/telegram-style.md.hbs +3 -3
  11. package/profiles/default/CLAUDE.md +3 -3
  12. package/profiles/default/CLAUDE.md.hbs +2 -2
  13. package/profiles/default/workspace/CLAUDE.md.hbs +9 -0
  14. package/skills/docx/VENDORED.md +1 -1
  15. package/skills/mcp-builder/VENDORED.md +1 -1
  16. package/skills/pdf/VENDORED.md +1 -1
  17. package/skills/pptx/VENDORED.md +1 -1
  18. package/skills/skill-creator/VENDORED.md +1 -1
  19. package/skills/switchroom-architecture/SKILL.md +8 -7
  20. package/skills/switchroom-cli/SKILL.md +23 -15
  21. package/skills/switchroom-health/SKILL.md +7 -7
  22. package/skills/switchroom-install/SKILL.md +36 -39
  23. package/skills/switchroom-manage/SKILL.md +4 -4
  24. package/skills/switchroom-status/SKILL.md +1 -1
  25. package/skills/webapp-testing/VENDORED.md +1 -1
  26. package/skills/xlsx/VENDORED.md +1 -1
  27. package/telegram-plugin/admin-commands/dispatch.test.ts +119 -1
  28. package/telegram-plugin/admin-commands/index.ts +71 -0
  29. package/telegram-plugin/ask-user.ts +1 -0
  30. package/telegram-plugin/card-event-log.ts +138 -0
  31. package/telegram-plugin/dist/bridge/bridge.js +178 -31
  32. package/telegram-plugin/dist/foreman/foreman.js +6875 -6526
  33. package/telegram-plugin/dist/gateway/gateway.js +13862 -11834
  34. package/telegram-plugin/dist/server.js +202 -40
  35. package/telegram-plugin/fleet-state.ts +25 -10
  36. package/telegram-plugin/foreman/foreman.ts +38 -3
  37. package/telegram-plugin/gateway/approval-callback.ts +126 -0
  38. package/telegram-plugin/gateway/approval-card.test.ts +90 -0
  39. package/telegram-plugin/gateway/approval-card.ts +127 -0
  40. package/telegram-plugin/gateway/approvals-commands.ts +126 -0
  41. package/telegram-plugin/gateway/boot-card.ts +31 -6
  42. package/telegram-plugin/gateway/boot-probes.ts +503 -72
  43. package/telegram-plugin/gateway/gateway.ts +822 -94
  44. package/telegram-plugin/gateway/ipc-protocol.ts +34 -1
  45. package/telegram-plugin/gateway/ipc-server.ts +35 -0
  46. package/telegram-plugin/gateway/startup-mutex.ts +110 -2
  47. package/telegram-plugin/hooks/hooks.json +19 -0
  48. package/telegram-plugin/hooks/tool-label-pretool.mjs +216 -0
  49. package/telegram-plugin/hooks/tool-label-stop.mjs +63 -0
  50. package/telegram-plugin/package.json +4 -1
  51. package/telegram-plugin/plugin-logger.ts +20 -1
  52. package/telegram-plugin/progress-card-driver.ts +202 -13
  53. package/telegram-plugin/progress-card.ts +2 -2
  54. package/telegram-plugin/quota-check.ts +1 -0
  55. package/telegram-plugin/registry/subagents-schema.ts +37 -0
  56. package/telegram-plugin/registry/subagents.test.ts +64 -0
  57. package/telegram-plugin/session-tail.ts +58 -5
  58. package/telegram-plugin/shared/bot-runtime.ts +48 -2
  59. package/telegram-plugin/subagent-watcher.ts +139 -7
  60. package/telegram-plugin/tests/_progress-card-harness.ts +4 -0
  61. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +201 -0
  62. package/telegram-plugin/tests/boot-card-probe-target.test.ts +10 -34
  63. package/telegram-plugin/tests/boot-card-render.test.ts +6 -5
  64. package/telegram-plugin/tests/boot-probes.test.ts +558 -0
  65. package/telegram-plugin/tests/card-event-log.test.ts +145 -0
  66. package/telegram-plugin/tests/gateway-startup-mutex.test.ts +102 -0
  67. package/telegram-plugin/tests/ipc-server-validate-inject-inbound.test.ts +134 -0
  68. package/telegram-plugin/tests/progress-card-delay-842.test.ts +160 -0
  69. package/telegram-plugin/tests/quota-check.test.ts +37 -1
  70. package/telegram-plugin/tests/subagent-registry-bugs.test.ts +5 -0
  71. package/telegram-plugin/tests/subagent-watcher-stall-notification.test.ts +104 -1
  72. package/telegram-plugin/tests/subagent-watcher.test.ts +5 -0
  73. package/telegram-plugin/tests/tool-label-sidecar.test.ts +114 -0
  74. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +5 -3
  75. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +10 -0
  76. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +58 -14
  77. package/telegram-plugin/tests/welcome-text.test.ts +57 -0
  78. package/telegram-plugin/tool-label-sidecar.ts +140 -0
  79. package/telegram-plugin/tool-labels.ts +55 -0
  80. package/telegram-plugin/two-zone-card.ts +27 -7
  81. package/telegram-plugin/uat/SETUP.md +160 -0
  82. package/telegram-plugin/uat/assertions.ts +140 -0
  83. package/telegram-plugin/uat/driver.ts +174 -0
  84. package/telegram-plugin/uat/harness.ts +161 -0
  85. package/telegram-plugin/uat/login.ts +134 -0
  86. package/telegram-plugin/uat/port-allocator.ts +71 -0
  87. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +61 -0
  88. package/telegram-plugin/welcome-text.ts +44 -2
  89. package/bin/bridge-watchdog.sh +0 -967
@@ -58,7 +58,8 @@ import { handlePtyPartialPure, type PtyHandlerState } from '../pty-partial-handl
58
58
  import { handleStreamReply } from '../stream-reply-handler.js'
59
59
  import { createChatLock } from '../chat-lock.js'
60
60
  import { createRetryApiCall } from '../retry-api-call.js'
61
- import { installTgPostLogger } from '../shared/bot-runtime.js'
61
+ import { installTgPostLogger, withTgPostTags } from '../shared/bot-runtime.js'
62
+ import { emitCardEvent } from '../card-event-log.js'
62
63
  import { buildAttachmentPath, assertInsideInbox } from '../attachment-path.js'
63
64
  import { createPinManager } from '../progress-card-pin-manager.js'
64
65
  import { createPinWatchdog } from '../progress-card-pin-watchdog.js'
@@ -152,7 +153,7 @@ import {
152
153
  resetSessionAckText as buildResetSessionAckText,
153
154
  TELEGRAM_BASE_COMMANDS,
154
155
  TELEGRAM_SWITCHROOM_COMMANDS,
155
- type AgentMetadata, type AuthSummary,
156
+ type AgentMetadata, type AuthSummary, type StatusProbeRow,
156
157
  } from '../welcome-text.js'
157
158
  import {
158
159
  isContextExhaustionText,
@@ -229,6 +230,7 @@ import type {
229
230
  OperatorEventForward,
230
231
  PtyPartialForward,
231
232
  InboundMessage,
233
+ InjectInboundMessage,
232
234
  } from './ipc-protocol.js'
233
235
  import { writePidFile, clearPidFile } from './pid-file.js'
234
236
  import { acquireStartupLock, releaseStartupLock } from './startup-mutex.js'
@@ -257,7 +259,7 @@ import { StagingMap } from '../secret-detect/staging.js'
257
259
  import { maskToken } from '../secret-detect/mask.js'
258
260
  import { defaultVaultWrite, defaultVaultList } from '../secret-detect/vault-write.js'
259
261
  import { detectSecrets } from '../secret-detect/index.js'
260
- import { ADMIN_COMMAND_NAMES, parseCommandName } from '../admin-commands/index.js'
262
+ import { classifyAdminGate } from '../admin-commands/index.js'
261
263
  import {
262
264
  startSubagentWatcher,
263
265
  type SubagentWatcherHandle,
@@ -303,6 +305,11 @@ import {
303
305
  listGrantsViaBroker,
304
306
  revokeGrantViaBroker,
305
307
  } from '../../src/vault/broker/client.js'
308
+ import {
309
+ approvalRequest,
310
+ approvalConsume,
311
+ approvalRecord,
312
+ } from '../../src/vault/approvals/client.js'
306
313
  import {
307
314
  openTurnsDb,
308
315
  markOrphanedAsRestarted,
@@ -325,6 +332,71 @@ const APPROVED_DIR = join(STATE_DIR, 'approved')
325
332
  const ENV_FILE = join(STATE_DIR, '.env')
326
333
  const INBOX_DIR = join(STATE_DIR, 'inbox')
327
334
 
335
/**
 * Trigger a restart of the agent + gateway pair.
 *
 * Branches on `SWITCHROOM_RUNTIME`:
 *   - `docker`: send `SIGTERM` to PID 1 after a brief delay so in-flight IPC
 *     responses can flush. The container's restart policy is expected to
 *     bring the agent + gateway back up. `targetAgent` is informational
 *     only here — a different agent's container cannot be restarted from
 *     inside our own, so when `SWITCHROOM_AGENT_NAME` is set and differs
 *     from `targetAgent` the request is refused.
 *   - else (legacy systemd): detached `systemctl --user restart` of the
 *     agent unit and the gateway unit. The detach is required so the
 *     systemctl job survives this process being stopped by systemd.
 *
 * @param targetAgent Agent whose units should be restarted. Passed to the
 *   shell as a positional parameter, never interpolated into the command
 *   string, so shell metacharacters in the name are inert.
 * @param reason Free-form tag written to stderr for operator correlation.
 * @param delayMs Delay before the restart fires (default 300). Non-finite
 *   or negative values are treated as 0 so the restart still happens.
 * @returns `true` when a restart was scheduled; `false` when the request
 *   was refused (cross-agent under docker) or the spawn threw synchronously.
 */
function triggerSelfRestart(
  targetAgent: string,
  reason: string,
  delayMs = 300,
): boolean {
  // Guard against NaN / negative delays: `sleep NaN` would fail and the
  // `&&` chain would silently skip the restart.
  const safeDelayMs = Number.isFinite(delayMs) && delayMs > 0 ? delayMs : 0
  const isDocker = process.env.SWITCHROOM_RUNTIME === 'docker'
  const selfAgent = process.env.SWITCHROOM_AGENT_NAME

  if (isDocker) {
    if (selfAgent && targetAgent !== selfAgent) {
      process.stderr.write(
        `telegram gateway: cross-agent restart not supported under docker (target=${targetAgent}, self=${selfAgent}, reason=${reason})\n`,
      )
      return false
    }
    process.stderr.write(
      `telegram gateway: restart-via-SIGTERM-PID1 agent=${targetAgent} reason=${reason} (docker)\n`,
    )
    // unref() so the pending timer never keeps the process alive by itself.
    setTimeout(() => {
      try {
        process.kill(1, 'SIGTERM')
      } catch (err) {
        process.stderr.write(`telegram gateway: SIGTERM PID 1 failed: ${err}\n`)
      }
    }, safeDelayMs).unref()
    return true
  }

  // Legacy systemd path.
  process.stderr.write(
    `telegram gateway: restart-via-systemctl agent=${targetAgent} reason=${reason}\n`,
  )
  try {
    const child = spawn(
      'sh',
      [
        '-c',
        // $1 = delay in seconds, $2 = agent name. Using positional
        // parameters keeps the agent name out of the parsed script text.
        'sleep "$1" && systemctl --user restart "switchroom-$2.service" "switchroom-$2-gateway.service"',
        'sh',
        (safeDelayMs / 1000).toFixed(2),
        targetAgent,
      ],
      { detached: true, stdio: 'ignore' },
    )
    // spawn() reports most failures asynchronously via 'error'; without a
    // listener that event would surface as an uncaught exception.
    child.on('error', (err) => {
      process.stderr.write(`telegram gateway: restart spawn failed for ${targetAgent}: ${err}\n`)
    })
    child.unref()
    return true
  } catch (err) {
    process.stderr.write(`telegram gateway: restart spawn failed for ${targetAgent}: ${err}\n`)
    return false
  }
}
399
+
328
400
  /**
329
401
  * Format the version string shown in the boot-card ack line. Two shapes
330
402
  * matching the deleted greeting card's behavior:
@@ -380,12 +452,53 @@ try {
380
452
  }
381
453
  }
382
454
 
383
// Issue #758: TELEGRAM_BOT_TOKEN may be absent from the environment (for
// example when the agent's .env was never written because bot_token in
// switchroom.yaml is a `vault:` reference). In that case the token is
// materialized at startup through the helper module; per the original
// note, the resolved value is held in process.env only and is never
// written back to disk.
//
// The import guard is deliberately narrow (post-#761 review): only a
// genuinely missing helper module is tolerated. Any other load failure is
// rethrown with its original message so real bugs are not hidden behind
// the legacy "set in .env" hint.
type MaterializeMod = typeof import('../../src/telegram/materialize-bot-token.js')

let materializeMod: MaterializeMod | null = null
try {
  materializeMod = await import('../../src/telegram/materialize-bot-token.js')
} catch (err) {
  const code = (err as NodeJS.ErrnoException | undefined)?.code
  const helperMissing = code === 'ERR_MODULE_NOT_FOUND' || code === 'MODULE_NOT_FOUND'
  if (!helperMissing) {
    // Programming error, side-effect failure during module init, etc.
    throw err
  }
  // Helper genuinely missing: materializeMod stays null, handled below.
}

let TOKEN: string
if (materializeMod === null) {
  // No helper available — fall back to the plain environment variable.
  if (process.env.TELEGRAM_BOT_TOKEN) {
    TOKEN = process.env.TELEGRAM_BOT_TOKEN
  } else {
    process.stderr.write(
      `telegram gateway: TELEGRAM_BOT_TOKEN required\n` +
      `  set in ${ENV_FILE}\n` +
      `  format: TELEGRAM_BOT_TOKEN=123456789:AAH...\n` +
      `  (token-materialization helper not found)\n`,
    )
    process.exit(1)
  }
} else {
  const { materializeBotToken, BotTokenMaterializeError } = materializeMod
  try {
    TOKEN = await materializeBotToken({ agentName: process.env.SWITCHROOM_AGENT_NAME })
  } catch (err) {
    // Expected failure: print the helper's message and exit.
    if (err instanceof BotTokenMaterializeError) {
      process.stderr.write(`telegram gateway: ${err.message}\n`)
      process.exit(1)
    }
    // Anything else is unexpected and must propagate unchanged.
    throw err
  }
}
@@ -954,20 +1067,7 @@ function purgeReactionTracking(key: string): void {
954
1067
  // scheduled, so nobody is waiting on this.
955
1068
  if (activeTurnStartedAt.size === 0 && pendingRestarts.size > 0) {
956
1069
  for (const [agentName, _timestamp] of pendingRestarts.entries()) {
957
- process.stderr.write(`telegram gateway: turn completed, restarting ${agentName} (agent + gateway) now\n`);
958
- try {
959
- spawn(
960
- 'sh',
961
- [
962
- '-c',
963
- // Sleep briefly so our stderr flush lands before systemd kills us.
964
- `sleep 0.3 && systemctl --user restart switchroom-${agentName}.service switchroom-${agentName}-gateway.service`,
965
- ],
966
- { detached: true, stdio: 'ignore' },
967
- ).unref();
968
- } catch (err) {
969
- process.stderr.write(`telegram gateway: restart spawn failed for ${agentName}: ${err}\n`);
970
- }
1070
+ triggerSelfRestart(agentName, 'turn-complete-pending-restart');
971
1071
  pendingRestarts.delete(agentName);
972
1072
  }
973
1073
  }
@@ -1275,6 +1375,17 @@ type PendingVaultOp =
1275
1375
  expiresLabel?: string // human-readable label for confirmation
1276
1376
  description?: string
1277
1377
  awaitingCustomDuration?: boolean // true while waiting for text reply
1378
+ /**
1379
+ * Approval-kernel request_id minted at the wizard confirm step
1380
+ * (MIGRATION.md §2, Phase 1 dual-dispatch — audit-only, advisory).
1381
+ * When set, `vg:generate` ALSO consumes + records an `allow_once`
1382
+ * decision on the kernel; `vg:cancel` records a `deny`. Cards in
1383
+ * flight from before this PR landed have it `undefined` and the
1384
+ * legacy `mintGrantViaBroker` runs alone — no kernel write. After
1385
+ * 1-2 releases the legacy-only branch can be removed (#833 Phase 2
1386
+ * is the enforcing flip).
1387
+ */
1388
+ kernel_request_id?: string
1278
1389
  startedAt: number
1279
1390
  }
1280
1391
  // Issue #228: waiting for confirmation before revoking a grant.
@@ -1300,12 +1411,165 @@ interface DeferredSecret {
1300
1411
  * slug if detection didn't fire.
1301
1412
  */
1302
1413
  suggested_slug: string
1414
+ /**
1415
+ * Approval-kernel request_id minted alongside the bespoke deferred-secret
1416
+ * card (MIGRATION.md §1, Phase 1 dual-dispatch). When set, the
1417
+ * `vd:unlock` / `vd:cancel` callback handler ALSO records the user's
1418
+ * decision on the kernel side via `approvalConsume` + `approvalRecord`,
1419
+ * so the audit log captures the unlock event.
1420
+ *
1421
+ * `undefined` on cards built before this PR landed (in-flight at deploy
1422
+ * time) — the legacy handler runs alone, no kernel record. After ~1-2
1423
+ * releases the legacy-only branch can be removed (separate cleanup PR).
1424
+ */
1425
+ kernel_request_id?: string
1303
1426
  }
1304
1427
  const deferredSecrets = new Map<string, DeferredSecret>()
1428
+
1429
/**
 * Best-effort registration of a deferred-secret unlock with the approval
 * kernel (MIGRATION.md §1).
 *
 * Sends an `unlock` request scoped to `secret:<slug>` on behalf of this
 * gateway's own agent unit (`switchroom-<SWITCHROOM_AGENT_NAME>.service`).
 *
 * @param slug Secret slug used to build the request scope.
 * @param approverSet Approver ids forwarded as `approver_set`.
 * @returns The kernel `request_id` when the request came back `pending`;
 *   `null` when SWITCHROOM_AGENT_NAME is unset, the kernel returned nothing
 *   or a non-pending state, or the call threw. A `null` lets the caller
 *   carry on with the legacy-only path, so the unlock UX never depends on
 *   kernel availability.
 */
async function mintDeferredSecretKernelRequest(
  slug: string,
  approverSet: string[],
): Promise<string | null> {
  const localAgent = process.env.SWITCHROOM_AGENT_NAME
  if (!localAgent) {
    return null
  }

  try {
    const response = await approvalRequest({
      agent_unit: `switchroom-${localAgent}.service`,
      scope: `secret:${slug}`,
      action: 'unlock',
      approver_set: approverSet,
      why: 'Unlock vault to save a deferred secret detected in chat.',
    })
    if (response !== null && response.state === 'pending') {
      return response.request_id
    }
    return null
  } catch (err) {
    // Kernel/broker trouble is logged and swallowed on purpose.
    process.stderr.write(
      `[approval-kernel] mintDeferredSecretKernelRequest failed: ${(err as Error).message}\n`,
    )
    return null
  }
}
1462
+
1463
/**
 * Best-effort write of the user's allow/deny decision for a deferred-secret
 * card to the approval kernel.
 *
 * The request is first consumed; the decision is recorded only when the
 * consume call reports `consumed`. Nothing is thrown to the caller: a
 * missing `request_id` (legacy in-flight card), an unconsumed request, or a
 * kernel error all leave the legacy UX untouched.
 *
 * @param request_id Kernel request id, or `undefined` for legacy cards.
 * @param decision Decision to record.
 * @param granted_by_user_id User id of whoever made the decision.
 * @param approverSet Approver ids forwarded as `approver_set`.
 */
async function recordDeferredSecretKernelDecision(
  request_id: string | undefined,
  decision: 'allow_once' | 'deny',
  granted_by_user_id: number,
  approverSet: string[],
): Promise<void> {
  if (!request_id) {
    return
  }

  try {
    const consumeResult = await approvalConsume(request_id)
    if (consumeResult !== null && consumeResult.consumed) {
      await approvalRecord({
        request_id,
        decision,
        approver_set: approverSet,
        granted_by_user_id,
        ttl_ms: null,
      })
    }
  } catch (err) {
    // Logged and swallowed: recording is advisory, never blocking.
    process.stderr.write(
      `[approval-kernel] recordDeferredSecretKernelDecision failed: ${(err as Error).message}\n`,
    )
  }
}
1305
1492
  function deferredKey(chat_id: string, message_id: number): string {
1306
1493
  return `${chat_id}:${message_id}`
1307
1494
  }
1308
1495
 
1496
/**
 * Best-effort registration of a `/vault grant` wizard confirm step with the
 * approval kernel (MIGRATION.md §2, Phase 1 audit-only dual-dispatch).
 *
 * Sends a `mint` request scoped to `vault:grant:<agentSlug>` on behalf of
 * this gateway's own agent unit. The human-readable `why` summarizes the
 * target agent, the number of selected keys and the TTL. Per the migration
 * notes the kernel verdict is advisory in Phase 1; issue #833 (Phase 2)
 * flips enforcement.
 *
 * @param agentSlug Agent the capability token is being minted for.
 * @param approverSet Approver ids forwarded as `approver_set`.
 * @param selectedKeys Keys chosen in the wizard; only the count is sent.
 * @param ttlSeconds Grant TTL in seconds, or `null` for no expiry.
 * @returns The kernel `request_id` when the request came back `pending`;
 *   `null` when SWITCHROOM_AGENT_NAME is unset, the kernel returned nothing
 *   or a non-pending state, or the call threw — the wizard then proceeds on
 *   the legacy path alone.
 */
async function mintGrantWizardKernelRequest(
  agentSlug: string,
  approverSet: string[],
  selectedKeys: string[],
  ttlSeconds: number | null,
): Promise<string | null> {
  const localAgent = process.env.SWITCHROOM_AGENT_NAME
  if (!localAgent) {
    return null
  }

  try {
    const expiryLabel = ttlSeconds === null ? 'no expiry' : `${ttlSeconds}s TTL`
    const why =
      `Mint capability token for agent "${agentSlug}" — ` +
      `${selectedKeys.length} key(s), ` +
      `${expiryLabel}.`
    const response = await approvalRequest({
      agent_unit: `switchroom-${localAgent}.service`,
      scope: `vault:grant:${agentSlug}`,
      action: 'mint',
      approver_set: approverSet,
      why,
    })
    if (response !== null && response.state === 'pending') {
      return response.request_id
    }
    return null
  } catch (err) {
    // Kernel/broker trouble is logged and swallowed on purpose.
    process.stderr.write(
      `[approval-kernel] mintGrantWizardKernelRequest failed: ${(err as Error).message}\n`,
    )
    return null
  }
}
1541
+
1542
/**
 * Best-effort write of the user's allow/deny decision for a `/vault grant`
 * wizard card to the approval kernel.
 *
 * The request is first consumed; the decision is recorded only when the
 * consume call reports `consumed`. Nothing is thrown to the caller: a
 * missing `request_id` (legacy in-flight wizard), an unconsumed request, or
 * a broker error all leave the legacy UX untouched. Per the migration
 * notes this is audit-only in Phase 1.
 *
 * @param request_id Kernel request id, or `undefined` for legacy wizards.
 * @param decision Decision to record.
 * @param granted_by_user_id User id of whoever made the decision.
 * @param approverSet Approver ids forwarded as `approver_set`.
 */
async function recordGrantWizardKernelDecision(
  request_id: string | undefined,
  decision: 'allow_once' | 'deny',
  granted_by_user_id: number,
  approverSet: string[],
): Promise<void> {
  if (!request_id) {
    return
  }

  try {
    const consumeResult = await approvalConsume(request_id)
    if (consumeResult !== null && consumeResult.consumed) {
      await approvalRecord({
        request_id,
        decision,
        approver_set: approverSet,
        granted_by_user_id,
        ttl_ms: null,
      })
    }
  } catch (err) {
    // Logged and swallowed: recording is advisory, never blocking.
    process.stderr.write(
      `[approval-kernel] recordGrantWizardKernelDecision failed: ${(err as Error).message}\n`,
    )
  }
}
1572
+
1309
1573
  // Channel B context rule — tracks when the gateway has emitted the
1310
1574
  // "Paste the browser code here" prompt so that the next inbound message
1311
1575
  // in the same chat is treated as auth-flow-sensitive regardless of whether
@@ -1440,20 +1704,7 @@ const pendingStateReaper = setInterval(() => {
1440
1704
  `telegram gateway: [restart-drain] forcing agent=${agentName} waited=${waitedSec}s threshold=${Math.round(PENDING_RESTART_DRAIN_CAP_MS / 1000)}s\n`,
1441
1705
  )
1442
1706
  pendingRestarts.delete(agentName)
1443
- try {
1444
- spawn(
1445
- 'sh',
1446
- [
1447
- '-c',
1448
- // The systemctl restart will SIGTERM then SIGKILL after TimeoutStopSec.
1449
- // The currently-running claude process will get SIGKILL via the unit stop.
1450
- `sleep 0.1 && systemctl --user restart switchroom-${agentName}.service switchroom-${agentName}-gateway.service`,
1451
- ],
1452
- { detached: true, stdio: 'ignore' },
1453
- ).unref()
1454
- } catch (err) {
1455
- process.stderr.write(`telegram gateway: [restart-drain] forced restart spawn failed agent=${agentName}: ${err}\n`)
1456
- }
1707
+ triggerSelfRestart(agentName, 'restart-drain-cap-forced', 100)
1457
1708
  }
1458
1709
  }
1459
1710
  }, 60_000)
@@ -1887,6 +2138,7 @@ const ipcServer: IpcServer = createIpcServer({
1887
2138
  restartAgeMs: markerAgeMs,
1888
2139
  loadAccounts: () => loadAccountsForBootCard(agentSlug),
1889
2140
  tmuxSupervisor: process.env.SWITCHROOM_TMUX_SUPERVISOR === '1',
2141
+ dockerMode: process.env.SWITCHROOM_RUNTIME === 'docker',
1890
2142
  }, ackMsgId).then(handle => {
1891
2143
  activeBootCard = handle
1892
2144
  }).catch((err: Error) => {
@@ -2007,27 +2259,19 @@ const ipcServer: IpcServer = createIpcServer({
2007
2259
  const turnInFlight = activeTurnStartedAt.size > 0;
2008
2260
 
2009
2261
  if (!turnInFlight) {
2010
- // No active turn, restart immediately. Cycle both the agent unit and
2011
- // the gateway unit (us) so telegram-plugin code changes always
2012
- // propagate. Send the client response FIRST, then spawn a detached
2013
- // shell to run the combined systemctl restart after a brief delay.
2014
- // The delay ensures the IPC response has flushed before systemd
2015
- // kills us; the detach ensures the systemctl job survives our death.
2262
+ // No active turn, restart immediately. Cycle both the agent and
2263
+ // gateway side-by-side so telegram-plugin code changes always
2264
+ // propagate. Send the client response FIRST, then trigger the
2265
+ // restart after a brief delay so the IPC response has flushed
2266
+ // before we get killed. (Under docker the helper SIGTERM's PID 1;
2267
+ // under systemd it spawns a detached `systemctl restart`.)
2016
2268
  try {
2017
2269
  client.send({
2018
2270
  type: 'schedule_restart_result',
2019
2271
  success: true,
2020
2272
  restartedImmediately: true,
2021
2273
  });
2022
- spawn(
2023
- 'sh',
2024
- [
2025
- '-c',
2026
- `sleep 0.3 && systemctl --user restart switchroom-${agentName}.service switchroom-${agentName}-gateway.service`,
2027
- ],
2028
- { detached: true, stdio: 'ignore' },
2029
- ).unref();
2030
- process.stderr.write(`telegram gateway: scheduled immediate restart of ${agentName} (agent + gateway)\n`);
2274
+ triggerSelfRestart(agentName, 'schedule-restart-immediate');
2031
2275
  } catch (err) {
2032
2276
  client.send({
2033
2277
  type: 'schedule_restart_result',
@@ -2090,6 +2334,31 @@ const ipcServer: IpcServer = createIpcServer({
2090
2334
  handlePtyPartial(msg.text)
2091
2335
  },
2092
2336
 
2337
/**
 * Phase 2 cron-fold-in: relay a synthesized inbound message from the
 * in-agent scheduler sibling to the bridge registered for the named
 * agent. The wrapped `inbound` envelope is forwarded unmodified via
 * `ipcServer.sendToAgent`; per the original note, wire-shape validation
 * happens in ipc-server.ts (validateClientMessage) before this handler.
 *
 * Every call is logged with the agent name, `meta.source`,
 * `meta.prompt_key` (each falling back to 'unknown' when not a string)
 * and the delivery result, so an operator can correlate the agent's
 * transcript turn with the scheduler's audit row.
 */
onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
  // Non-string metadata values are reported as 'unknown' in the log line.
  const labelOf = (value: unknown): string =>
    typeof value === 'string' ? value : 'unknown'

  const promptKey = labelOf(msg.inbound.meta?.prompt_key)
  const source = labelOf(msg.inbound.meta?.source)
  const delivered = ipcServer.sendToAgent(msg.agentName, msg.inbound)

  process.stderr.write(
    `telegram gateway: inject_inbound agent=${msg.agentName} source=${source} prompt_key=${promptKey} delivered=${delivered}\n`,
  )
},
2361
+
2093
2362
  log: (msg) => process.stderr.write(`telegram gateway: ipc — ${msg}\n`),
2094
2363
  })
2095
2364
 
@@ -4702,12 +4971,18 @@ async function handleInbound(
4702
4971
  // the post-context flow stays seamless.
4703
4972
  const dKey = deferredKey(chat_id, msgId ?? 0)
4704
4973
  const cachedBranchDetection = detectSecrets(effectiveText).find((d) => d.confidence === 'high' && !d.suppressed)
4974
+ const cachedBranchSlug = cachedBranchDetection?.suggested_slug ?? (isAuthFlowContext ? 'anthropic_oauth_code' : 'secret')
4975
+ const cachedBranchKernelId = await mintDeferredSecretKernelRequest(
4976
+ cachedBranchSlug,
4977
+ loadAccess().allowFrom,
4978
+ )
4705
4979
  deferredSecrets.set(dKey, {
4706
4980
  chat_id,
4707
4981
  original_message_id: msgId ?? 0,
4708
4982
  text: effectiveText,
4709
4983
  staged_at: Date.now(),
4710
- suggested_slug: cachedBranchDetection?.suggested_slug ?? (isAuthFlowContext ? 'anthropic_oauth_code' : 'secret'),
4984
+ suggested_slug: cachedBranchSlug,
4985
+ kernel_request_id: cachedBranchKernelId ?? undefined,
4711
4986
  })
4712
4987
  await switchroomReply(
4713
4988
  ctx,
@@ -4748,12 +5023,17 @@ async function handleInbound(
4748
5023
  highConfDetection?.suggested_slug
4749
5024
  ?? (isAuthFlowContext ? 'anthropic_oauth_code' : 'secret')
4750
5025
  const dKey = deferredKey(chat_id, msgId ?? 0)
5026
+ const noPassKernelId = await mintDeferredSecretKernelRequest(
5027
+ suggestedSlug,
5028
+ loadAccess().allowFrom,
5029
+ )
4751
5030
  deferredSecrets.set(dKey, {
4752
5031
  chat_id,
4753
5032
  original_message_id: msgId ?? 0,
4754
5033
  text: effectiveText,
4755
5034
  staged_at: Date.now(),
4756
5035
  suggested_slug: suggestedSlug,
5036
+ kernel_request_id: noPassKernelId ?? undefined,
4757
5037
  })
4758
5038
  if (msgId != null) {
4759
5039
  try { await bot.api.deleteMessage(chat_id, msgId) } catch {}
@@ -5336,6 +5616,47 @@ function resolveSystemdRunPath(): string | null {
5336
5616
  return _systemdRunPath
5337
5617
  }
5338
5618
 
5619
/**
 * Report whether `docker` is usable from this process — the original note
 * ties this to `switchroom update`'s pull-images and recreate-containers
 * steps.
 *
 * Two conditions must both hold:
 *   1. the socket `/var/run/docker.sock` exists on disk, and
 *   2. `docker --version` runs successfully within 2 seconds.
 *
 * The result is memoized in `_dockerReachable` for the life of the
 * process; use `_resetDockerReachableCache()` to force a fresh probe.
 */
let _dockerReachable: boolean | undefined
function isDockerReachable(): boolean {
  if (_dockerReachable !== undefined) {
    return _dockerReachable
  }

  let reachable = false
  // The socket check is cheap, so it runs first; the child process is only
  // spawned when the socket is actually present.
  if (existsSync('/var/run/docker.sock')) {
    try {
      // `--version` only exercises the client binary. The timeout bounds
      // the probe in case the binary exists but blocks.
      execSync('docker --version', { stdio: 'ignore', timeout: 2000 })
      reachable = true
    } catch {
      reachable = false
    }
  }

  _dockerReachable = reachable
  return reachable
}

// @internal exported for tests — clears the memoized probe result so the
// next isDockerReachable() call probes again.
export function _resetDockerReachableCache(): void {
  _dockerReachable = undefined
}
5659
+
5339
5660
  function spawnSwitchroomDetached(
5340
5661
  args: string[],
5341
5662
  onFailure?: (info: { code: number; tail: string }) => void,
@@ -5748,28 +6069,35 @@ async function runSwitchroomCommandFormatted(ctx: Context, args: string[], label
5748
6069
  }
5749
6070
 
5750
6071
  // ─── Admin-command gating middleware ─────────────────────────────────────
5751
- // When AGENT_ADMIN=false (default), admin slash commands like /agents, /logs,
5752
- // /restart etc. should fall through to Claude rather than being executed
5753
- // locally. Grammy's bot.command() handlers fire BEFORE bot.on('message:text'),
5754
- // so without this middleware the commands would silently execute (or no-op
5755
- // due to isAuthorizedSender) and never reach handleInboundCoalesced.
6072
+ // When AGENT_ADMIN=false (default), admin slash commands (/agents, /logs,
6073
+ // /grant, etc.) must NOT execute locally — this agent isn't admin-flagged
6074
+ // and routing them through Claude burns tokens for no benefit. Reply with a
6075
+ // concise "admin required" warning instead.
5756
6076
  //
5757
- // Middleware registered BEFORE bot.command() calls intercepts text messages
5758
- // first. If admin gating is off and the command is in ADMIN_COMMAND_NAMES, we
5759
- // redirect to handleInboundCoalesced so Claude sees the message.
6077
+ // Special case: `/restart` with no arg, or `/restart <my-agent-name>`, is
6078
+ // allowed to fall through to the local bot.command('restart', …) handler so
6079
+ // every agent can self-restart without admin privilege. `/restart <other>`
6080
+ // is blocked just like any other admin verb.
5760
6081
  //
5761
6082
  // Invariant: when AGENT_ADMIN=true, this middleware is a no-op — bot.command()
5762
- // handlers run normally and Claude never sees admin commands.
6083
+ // handlers run normally for all admin verbs and Claude never sees them.
5763
6084
  bot.use(async (ctx, next) => {
5764
6085
  if (!AGENT_ADMIN && ctx.message?.text) {
5765
- const cmd = parseCommandName(ctx.message.text)
5766
- if (cmd !== null && ADMIN_COMMAND_NAMES.has(cmd)) {
5767
- // Redirect admin command text to Claude via the normal inbound path.
5768
- // We intentionally do NOT call next() so bot.command() never fires.
6086
+ const myName = getMyAgentName()
6087
+ const decision = classifyAdminGate(ctx.message.text, myName)
6088
+ if (decision.action === 'block') {
6089
+ // Block admin commands the LLM should never see. Reply with a concise
6090
+ // "admin required" warning instead of forwarding to Claude.
5769
6091
  process.stderr.write(
5770
- `telegram gateway: admin-gate redirect cmd=/${cmd} agent=${process.env.SWITCHROOM_AGENT_NAME ?? '-'} (AGENT_ADMIN=false)\n`,
6092
+ `telegram gateway: admin-gate blocked cmd=/${decision.cmd} agent=${process.env.SWITCHROOM_AGENT_NAME ?? '-'} reason=${decision.reason} (AGENT_ADMIN=false)\n`,
5771
6093
  )
5772
- await handleInboundCoalesced(ctx, ctx.message.text, undefined)
6094
+ const cmdHtml = escapeHtmlForTg(`/${decision.cmd}`)
6095
+ const nameHtml = escapeHtmlForTg(myName)
6096
+ const text =
6097
+ decision.reason === 'other-agent'
6098
+ ? `⚠️ <code>${cmdHtml}</code> targeting another agent is an admin operation — this agent (<code>${nameHtml}</code>) isn't admin-flagged. Run it from an admin agent, or set <code>admin: true</code> for this agent in switchroom.yaml. (Self-restart is allowed: send <code>/restart</code> with no arg.)`
6099
+ : `⚠️ <code>${cmdHtml}</code> is an admin command — this agent (<code>${nameHtml}</code>) isn't admin-flagged. Run it from an admin agent, or set <code>admin: true</code> for this agent in switchroom.yaml.`
6100
+ await switchroomReply(ctx, text, { html: true })
5773
6101
  return
5774
6102
  }
5775
6103
  }
@@ -5848,7 +6176,7 @@ function buildAgentAudit(agentName: string): AgentAudit | undefined {
5848
6176
  // to `switchroom agent list --json` and `switchroom auth status --json`.
5849
6177
  // Best-effort — any missing piece renders as a placeholder in the text
5850
6178
  // templates rather than blocking the reply.
5851
- function buildAgentMetadata(agentName: string): AgentMetadata {
6179
+ async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
5852
6180
  type AgentListResp = {
5853
6181
  agents: Array<{
5854
6182
  name: string; status: string; uptime: string;
@@ -5885,9 +6213,66 @@ function buildAgentMetadata(agentName: string): AgentMetadata {
5885
6213
  status: a?.status ?? null,
5886
6214
  auth: authSummary,
5887
6215
  audit: buildAgentAudit(agentName),
6216
+ live: await buildLiveProbeRows(agentName),
5888
6217
  }
5889
6218
  }
5890
6219
 
6220
+ /**
6221
+ * Run the boot-card probe set on demand for `/status`. Same probes,
6222
+ * different rendering contract: `/status` shows every row (silent-when-
6223
+ * healthy is for the boot card; the user explicitly asked for current
6224
+ * state here). Failures are swallowed per-probe via runAllProbes's
6225
+ * Promise.allSettled, and we filter out anything we couldn't render so
6226
+ * the reply doesn't break on a broken probe.
6227
+ */
6228
+ async function buildLiveProbeRows(agentName: string): Promise<StatusProbeRow[]> {
6229
+ try {
6230
+ const { runAllProbes } = await import('./boot-card.js')
6231
+ const agentDir = resolveAgentDirFromEnv()
6232
+ ?? (process.env.TELEGRAM_STATE_DIR
6233
+ ? require('path').dirname(process.env.TELEGRAM_STATE_DIR)
6234
+ : '/tmp')
6235
+ const probes = await runAllProbes({
6236
+ agentName,
6237
+ agentSlug: agentName,
6238
+ version: formatBootVersion(),
6239
+ agentDir,
6240
+ gatewayInfo: { pid: process.pid, startedAtMs: GATEWAY_STARTED_AT_MS },
6241
+ tmuxSupervisor: process.env.SWITCHROOM_TMUX_SUPERVISOR === '1',
6242
+ dockerMode: process.env.SWITCHROOM_RUNTIME === 'docker',
6243
+ })
6244
+ const rows: StatusProbeRow[] = []
6245
+ // Render order matches the boot card's PROBE_KEYS so the two
6246
+ // surfaces tell the same story in the same order.
6247
+ const order = ['account', 'agent', 'gateway', 'quota', 'hindsight',
6248
+ 'scheduler', 'broker', 'kernel', 'skills'] as const
6249
+ for (const k of order) {
6250
+ const r = probes[k]
6251
+ if (!r) continue
6252
+ rows.push({ status: r.status, label: r.label, detail: r.detail })
6253
+ }
6254
+ return rows
6255
+ } catch (err: unknown) {
6256
+ process.stderr.write(
6257
+ `telegram gateway: /status: probe gathering failed: ${
6258
+ (err as Error)?.message ?? String(err)
6259
+ }\n`,
6260
+ )
6261
+ return []
6262
+ }
6263
+ }
6264
+
6265
+ // RFC B §9: register /approvals list|revoke against the approval kernel.
6266
+ // The kernel's IPC client (`src/vault/approvals/client.ts`) round-trips
6267
+ // through the vault broker — same socket, no new daemon. The isApprover
6268
+ // gate reuses the existing dmCommandGate / allowFrom pattern.
6269
+ {
6270
+ const { registerApprovalsCommands } = await import('./approvals-commands.js')
6271
+ registerApprovalsCommands(bot, {
6272
+ isApprover: ctx => dmCommandGate(ctx) !== null,
6273
+ })
6274
+ }
6275
+
5891
6276
  bot.command('start', async ctx => {
5892
6277
  // dmCommandGate (#894 backport): silent drop on disabled or
5893
6278
  // non-allowlisted senders so the bot doesn't leak its existence.
@@ -5912,7 +6297,7 @@ bot.command('status', async ctx => {
5912
6297
  const from = ctx.from!
5913
6298
  if (access.allowFrom.includes(senderId)) {
5914
6299
  const userTag = from.username ? `@${from.username}` : senderId
5915
- const meta = buildAgentMetadata(getMyAgentName())
6300
+ const meta = await buildAgentMetadata(getMyAgentName())
5916
6301
  await ctx.reply(buildStatusPairedText({ user: userTag, meta }), { parse_mode: 'HTML' })
5917
6302
  return
5918
6303
  }
@@ -6123,6 +6508,168 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
6123
6508
  bot.command('new', async ctx => handleNewOrResetCommand(ctx, 'new'))
6124
6509
  bot.command('reset', async ctx => handleNewOrResetCommand(ctx, 'reset'))
6125
6510
 
6511
+ // /update — host update from Telegram (#919). Default = dry-run plan
6512
+ // (`switchroom update --check`); explicit `apply` triggers the real
6513
+ // thing via spawnSwitchroomDetached so the gateway can be killed
6514
+ // mid-flight by the recreate-containers step without orphaning the
6515
+ // update. Admin-gated via ADMIN_COMMAND_NAMES.
6516
+ bot.command('update', async ctx => {
6517
+ if (!isAuthorizedSender(ctx)) return
6518
+ const arg = ctx.match?.trim() || ''
6519
+ if (arg === '' || arg === 'check' || arg === '--check') {
6520
+ await runSwitchroomCommand(ctx, ['update', '--check'], 'update --check')
6521
+ await switchroomReply(
6522
+ ctx,
6523
+ 'Reply with <code>/update apply</code> to execute, or <code>/update apply --skip-images</code> to skip the image pull.',
6524
+ { html: true },
6525
+ )
6526
+ return
6527
+ }
6528
+ // Parse `apply` (with optional --skip-images / --rebuild passthrough).
6529
+ // `/update apply` and `/update apply --skip-images` are the supported
6530
+ // forms; everything else surfaces a usage hint.
6531
+ const tokens = arg.split(/\s+/)
6532
+ if (tokens[0] !== 'apply' && tokens[0] !== '--apply') {
6533
+ await switchroomReply(
6534
+ ctx,
6535
+ 'Usage: <code>/update</code> (dry-run) or <code>/update apply [--skip-images] [--rebuild]</code>',
6536
+ { html: true },
6537
+ )
6538
+ return
6539
+ }
6540
+ // Whitelist passthrough flags. Anything outside the allowlist is
6541
+ // refused — operators should not be able to inject arbitrary CLI
6542
+ // args via Telegram (defense in depth even though admin-gated).
6543
+ const ALLOWED_FLAGS = new Set(['--skip-images', '--rebuild'])
6544
+ const passthrough = tokens.slice(1)
6545
+ for (const tok of passthrough) {
6546
+ if (!ALLOWED_FLAGS.has(tok)) {
6547
+ await switchroomReply(
6548
+ ctx,
6549
+ `Refusing to pass unknown flag: <code>${escapeHtmlForTg(tok)}</code>. ` +
6550
+ `Allowed: <code>--skip-images</code>, <code>--rebuild</code>.`,
6551
+ { html: true },
6552
+ )
6553
+ return
6554
+ }
6555
+ }
6556
+ // Docker reachability guard (#926). The gateway runs INSIDE the agent
6557
+ // container, which has the switchroom CLI baked in but no docker
6558
+ // binary and no /var/run/docker.sock mount. So `switchroom update`'s
6559
+ // pull-images and recreate-containers steps would fail with
6560
+ // "docker: command not found". Without this guard, the operator
6561
+ // sees an opaque "❌ update failed (exit 127)" via
6562
+ // notifyDetachedFailure ~5s after the ack.
6563
+ //
6564
+ // Surface a clean explanation instead, pointing them at the host
6565
+ // CLI as the working path. /update (dry-run) does NOT need docker
6566
+ // and is unaffected — only /update apply.
6567
+ if (!isDockerReachable()) {
6568
+ await switchroomReply(
6569
+ ctx,
6570
+ `❌ <b>/update apply</b> needs docker access from inside the agent ` +
6571
+ `container, but it's not available (no <code>docker</code> binary on ` +
6572
+ `PATH, no <code>/var/run/docker.sock</code> mount).\n\n` +
6573
+ `On docker installs, run <code>switchroom update</code> from the ` +
6574
+ `host shell instead.\n\n` +
6575
+ `<i>Tracked as #926 — host-side update daemon would close this gap.</i>`,
6576
+ { html: true },
6577
+ )
6578
+ return
6579
+ }
6580
+ // Debounce vs concurrent self-restart commands (/restart, /new, /reset
6581
+ // and other /update invocations). Reading + writing the SAME restart marker means
6582
+ // a double-tap of /update apply is rejected, AND a /restart fired
6583
+ // mid-update is rejected (and vice versa). 15s window matches the
6584
+ // /restart handler.
6585
+ const existing = readRestartMarker()
6586
+ if (existing && Date.now() - existing.ts < 15_000) {
6587
+ await switchroomReply(
6588
+ ctx,
6589
+ `⏳ Self-restart already in progress (started ${Math.round(
6590
+ (Date.now() - existing.ts) / 1000,
6591
+ )}s ago) — ignoring duplicate.`,
6592
+ { html: true },
6593
+ )
6594
+ return
6595
+ }
6596
+ const chatId = String(ctx.chat!.id)
6597
+ const threadId = resolveThreadId(chatId, ctx.message?.message_thread_id)
6598
+ // Send the ack and capture its message_id so the post-restart
6599
+ // greeting card can edit/reply into the same message. Mirrors the
6600
+ // /restart handler (gateway.ts ~6273) so the boot-card lookup
6601
+ // (gateway.ts ~10393) finds chat_id + ack_message_id in the marker.
6602
+ const ackText =
6603
+ `🚀 <b>update started</b> — running ${[
6604
+ '<code>switchroom update</code>',
6605
+ ...passthrough.map((t) => `<code>${escapeHtmlForTg(t)}</code>`),
6606
+ ].join(' ')}\n` +
6607
+ `\nThe gateway will restart as part of the recreate step; watch ` +
6608
+ `for the post-restart greeting card to confirm completion.`
6609
+ let ackId: number | null = null
6610
+ try {
6611
+ const sent = await lockedBot.api.sendMessage(chatId, ackText, {
6612
+ parse_mode: 'HTML',
6613
+ link_preview_options: { is_disabled: true },
6614
+ ...(threadId != null ? { message_thread_id: threadId } : {}),
6615
+ })
6616
+ ackId = sent.message_id
6617
+ if (HISTORY_ENABLED) {
6618
+ try {
6619
+ recordOutbound({
6620
+ chat_id: chatId,
6621
+ thread_id: threadId ?? null,
6622
+ message_ids: [sent.message_id],
6623
+ texts: [`🚀 update started`],
6624
+ attachment_kinds: [],
6625
+ })
6626
+ } catch {}
6627
+ }
6628
+ } catch {}
6629
+ writeRestartMarker({
6630
+ chat_id: chatId,
6631
+ thread_id: threadId ?? null,
6632
+ ack_message_id: ackId,
6633
+ ts: Date.now(),
6634
+ })
6635
+ // Reason banner for the post-restart greeting card. Without this the
6636
+ // banner falls back to whatever the CLI's clean-shutdown marker
6637
+ // stamped — usually 'unknown' or a docker-compose-restart string.
6638
+ stampUserRestartReason('user: /update from chat')
6639
+ // Unpin progress cards + clear active reactions before we die. The
6640
+ // pinned-progress-card surface is the headline feature per CLAUDE.md;
6641
+ // leaving one pinned across the recreate would surprise the operator.
6642
+ await sweepBeforeSelfRestart()
6643
+ spawnSwitchroomDetached(
6644
+ ['update', ...passthrough],
6645
+ notifyDetachedFailure(chatId, threadId ?? null, 'update'),
6646
+ )
6647
+ })
6648
+
6649
+ // /upgradestatus — read-only snapshot of where this host stands (#927).
6650
+ // Wraps `switchroom update --status` synchronously and posts the
6651
+ // formatted output. NOT admin-gated: read-only fleet metadata is safe
6652
+ // for any allowFrom user to see, and the answer "is something behind?"
6653
+ // is the missing companion to /update's "trigger an update".
6654
+ // (Telegram slash-commands forbid hyphens, hence /upgradestatus not
6655
+ // /upgrade-status. The /upgrade alias just below redirects.)
6656
+ bot.command('upgradestatus', async ctx => {
6657
+ if (!isAuthorizedSender(ctx)) return
6658
+ await runSwitchroomCommand(ctx, ['update', '--status'], 'update --status')
6659
+ })
6660
+ // Alias with hyphen — Grammy doesn't allow hyphens in command names
6661
+ // (Telegram's slash-command grammar excludes them) but operators are
6662
+ // likely to type /upgrade-status; surface a polite redirect.
6663
+ bot.command('upgrade', async ctx => {
6664
+ if (!isAuthorizedSender(ctx)) return
6665
+ await switchroomReply(
6666
+ ctx,
6667
+ 'Did you mean <code>/upgradestatus</code> (no hyphen — Telegram slash-command grammar)? ' +
6668
+ 'Or <code>/update</code> to plan, <code>/update apply</code> to execute.',
6669
+ { html: true },
6670
+ )
6671
+ })
6672
+
6126
6673
  // ─── /approve, /deny, /pending ────────────────────────────────────────────
6127
6674
  // Slash-command alternatives to the inline-button approval flow (useful for
6128
6675
  // desktop-only sessions and power-users). Share pendingPermissions state
@@ -6961,6 +7508,16 @@ async function handleVaultDeferCallback(ctx: Context, data: string): Promise<voi
6961
7508
  const cardMessageId = ctx.callbackQuery?.message?.message_id
6962
7509
 
6963
7510
  if (action === 'cancel') {
7511
+ // Kernel-side dual-dispatch (MIGRATION.md §1): record the deny decision
7512
+ // BEFORE the legacy handler clears state, so the audit log captures it
7513
+ // even if the editMessageText below races with another tap. Best-effort
7514
+ // — broker unreachable falls back to legacy-only.
7515
+ await recordDeferredSecretKernelDecision(
7516
+ deferred.kernel_request_id,
7517
+ 'deny',
7518
+ ctx.from?.id ?? 0,
7519
+ access.allowFrom,
7520
+ )
6964
7521
  deferredSecrets.delete(deferKey)
6965
7522
  await ctx.answerCallbackQuery({ text: 'Discarded.' }).catch(() => {})
6966
7523
  if (cardMessageId != null) {
@@ -6974,6 +7531,18 @@ async function handleVaultDeferCallback(ctx: Context, data: string): Promise<voi
6974
7531
  }
6975
7532
 
6976
7533
  if (action === 'unlock') {
7534
+ // Kernel-side dual-dispatch (MIGRATION.md §1): record the allow_once
7535
+ // decision when the user taps unlock. The actual passphrase capture +
7536
+ // vault write still happens via the legacy path below — the kernel
7537
+ // decision is for audit/state, not secret material (per RFC B). We
7538
+ // record at tap-time rather than after passphrase entry so a kernel
7539
+ // record exists even if the user abandons the passphrase prompt.
7540
+ await recordDeferredSecretKernelDecision(
7541
+ deferred.kernel_request_id,
7542
+ 'allow_once',
7543
+ ctx.from?.id ?? 0,
7544
+ access.allowFrom,
7545
+ )
6977
7546
  // If a passphrase is already cached we can skip straight to the write.
6978
7547
  // Covers the case where the user had unlocked separately between
6979
7548
  // detection and tap.
@@ -7163,12 +7732,43 @@ async function grantWizardConfirm(ctx: Context, chatId: string, state: Extract<P
7163
7732
  const sent = await switchroomReply(ctx, text, { html: true, reply_markup: kb })
7164
7733
  state.wizardMsgId = (sent as unknown as { message_id?: number })?.message_id
7165
7734
  }
7166
- pendingVaultOps.set(chatId, { ...state, step: 'confirm', expiresLabel })
7735
+ // Mint kernel decision row at the confirm step (MIGRATION.md §2,
7736
+ // audit-only Phase 1). We do it here rather than at executeGrantWizard
7737
+ // so a kernel row exists even if the user taps Cancel from the confirm
7738
+ // card — the deny verdict on cancel is then recorded against the same
7739
+ // request_id. If the kernel/broker is unreachable, request_id stays
7740
+ // undefined and the wizard runs legacy-only (no behaviour change).
7741
+ const kernelRequestId = await mintGrantWizardKernelRequest(
7742
+ state.agent!,
7743
+ loadAccess().allowFrom,
7744
+ state.selectedKeys!,
7745
+ state.ttlSeconds ?? null,
7746
+ )
7747
+ pendingVaultOps.set(chatId, {
7748
+ ...state,
7749
+ step: 'confirm',
7750
+ expiresLabel,
7751
+ kernel_request_id: kernelRequestId ?? state.kernel_request_id,
7752
+ })
7167
7753
  }
7168
7754
 
7169
7755
  /** Execute the grant: call broker mint_grant, write token, reply. */
7170
7756
  async function executeGrantWizard(ctx: Context, chatId: string, state: Extract<PendingVaultOp, { kind: 'grant-wizard' }>): Promise<void> {
7171
7757
  pendingVaultOps.delete(chatId)
7758
+ // Kernel-side dual-dispatch (MIGRATION.md §2, audit-only Phase 1):
7759
+ // record the allow_once decision when the user taps Generate. The
7760
+ // legacy `mintGrantViaBroker` below still drives the actual grant
7761
+ // mint + token write — the kernel row is informational, not
7762
+ // enforcing, in Phase 1 (issue #833 will flip to enforcing).
7763
+ // We record at tap-time rather than after mint_grant succeeds so a
7764
+ // kernel row exists even if the legacy mint fails (audit captures
7765
+ // intent regardless of downstream outcome).
7766
+ await recordGrantWizardKernelDecision(
7767
+ state.kernel_request_id,
7768
+ 'allow_once',
7769
+ ctx.from?.id ?? 0,
7770
+ loadAccess().allowFrom,
7771
+ )
7172
7772
  // Defence-in-depth: state.agent flows from callback_data into a path
7173
7773
  // join below. A crafted vg:agent:../../etc payload would produce a
7174
7774
  // path traversal. Validate against the same regex the rest of the
@@ -7316,6 +7916,20 @@ async function handleVaultGrantCallback(ctx: Context, data: string): Promise<voi
7316
7916
 
7317
7917
  // Cancel at any wizard step
7318
7918
  if (data === 'vg:cancel') {
7919
+ // Kernel-side dual-dispatch (MIGRATION.md §2, audit-only Phase 1):
7920
+ // if the user got as far as the confirm step, a kernel request_id
7921
+ // will be on the wizard state — record the deny decision so the
7922
+ // audit log captures the abandonment. No-op if the user cancelled
7923
+ // before the confirm step (or if the kernel was unreachable).
7924
+ const cancelState = pendingVaultOps.get(chatId)
7925
+ if (cancelState && cancelState.kind === 'grant-wizard') {
7926
+ await recordGrantWizardKernelDecision(
7927
+ cancelState.kernel_request_id,
7928
+ 'deny',
7929
+ ctx.from?.id ?? 0,
7930
+ loadAccess().allowFrom,
7931
+ )
7932
+ }
7319
7933
  pendingVaultOps.delete(chatId)
7320
7934
  const msg = ctx.callbackQuery?.message
7321
7935
  if (msg && 'text' in msg) {
@@ -7556,19 +8170,23 @@ async function handleOperatorEventCallback(ctx: Context, data: string): Promise<
7556
8170
  }
7557
8171
  case 'restart': {
7558
8172
  await ctx.answerCallbackQuery({ text: `Restarting ${agent}…` }).catch(() => {})
7559
- try {
7560
- execFileSync('systemctl', ['--user', 'restart', `switchroom-${agent}`], {
7561
- encoding: 'utf-8',
7562
- timeout: 15000,
7563
- stdio: ['ignore', 'pipe', 'pipe'],
7564
- })
8173
+ const ok = triggerSelfRestart(agent, 'inline-button-restart')
8174
+ if (ok) {
7565
8175
  await ctx.reply(`<b>${agent}</b> restart requested.`, { parse_mode: 'HTML' })
7566
8176
  await ctx.editMessageReplyMarkup({ reply_markup: { inline_keyboard: [] } }).catch(() => {})
7567
- } catch (err) {
7568
- // err.message includes concatenated stderr which can contain HTML
7569
- // metacharacters; escape before interpolating into a <pre> block.
7570
- const safeMsg = escapeHtmlForTg((err as Error).message)
7571
- await ctx.reply(`<b>Restart failed for ${agent}:</b>\n<pre>${safeMsg}</pre>`, {
8177
+ } else {
8178
+ // Under docker the helper refuses cross-agent restart; surface
8179
+ // a clear message instead of a silent no-op. Service name in
8180
+ // the generated compose is `agent-<name>` (compose.ts:408);
8181
+ // container_name is `switchroom-<name>` (compose.ts:410).
8182
+ // `docker compose restart` takes a SERVICE name, so we point
8183
+ // the operator at the service.
8184
+ const isDocker = process.env.SWITCHROOM_RUNTIME === 'docker'
8185
+ const detail = isDocker
8186
+ ? `cross-agent restart is not supported under docker. ` +
8187
+ `Restart from the host: <code>docker compose -p switchroom restart agent-${agent}</code>.`
8188
+ : 'restart trigger failed'
8189
+ await ctx.reply(`<b>Restart failed for ${agent}:</b> ${detail}`, {
7572
8190
  parse_mode: 'HTML',
7573
8191
  })
7574
8192
  }
@@ -7582,6 +8200,21 @@ async function handleOperatorEventCallback(ctx: Context, data: string): Promise<
7582
8200
  }
7583
8201
  case 'logs': {
7584
8202
  await ctx.answerCallbackQuery({ text: 'Fetching logs…' }).catch(() => {})
8203
+ // Pick the right log source for the runtime. Under docker, the
8204
+ // gateway is INSIDE the agent container — calling `docker logs`
8205
+ // requires the host's docker socket which is deliberately not
8206
+ // mounted into agent containers. Under systemd, journalctl
8207
+ // works as before. v0.7.2 fixed `case 'restart'` but left this
8208
+ // path systemd-only.
8209
+ const isDocker = process.env.SWITCHROOM_RUNTIME === 'docker'
8210
+ if (isDocker) {
8211
+ await ctx.reply(
8212
+ `<i>Inline log fetch is not available under docker mode (no docker.sock in agent containers). ` +
8213
+ `Run from the host: <code>docker logs --since 30m --tail 30 switchroom-${agent}</code></i>`,
8214
+ { parse_mode: 'HTML' },
8215
+ )
8216
+ return
8217
+ }
7585
8218
  try {
7586
8219
  const out = execFileSync(
7587
8220
  'journalctl',
@@ -8231,17 +8864,11 @@ bot.command('permissions', async ctx => {
8231
8864
  await runSwitchroomCommand(ctx, ['agent', 'permissions', agentName], `permissions ${agentName}`)
8232
8865
  })
8233
8866
 
8234
- bot.command('update', async ctx => {
8235
- if (!isAuthorizedSender(ctx)) return
8236
- await switchroomReply(ctx, '🔄 Running <b>switchroom update</b>… back in ~30 seconds.', { html: true })
8237
- await sweepBeforeSelfRestart()
8238
- const chatId = String(ctx.chat!.id)
8239
- const threadId = resolveThreadId(chatId, ctx.message?.message_thread_id)
8240
- spawnSwitchroomDetached(
8241
- ['update'],
8242
- notifyDetachedFailure(chatId, threadId ?? null, 'update'),
8243
- )
8244
- })
8867
+ // Drive-by cleanup (#927): the dead /update handler that lived here
8868
+ // was a pre-#919 stub. Grammy registers in order so the comprehensive
8869
+ // /update handler at line ~6516 (added in #919, hardened in #924,
8870
+ // docker-guarded in #934) fired first and this one never ran.
8871
+ // Removed to avoid future confusion.
8245
8872
 
8246
8873
  bot.command('version', async ctx => {
8247
8874
  if (!isAuthorizedSender(ctx)) return
@@ -8293,6 +8920,16 @@ bot.on('callback_query:data', async ctx => {
8293
8920
  return
8294
8921
  }
8295
8922
 
8923
+ // RFC B §6.1: apv:<request_id>:<choice>[:<param>] — approval kernel taps.
8924
+ // Routed through the generic kernel handler so any surface that uses
8925
+ // buildApprovalCard inherits consume → record → confirmation UX without
8926
+ // each surface re-implementing it.
8927
+ if (data.startsWith('apv:')) {
8928
+ const { handleApprovalCallback } = await import('./approval-callback.js')
8929
+ await handleApprovalCallback(ctx, data)
8930
+ return
8931
+ }
8932
+
8296
8933
  // op:<action>:<encoded-agent> callbacks from operator-events.ts
8297
8934
  // renderOperatorEvent(). Agent name is URL-encoded at emit (issue #24).
8298
8935
  // Actions: dismiss, restart, reauth, swap-slot, add-slot, logs.
@@ -9391,8 +10028,37 @@ if (streamMode === 'checklist') {
9391
10028
  return { code: 0, description: msg, kind: 'transient' }
9392
10029
  }
9393
10030
 
10031
+ // #842: progress-card first-render gating. Read the per-agent
10032
+ // overrides from switchroom.yaml; fall back to driver defaults
10033
+ // (45000 ms / 0 ms) when absent, unreadable, or not present in the
10034
+ // cascade (defaults → profile → per-agent).
10035
+ let progressCardDelayMs: number | undefined
10036
+ let progressCardDelayMsBackground: number | undefined
10037
+ try {
10038
+ const swConfig = loadSwitchroomConfig()
10039
+ const agentSlugForCfg = process.env.SWITCHROOM_AGENT_NAME
10040
+ const agentCfg = agentSlugForCfg ? swConfig.agents?.[agentSlugForCfg] : undefined
10041
+ const pc = agentCfg?.channels?.telegram?.progress_card
10042
+ if (pc) {
10043
+ if (typeof pc.delay_ms === 'number') progressCardDelayMs = pc.delay_ms
10044
+ if (typeof pc.delay_ms_background === 'number') progressCardDelayMsBackground = pc.delay_ms_background
10045
+ }
10046
+ } catch {
10047
+ // Best-effort — gateway may run in dirs where loadSwitchroomConfig
10048
+ // fails. Driver defaults apply.
10049
+ }
10050
+
9394
10051
  progressDriver = createProgressDriver({
10052
+ ...(progressCardDelayMs != null ? { initialDelayMs: progressCardDelayMs } : {}),
10053
+ ...(progressCardDelayMsBackground != null ? { initialDelayMsBackground: progressCardDelayMsBackground } : {}),
9395
10054
  emit: ({ chatId, threadId, turnKey, html, done, isFirstEmit, replyToMessageId, agentId }) => {
10055
+ // Tag the outbound API calls so `tg-post` log lines carry turnKey
10056
+ // (and cardMessageId when known) — lets us audit days-old session
10057
+ // logs for "did the card render?" / "what edit storms hit it?"
10058
+ // without parsing free-form progress-card traces. (#card-audit-log)
10059
+ const knownCardMessageId = pinMgr.pinnedMessageId(turnKey, agentId)
10060
+ const tgPostTags: Record<string, string | number> = { turnKey }
10061
+ if (knownCardMessageId != null) tgPostTags.cardMessageId = knownCardMessageId
9396
10062
  const args = {
9397
10063
  chat_id: chatId, text: html, done, message_thread_id: threadId,
9398
10064
  lane: 'progress', format: 'html', turnKey,
@@ -9439,7 +10105,7 @@ if (streamMode === 'checklist') {
9439
10105
  // default in a follow-up PR.
9440
10106
  const draftFlagOn = process.env.PROGRESS_CARD_DRAFT_TRANSPORT === '1'
9441
10107
  const draftEligible = draftFlagOn && isDmChatId(chatId) && threadId == null
9442
- handleStreamReply(args, { activeDraftStreams, activeDraftParseModes, suppressPtyPreview }, {
10108
+ withTgPostTags(tgPostTags, () => handleStreamReply(args, { activeDraftStreams, activeDraftParseModes, suppressPtyPreview }, {
9443
10109
  // grammy Bot vs local StreamBotApi — see cast pattern above.
9444
10110
  bot: lockedBot as never, retry: robustApiCall, markdownToHtml, escapeMarkdownV2, repairEscapedWhitespace,
9445
10111
  takeHandoffPrefix: () => '', assertAllowedChat, resolveThreadId, disableLinkPreview: true,
@@ -9466,7 +10132,7 @@ if (streamMode === 'checklist') {
9466
10132
  ...(sendMessageDraftFn != null ? { sendMessageDraft: sendMessageDraftFn } : {}),
9467
10133
  }
9468
10134
  : {}),
9469
- }).then((result) => {
10135
+ })).then((result) => {
9470
10136
  // Successful API call — reset the consecutive-4xx counter.
9471
10137
  progressDriver?.reportApiSuccess(turnKey)
9472
10138
  // #203: progress-card edit is a user-visible signal.
@@ -9929,7 +10595,10 @@ void (async () => {
9929
10595
  const cleanMarkerStale = cleanMarker
9930
10596
  ? !shouldSuppressRecoveryBanner(cleanMarker, nowMs, CLEAN_SHUTDOWN_MAX_AGE_MS)
9931
10597
  : false
9932
- const detailParts: string[] = ['gateway crashed and was auto-restarted by systemd']
10598
+ const supervisor = process.env.SWITCHROOM_RUNTIME === 'docker'
10599
+ ? 'docker compose'
10600
+ : 'systemd'
10601
+ const detailParts: string[] = [`gateway crashed and was auto-restarted by ${supervisor}`]
9933
10602
  if (cleanMarker?.signal) detailParts.push(`prior signal=${cleanMarker.signal}`)
9934
10603
  if (cleanMarkerStale) detailParts.push('clean-shutdown marker stale')
9935
10604
  emitGatewayOperatorEvent({
@@ -9978,6 +10647,7 @@ void (async () => {
9978
10647
  restartAgeMs: markerAgeMs,
9979
10648
  loadAccounts: () => loadAccountsForBootCard(agentSlug),
9980
10649
  tmuxSupervisor: process.env.SWITCHROOM_TMUX_SUPERVISOR === '1',
10650
+ dockerMode: process.env.SWITCHROOM_RUNTIME === 'docker',
9981
10651
  }, ackMsgId)
9982
10652
  activeBootCard = handle
9983
10653
  } catch (err) {
@@ -10051,11 +10721,23 @@ void (async () => {
10051
10721
  // Closes #30 task 4 and the 2026-04-21 lessons-learned loop where
10052
10722
  // IPC flaps falsely triggered the gateway's recovery banner.
10053
10723
  // SWITCHROOM_RESTART_WATCHDOG_POLL_MS=0 disables it.
10724
+ //
10725
+ // Disabled under SWITCHROOM_RUNTIME=docker — the watchdog reads
10726
+ // systemd's NRestarts counter, which doesn't exist for docker
10727
+ // containers. Reading docker's restart count would require
10728
+ // mounting docker.sock into the agent container (a security
10729
+ // regression we explicitly avoid). Container restart visibility
10730
+ // comes from the boot card + gateway boot logs in docker mode.
10054
10731
  const RESTART_WATCHDOG_POLL_MS = Number(
10055
10732
  process.env.SWITCHROOM_RESTART_WATCHDOG_POLL_MS ?? 30_000,
10056
10733
  )
10057
10734
  const watchdogAgentName = process.env.SWITCHROOM_AGENT_NAME
10058
- if (RESTART_WATCHDOG_POLL_MS > 0 && watchdogAgentName) {
10735
+ const watchdogDockerMode = process.env.SWITCHROOM_RUNTIME === 'docker'
10736
+ if (watchdogDockerMode) {
10737
+ process.stderr.write(
10738
+ `telegram gateway: restart-watchdog disabled (SWITCHROOM_RUNTIME=docker; systemd NRestarts unavailable)\n`,
10739
+ )
10740
+ } else if (RESTART_WATCHDOG_POLL_MS > 0 && watchdogAgentName) {
10059
10741
  startRestartWatchdog({
10060
10742
  agentName: watchdogAgentName,
10061
10743
  pollIntervalMs: RESTART_WATCHDOG_POLL_MS,
@@ -10120,6 +10802,52 @@ void (async () => {
10120
10802
  onStall: (agentId, idleMs, description) => {
10121
10803
  progressDriver?.onSubAgentStall(agentId, idleMs, description)
10122
10804
  },
10805
+ // Symmetric to onStall: clear the ⚠ Stalled badge as soon
10806
+ // as the watcher sees JSONL activity return, instead of
10807
+ // waiting on the next render tick to recompute idle ms.
10808
+ onUnstall: (agentId, description) => {
10809
+ progressDriver?.onSubAgentUnstall?.(agentId, description)
10810
+ },
10811
+ // #card-audit-log: symmetric sub_agent_finished surface.
10812
+ // The driver's per-chat shadow knows the parent turnKey and
10813
+ // the registry DB carries the background flag — combine them
10814
+ // into a single audit-log line for retrospective debugging.
10815
+ onFinish: ({ agentId, outcome, toolCount, durationMs }) => {
10816
+ let parentTurnKey = ''
10817
+ let chatId = ''
10818
+ let isBackground = false
10819
+ try {
10820
+ const fleets = progressDriver?.peekAllFleets() ?? []
10821
+ for (const f of fleets) {
10822
+ if (f.fleet.has(agentId)) {
10823
+ parentTurnKey = f.turnKey
10824
+ chatId = f.chatId ?? ''
10825
+ break
10826
+ }
10827
+ }
10828
+ } catch {
10829
+ // peek failures are non-fatal — we still emit the event.
10830
+ }
10831
+ if (turnsDb != null) {
10832
+ try {
10833
+ const row = turnsDb
10834
+ .prepare('SELECT background FROM subagents WHERE jsonl_agent_id = ?')
10835
+ .get(agentId) as { background: number } | undefined
10836
+ if (row != null) isBackground = row.background === 1
10837
+ } catch { /* best-effort */ }
10838
+ }
10839
+ const finalOutcome: 'completed' | 'orphan' | 'background' =
10840
+ isBackground ? 'background' : (outcome === 'completed' ? 'completed' : 'orphan')
10841
+ emitCardEvent({
10842
+ agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
10843
+ chatId,
10844
+ turnKey: parentTurnKey,
10845
+ event: 'finalized',
10846
+ reason: `sub_agent_finished agentId=${agentId} outcome=${finalOutcome} tools=${toolCount}`,
10847
+ subagents: [agentId],
10848
+ durationMs,
10849
+ })
10850
+ },
10123
10851
  })
10124
10852
  process.stderr.write('telegram gateway: subagent-watcher active\n')
10125
10853
  }