npm - switchroom - Versions diffs - 0.5.0 → 0.7.8 - Mend

switchroom 0.5.0 → 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

package/README.md +142 -121
package/bin/autoaccept.exp +29 -6
package/dist/agent-scheduler/index.js +12261 -0
package/dist/cli/autoaccept-poll.js +10 -0
package/dist/cli/switchroom.js +27250 -25324
package/dist/vault/approvals/kernel-server.js +12709 -0
package/dist/vault/broker/server.js +15724 -0
package/package.json +4 -3
package/profiles/_base/start.sh.hbs +133 -0
package/profiles/_shared/telegram-style.md.hbs +3 -3
package/profiles/default/CLAUDE.md +3 -3
package/profiles/default/CLAUDE.md.hbs +2 -2
package/profiles/default/workspace/CLAUDE.md.hbs +9 -0
package/skills/docx/VENDORED.md +1 -1
package/skills/mcp-builder/VENDORED.md +1 -1
package/skills/pdf/VENDORED.md +1 -1
package/skills/pptx/VENDORED.md +1 -1
package/skills/skill-creator/VENDORED.md +1 -1
package/skills/switchroom-architecture/SKILL.md +8 -7
package/skills/switchroom-cli/SKILL.md +23 -15
package/skills/switchroom-health/SKILL.md +7 -7
package/skills/switchroom-install/SKILL.md +36 -39
package/skills/switchroom-manage/SKILL.md +4 -4
package/skills/switchroom-status/SKILL.md +1 -1
package/skills/webapp-testing/VENDORED.md +1 -1
package/skills/xlsx/VENDORED.md +1 -1
package/telegram-plugin/admin-commands/dispatch.test.ts +119 -1
package/telegram-plugin/admin-commands/index.ts +71 -0
package/telegram-plugin/ask-user.ts +1 -0
package/telegram-plugin/card-event-log.ts +138 -0
package/telegram-plugin/dist/bridge/bridge.js +178 -31
package/telegram-plugin/dist/foreman/foreman.js +6875 -6526
package/telegram-plugin/dist/gateway/gateway.js +13862 -11834
package/telegram-plugin/dist/server.js +202 -40
package/telegram-plugin/fleet-state.ts +25 -10
package/telegram-plugin/foreman/foreman.ts +38 -3
package/telegram-plugin/gateway/approval-callback.ts +126 -0
package/telegram-plugin/gateway/approval-card.test.ts +90 -0
package/telegram-plugin/gateway/approval-card.ts +127 -0
package/telegram-plugin/gateway/approvals-commands.ts +126 -0
package/telegram-plugin/gateway/boot-card.ts +31 -6
package/telegram-plugin/gateway/boot-probes.ts +503 -72
package/telegram-plugin/gateway/gateway.ts +822 -94
package/telegram-plugin/gateway/ipc-protocol.ts +34 -1
package/telegram-plugin/gateway/ipc-server.ts +35 -0
package/telegram-plugin/gateway/startup-mutex.ts +110 -2
package/telegram-plugin/hooks/hooks.json +19 -0
package/telegram-plugin/hooks/tool-label-pretool.mjs +216 -0
package/telegram-plugin/hooks/tool-label-stop.mjs +63 -0
package/telegram-plugin/package.json +4 -1
package/telegram-plugin/plugin-logger.ts +20 -1
package/telegram-plugin/progress-card-driver.ts +202 -13
package/telegram-plugin/progress-card.ts +2 -2
package/telegram-plugin/quota-check.ts +1 -0
package/telegram-plugin/registry/subagents-schema.ts +37 -0
package/telegram-plugin/registry/subagents.test.ts +64 -0
package/telegram-plugin/session-tail.ts +58 -5
package/telegram-plugin/shared/bot-runtime.ts +48 -2
package/telegram-plugin/subagent-watcher.ts +139 -7
package/telegram-plugin/tests/_progress-card-harness.ts +4 -0
package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +201 -0
package/telegram-plugin/tests/boot-card-probe-target.test.ts +10 -34
package/telegram-plugin/tests/boot-card-render.test.ts +6 -5
package/telegram-plugin/tests/boot-probes.test.ts +558 -0
package/telegram-plugin/tests/card-event-log.test.ts +145 -0
package/telegram-plugin/tests/gateway-startup-mutex.test.ts +102 -0
package/telegram-plugin/tests/ipc-server-validate-inject-inbound.test.ts +134 -0
package/telegram-plugin/tests/progress-card-delay-842.test.ts +160 -0
package/telegram-plugin/tests/quota-check.test.ts +37 -1
package/telegram-plugin/tests/subagent-registry-bugs.test.ts +5 -0
package/telegram-plugin/tests/subagent-watcher-stall-notification.test.ts +104 -1
package/telegram-plugin/tests/subagent-watcher.test.ts +5 -0
package/telegram-plugin/tests/tool-label-sidecar.test.ts +114 -0
package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +5 -3
package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +10 -0
package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +58 -14
package/telegram-plugin/tests/welcome-text.test.ts +57 -0
package/telegram-plugin/tool-label-sidecar.ts +140 -0
package/telegram-plugin/tool-labels.ts +55 -0
package/telegram-plugin/two-zone-card.ts +27 -7
package/telegram-plugin/uat/SETUP.md +160 -0
package/telegram-plugin/uat/assertions.ts +140 -0
package/telegram-plugin/uat/driver.ts +174 -0
package/telegram-plugin/uat/harness.ts +161 -0
package/telegram-plugin/uat/login.ts +134 -0
package/telegram-plugin/uat/port-allocator.ts +71 -0
package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +61 -0
package/telegram-plugin/welcome-text.ts +44 -2
package/bin/bridge-watchdog.sh +0 -967

package/telegram-plugin/gateway/gateway.ts CHANGED Viewed

@@ -58,7 +58,8 @@ import { handlePtyPartialPure, type PtyHandlerState } from '../pty-partial-handl
 import { handleStreamReply } from '../stream-reply-handler.js'
 import { createChatLock } from '../chat-lock.js'
 import { createRetryApiCall } from '../retry-api-call.js'
-import { installTgPostLogger } from '../shared/bot-runtime.js'
+import { installTgPostLogger, withTgPostTags } from '../shared/bot-runtime.js'
+import { emitCardEvent } from '../card-event-log.js'
 import { buildAttachmentPath, assertInsideInbox } from '../attachment-path.js'
 import { createPinManager } from '../progress-card-pin-manager.js'
 import { createPinWatchdog } from '../progress-card-pin-watchdog.js'
@@ -152,7 +153,7 @@ import {
   resetSessionAckText as buildResetSessionAckText,
   TELEGRAM_BASE_COMMANDS,
   TELEGRAM_SWITCHROOM_COMMANDS,
-  type AgentMetadata, type AuthSummary,
+  type AgentMetadata, type AuthSummary, type StatusProbeRow,
 } from '../welcome-text.js'
 import {
   isContextExhaustionText,
@@ -229,6 +230,7 @@ import type {
   OperatorEventForward,
   PtyPartialForward,
   InboundMessage,
+  InjectInboundMessage,
 } from './ipc-protocol.js'
 import { writePidFile, clearPidFile } from './pid-file.js'
 import { acquireStartupLock, releaseStartupLock } from './startup-mutex.js'
@@ -257,7 +259,7 @@ import { StagingMap } from '../secret-detect/staging.js'
 import { maskToken } from '../secret-detect/mask.js'
 import { defaultVaultWrite, defaultVaultList } from '../secret-detect/vault-write.js'
 import { detectSecrets } from '../secret-detect/index.js'
-import { ADMIN_COMMAND_NAMES, parseCommandName } from '../admin-commands/index.js'
+import { classifyAdminGate } from '../admin-commands/index.js'
 import {
   startSubagentWatcher,
   type SubagentWatcherHandle,
@@ -303,6 +305,11 @@ import {
   listGrantsViaBroker,
   revokeGrantViaBroker,
 } from '../../src/vault/broker/client.js'
+import {
+  approvalRequest,
+  approvalConsume,
+  approvalRecord,
+} from '../../src/vault/approvals/client.js'
 import {
   openTurnsDb,
   markOrphanedAsRestarted,
@@ -325,6 +332,71 @@ const APPROVED_DIR = join(STATE_DIR, 'approved')
 const ENV_FILE = join(STATE_DIR, '.env')
 const INBOX_DIR = join(STATE_DIR, 'inbox')
+/**
+ * Trigger a restart of the agent + gateway pair.
+ *
+ * Branches on `SWITCHROOM_RUNTIME`:
+ *   - `docker`: send `SIGTERM` to PID 1 (tini) after a brief delay so
+ *     in-flight IPC responses flush. tini propagates the signal to its
+ *     children (claude → start.sh → us), the whole tree exits cleanly,
+ *     the container exits, and docker compose's `restart: unless-stopped`
+ *     policy recreates it. This covers BOTH the agent process and the
+ *     gateway plugin (we're a child of claude inside the same container).
+ *     `targetAgent` is informational only here — we can't restart a
+ *     different agent's container from inside our own (no docker.sock).
+ *   - else (legacy systemd): detached `systemctl --user restart` of the
+ *     two units. The detach is required so the systemctl job survives
+ *     us being SIGTERM'd by systemd itself.
+ *
+ * Under docker, a request whose `targetAgent` differs from
+ * `SWITCHROOM_AGENT_NAME` returns false (and logs) so the caller can
+ * surface a "not supported" message. That check only fires when
+ * `SWITCHROOM_AGENT_NAME` is set; with it unset, any target restarts
+ * this container.
+ *
+ * Hardening:
+ *   - `targetAgent` reaches the shell as a positional parameter (`$2`)
+ *     instead of being spliced into the command text, so a name holding
+ *     shell metacharacters cannot change what gets executed.
+ *   - spawn failures are reported on the child's asynchronous 'error'
+ *     event, which the surrounding try/catch cannot observe and which
+ *     would otherwise be an unhandled event; a listener logs them.
+ *   - a non-finite or negative `delayMs` is treated as 0, because
+ *     `sleep NaN` fails and would short-circuit the `&&`, silently
+ *     skipping the restart.
+ *
+ * @param targetAgent Agent whose units are restarted (required, no default).
+ * @param reason Free-form tag written to the stderr log line.
+ * @param delayMs Delay before the restart fires, in milliseconds.
+ * @returns true when the restart was scheduled; false when it was refused
+ *   (cross-agent under docker) or the spawn call threw synchronously.
+ */
+function triggerSelfRestart(
+  targetAgent: string,
+  reason: string,
+  delayMs = 300,
+): boolean {
+  const isDocker = process.env.SWITCHROOM_RUNTIME === 'docker'
+  const selfAgent = process.env.SWITCHROOM_AGENT_NAME
+  const safeDelayMs = Number.isFinite(delayMs) && delayMs > 0 ? delayMs : 0
+  if (isDocker) {
+    if (selfAgent && targetAgent !== selfAgent) {
+      process.stderr.write(
+        `telegram gateway: cross-agent restart not supported under docker (target=${targetAgent}, self=${selfAgent}, reason=${reason})\n`,
+      )
+      return false
+    }
+    process.stderr.write(
+      `telegram gateway: restart-via-SIGTERM-PID1 agent=${targetAgent} reason=${reason} (docker)\n`,
+    )
+    setTimeout(() => {
+      try { process.kill(1, 'SIGTERM') } catch (err) {
+        process.stderr.write(`telegram gateway: SIGTERM PID 1 failed: ${err}\n`)
+      }
+    }, safeDelayMs).unref()
+    return true
+  }
+  // Legacy systemd path.
+  process.stderr.write(
+    `telegram gateway: restart-via-systemctl agent=${targetAgent} reason=${reason}\n`,
+  )
+  try {
+    const child = spawn(
+      'sh',
+      [
+        '-c',
+        // $0 = 'sh', $1 = delay in seconds, $2 = agent name. Quoted
+        // expansions keep the agent name a single inert word.
+        'sleep "$1" && systemctl --user restart "switchroom-$2.service" "switchroom-$2-gateway.service"',
+        'sh',
+        (safeDelayMs / 1000).toFixed(2),
+        targetAgent,
+      ],
+      { detached: true, stdio: 'ignore' },
+    )
+    child.on('error', (err) => {
+      process.stderr.write(`telegram gateway: restart spawn failed for ${targetAgent}: ${err}\n`)
+    })
+    child.unref()
+    return true
+  } catch (err) {
+    process.stderr.write(`telegram gateway: restart spawn failed for ${targetAgent}: ${err}\n`)
+    return false
+  }
+}
 /**
  * Format the version string shown in the boot-card ack line. Two shapes
  * matching the deleted greeting card's behavior:
@@ -380,12 +452,53 @@ try {
   }
 }
-const TOKEN = process.env.TELEGRAM_BOT_TOKEN
-if (!TOKEN) {
+// Issue #758: if TELEGRAM_BOT_TOKEN is not set in env (e.g. agent's .env was
+// never written because bot_token in switchroom.yaml is a `vault:` reference),
+// materialize it from the vault at startup. Resolved value is held in
+// process.env only — never written back to disk.
+//
+// The outer try/catch is narrowed (post-#761 review) to ONLY catch the case
+// where the helper module itself fails to load (ERR_MODULE_NOT_FOUND from the
+// dynamic import). Anything else — including throws from inside
+// materializeBotToken that aren't BotTokenMaterializeError — must propagate
+// with its original message so we don't mask real bugs behind the legacy
+// "set in .env" hint.
+type MaterializeMod = typeof import('../../src/telegram/materialize-bot-token.js')
+let materializeMod: MaterializeMod | null = null
+try {
+  materializeMod = await import('../../src/telegram/materialize-bot-token.js')
+} catch (err) {
+  const code = (err as NodeJS.ErrnoException | undefined)?.code
+  if (code === 'ERR_MODULE_NOT_FOUND' || code === 'MODULE_NOT_FOUND') {
+    // Module genuinely missing — fall through with materializeMod=null and
+    // handle below.
+  } else {
+    // Programming error, side-effect failure during module init, etc.
+    // Propagate the real message rather than masking it.
+    throw err
+  }
+}
+// No initializer on purpose: every branch below either assigns TOKEN,
+// exits the process, or rethrows, so it is always set past this chain.
+let TOKEN: string
+if (materializeMod !== null) {
+  const { materializeBotToken, BotTokenMaterializeError } = materializeMod
+  try {
+    TOKEN = await materializeBotToken({ agentName: process.env.SWITCHROOM_AGENT_NAME })
+  } catch (err) {
+    if (err instanceof BotTokenMaterializeError) {
+      process.stderr.write(`telegram gateway: ${err.message}\n`)
+      process.exit(1)
+    }
+    throw err
+  }
+} else if (process.env.TELEGRAM_BOT_TOKEN) {
+  TOKEN = process.env.TELEGRAM_BOT_TOKEN
+} else {
   process.stderr.write(
     `telegram gateway: TELEGRAM_BOT_TOKEN required\n` +
     `  set in ${ENV_FILE}\n` +
-    `  format: TELEGRAM_BOT_TOKEN=123456789:AAH...\n`,
+    `  format: TELEGRAM_BOT_TOKEN=123456789:AAH...\n` +
+    `  (token-materialization helper not found)\n`,
   )
   process.exit(1)
 }
@@ -954,20 +1067,7 @@ function purgeReactionTracking(key: string): void {
   // scheduled, so nobody is waiting on this.
   if (activeTurnStartedAt.size === 0 && pendingRestarts.size > 0) {
     for (const [agentName, _timestamp] of pendingRestarts.entries()) {
-      process.stderr.write(`telegram gateway: turn completed, restarting ${agentName} (agent + gateway) now\n`);
-      try {
-        spawn(
-          'sh',
-          [
-            '-c',
-            // Sleep briefly so our stderr flush lands before systemd kills us.
-            `sleep 0.3 && systemctl --user restart switchroom-${agentName}.service switchroom-${agentName}-gateway.service`,
-          ],
-          { detached: true, stdio: 'ignore' },
-        ).unref();
-      } catch (err) {
-        process.stderr.write(`telegram gateway: restart spawn failed for ${agentName}: ${err}\n`);
-      }
+      triggerSelfRestart(agentName, 'turn-complete-pending-restart');
       pendingRestarts.delete(agentName);
     }
   }
@@ -1275,6 +1375,17 @@ type PendingVaultOp =
       expiresLabel?: string     // human-readable label for confirmation
       description?: string
       awaitingCustomDuration?: boolean  // true while waiting for text reply
+      /**
+       * Approval-kernel request_id minted at the wizard confirm step
+       * (MIGRATION.md §2, Phase 1 dual-dispatch — audit-only, advisory).
+       * When set, `vg:generate` ALSO consumes + records an `allow_once`
+       * decision on the kernel; `vg:cancel` records a `deny`. Cards in
+       * flight from before this PR landed have it `undefined` and the
+       * legacy `mintGrantViaBroker` runs alone — no kernel write. After
+       * 1-2 releases the legacy-only branch can be removed (#833 Phase 2
+       * is the enforcing flip).
+       */
+      kernel_request_id?: string
       startedAt: number
     }
   // Issue #228: waiting for confirmation before revoking a grant.
@@ -1300,12 +1411,165 @@ interface DeferredSecret {
    * slug if detection didn't fire.
    */
   suggested_slug: string
+  /**
+   * Approval-kernel request_id minted alongside the bespoke deferred-secret
+   * card (MIGRATION.md §1, Phase 1 dual-dispatch). When set, the
+   * `vd:unlock` / `vd:cancel` callback handler ALSO records the user's
+   * decision on the kernel side via `approvalConsume` + `approvalRecord`,
+   * so the audit log captures the unlock event.
+   *
+   * `undefined` on cards built before this PR landed (in-flight at deploy
+   * time) — the legacy handler runs alone, no kernel record. After ~1-2
+   * releases the legacy-only branch can be removed (separate cleanup PR).
+   */
+  kernel_request_id?: string
 }
 const deferredSecrets = new Map<string, DeferredSecret>()
+/**
+ * Mint an approval-kernel decision row for a deferred-secret card
+ * (MIGRATION.md §1). Best-effort: if the kernel/broker is unreachable, we
+ * return null and the caller proceeds with the legacy-only path so the
+ * core unlock UX never depends on kernel availability.
+ *
+ * `agent_unit` is the gateway's agent — the per-agent ACL ships in Docker
+ * Phase 2b. The kernel-server checks the listener's bound socket against
+ * the claimed agent, so passing the local agent name is safe.
+ *
+ * NOTE(review): the scope sent here is `secret:<slug>`, while the
+ * grant-wizard helper's doc refers to a `vault:secret:<slug>`
+ * namespacing — confirm which form the kernel expects.
+ *
+ * @param slug Secret slug used to build the request scope.
+ * @param approverSet Forwarded unchanged as the request's `approver_set`.
+ * @returns The `request_id` of a pending request, or null when
+ *   `SWITCHROOM_AGENT_NAME` is unset, the kernel answered null or a
+ *   non-`pending` state, or the call threw.
+ */
+async function mintDeferredSecretKernelRequest(
+  slug: string,
+  approverSet: string[],
+): Promise<string | null> {
+  const agentName = process.env.SWITCHROOM_AGENT_NAME
+  if (!agentName) return null
+  try {
+    const r = await approvalRequest({
+      agent_unit: `switchroom-${agentName}.service`,
+      scope: `secret:${slug}`,
+      action: 'unlock',
+      approver_set: approverSet,
+      why: 'Unlock vault to save a deferred secret detected in chat.',
+    })
+    if (r === null || r.state !== 'pending') return null
+    return r.request_id
+  } catch (err) {
+    // Narrow rather than cast: a null/undefined rejection would make
+    // `(err as Error).message` throw from inside this catch and turn a
+    // best-effort helper into a rejecting one.
+    const detail = err instanceof Error ? err.message : String(err)
+    process.stderr.write(
+      `[approval-kernel] mintDeferredSecretKernelRequest failed: ${detail}\n`,
+    )
+    return null
+  }
+}
+/**
+ * Record the user's decision (allow/deny) on the approval kernel for a
+ * deferred-secret card. Best-effort and idempotent — a missing
+ * `request_id` (legacy in-flight card) or an unreachable kernel both
+ * silently no-op so the legacy UX is unaffected.
+ *
+ * The request is consumed first; the decision is recorded only when the
+ * consume call reports `consumed`. The record is always written with
+ * `ttl_ms: null`.
+ *
+ * @param request_id Kernel request id stored on the card, if any.
+ * @param decision Verdict to record.
+ * @param granted_by_user_id Id of the user who made the decision.
+ * @param approverSet Forwarded unchanged as the record's `approver_set`.
+ * @returns Resolves once the attempt finishes; failures are logged to
+ *   stderr and never rejected.
+ */
+async function recordDeferredSecretKernelDecision(
+  request_id: string | undefined,
+  decision: 'allow_once' | 'deny',
+  granted_by_user_id: number,
+  approverSet: string[],
+): Promise<void> {
+  if (!request_id) return
+  try {
+    const consumed = await approvalConsume(request_id)
+    if (consumed === null || !consumed.consumed) return
+    await approvalRecord({
+      request_id,
+      decision,
+      approver_set: approverSet,
+      granted_by_user_id,
+      ttl_ms: null,
+    })
+  } catch (err) {
+    // Narrow rather than cast: a null/undefined rejection would make
+    // `(err as Error).message` throw from inside this catch and break
+    // the never-rejects contract.
+    const detail = err instanceof Error ? err.message : String(err)
+    process.stderr.write(
+      `[approval-kernel] recordDeferredSecretKernelDecision failed: ${detail}\n`,
+    )
+  }
+}
 /**
  * Build the `deferredSecrets` map key for a staged message: the chat id
  * and message id joined with a colon.
  */
 function deferredKey(chat_id: string, message_id: number): string {
   return `${chat_id}:${message_id}`
 }
+/**
+ * Mint an approval-kernel decision row for a `/vault grant` wizard
+ * confirm step (MIGRATION.md §2, Phase 1 audit-only dual-dispatch).
+ *
+ * Best-effort: kernel/broker unreachable → returns null and the wizard
+ * proceeds on the legacy `mint_grant` path alone, so the user-facing
+ * grant UX never depends on kernel availability. This is *advisory*
+ * in Phase 1 — the kernel verdict is informational alongside the
+ * legacy `vault_grants` row, not enforcing. Phase 2 (issue #833) flips
+ * enforcement.
+ *
+ * Scope shape `vault:grant:<agent_slug>` mirrors the `vault:secret:<slug>`
+ * namespacing established in #832 / PR #830 — one decision per (agent,
+ * grant-mint) tuple. Action `mint`. Approver-set is the gateway's
+ * allowFrom (same set that gates the wizard callback in the first place).
+ *
+ * @param agentSlug Agent the grant is for; used in the scope and the
+ *   `why` text.
+ * @param approverSet Forwarded unchanged as the request's `approver_set`.
+ * @param selectedKeys Keys chosen in the wizard; only the count is
+ *   reported in the `why` text.
+ * @param ttlSeconds Grant TTL in seconds, or null for no expiry; used
+ *   only in the `why` text.
+ * @returns The `request_id` of a pending request, or null when
+ *   `SWITCHROOM_AGENT_NAME` is unset, the kernel answered null or a
+ *   non-`pending` state, or the call threw.
+ */
+async function mintGrantWizardKernelRequest(
+  agentSlug: string,
+  approverSet: string[],
+  selectedKeys: string[],
+  ttlSeconds: number | null,
+): Promise<string | null> {
+  const agentName = process.env.SWITCHROOM_AGENT_NAME
+  if (!agentName) return null
+  try {
+    const why =
+      `Mint capability token for agent "${agentSlug}" — ` +
+      `${selectedKeys.length} key(s), ` +
+      `${ttlSeconds === null ? 'no expiry' : `${ttlSeconds}s TTL`}.`
+    const r = await approvalRequest({
+      agent_unit: `switchroom-${agentName}.service`,
+      scope: `vault:grant:${agentSlug}`,
+      action: 'mint',
+      approver_set: approverSet,
+      why,
+    })
+    if (r === null || r.state !== 'pending') return null
+    return r.request_id
+  } catch (err) {
+    // Narrow rather than cast: a null/undefined rejection would make
+    // `(err as Error).message` throw from inside this catch and turn a
+    // best-effort helper into a rejecting one.
+    const detail = err instanceof Error ? err.message : String(err)
+    process.stderr.write(
+      `[approval-kernel] mintGrantWizardKernelRequest failed: ${detail}\n`,
+    )
+    return null
+  }
+}
+/**
+ * Record the user's wizard decision (allow/deny) on the approval kernel
+ * for a `/vault grant` wizard card. Best-effort and idempotent — a
+ * missing `request_id` (legacy in-flight wizard) or an unreachable
+ * broker silently no-op so the legacy UX is unaffected. Audit-only in
+ * Phase 1: nothing downstream reads this verdict yet.
+ *
+ * The request is consumed first; the decision is recorded only when the
+ * consume call reports `consumed`. The record is always written with
+ * `ttl_ms: null`.
+ *
+ * @param request_id Kernel request id stored on the wizard state, if any.
+ * @param decision Verdict to record.
+ * @param granted_by_user_id Id of the user who made the decision.
+ * @param approverSet Forwarded unchanged as the record's `approver_set`.
+ * @returns Resolves once the attempt finishes; failures are logged to
+ *   stderr and never rejected.
+ */
+async function recordGrantWizardKernelDecision(
+  request_id: string | undefined,
+  decision: 'allow_once' | 'deny',
+  granted_by_user_id: number,
+  approverSet: string[],
+): Promise<void> {
+  if (!request_id) return
+  try {
+    const consumed = await approvalConsume(request_id)
+    if (consumed === null || !consumed.consumed) return
+    await approvalRecord({
+      request_id,
+      decision,
+      approver_set: approverSet,
+      granted_by_user_id,
+      ttl_ms: null,
+    })
+  } catch (err) {
+    // Narrow rather than cast: a null/undefined rejection would make
+    // `(err as Error).message` throw from inside this catch and break
+    // the never-rejects contract.
+    const detail = err instanceof Error ? err.message : String(err)
+    process.stderr.write(
+      `[approval-kernel] recordGrantWizardKernelDecision failed: ${detail}\n`,
+    )
+  }
+}
 // Channel B context rule — tracks when the gateway has emitted the
 // "Paste the browser code here" prompt so that the next inbound message
 // in the same chat is treated as auth-flow-sensitive regardless of whether
@@ -1440,20 +1704,7 @@ const pendingStateReaper = setInterval(() => {
         `telegram gateway: [restart-drain] forcing agent=${agentName} waited=${waitedSec}s threshold=${Math.round(PENDING_RESTART_DRAIN_CAP_MS / 1000)}s\n`,
       )
       pendingRestarts.delete(agentName)
-      try {
-        spawn(
-          'sh',
-          [
-            '-c',
-            // The systemctl restart will SIGTERM then SIGKILL after TimeoutStopSec.
-            // The currently-running claude process will get SIGKILL via the unit stop.
-            `sleep 0.1 && systemctl --user restart switchroom-${agentName}.service switchroom-${agentName}-gateway.service`,
-          ],
-          { detached: true, stdio: 'ignore' },
-        ).unref()
-      } catch (err) {
-        process.stderr.write(`telegram gateway: [restart-drain] forced restart spawn failed agent=${agentName}: ${err}\n`)
-      }
+      triggerSelfRestart(agentName, 'restart-drain-cap-forced', 100)
     }
   }
 }, 60_000)
@@ -1887,6 +2138,7 @@ const ipcServer: IpcServer = createIpcServer({
             restartAgeMs: markerAgeMs,
             loadAccounts: () => loadAccountsForBootCard(agentSlug),
             tmuxSupervisor: process.env.SWITCHROOM_TMUX_SUPERVISOR === '1',
+            dockerMode: process.env.SWITCHROOM_RUNTIME === 'docker',
           }, ackMsgId).then(handle => {
             activeBootCard = handle
           }).catch((err: Error) => {
@@ -2007,27 +2259,19 @@ const ipcServer: IpcServer = createIpcServer({
     const turnInFlight = activeTurnStartedAt.size > 0;
     if (!turnInFlight) {
-      // No active turn, restart immediately. Cycle both the agent unit and
-      // the gateway unit (us) so telegram-plugin code changes always
-      // propagate. Send the client response FIRST, then spawn a detached
-      // shell to run the combined systemctl restart after a brief delay.
-      // The delay ensures the IPC response has flushed before systemd
-      // kills us; the detach ensures the systemctl job survives our death.
+      // No active turn, restart immediately. Cycle both the agent and
+      // gateway side-by-side so telegram-plugin code changes always
+      // propagate. Send the client response FIRST, then trigger the
+      // restart after a brief delay so the IPC response has flushed
+      // before we get killed. (Under docker the helper SIGTERM's PID 1;
+      // under systemd it spawns a detached `systemctl restart`.)
       try {
         client.send({
           type: 'schedule_restart_result',
           success: true,
           restartedImmediately: true,
         });
-        spawn(
-          'sh',
-          [
-            '-c',
-            `sleep 0.3 && systemctl --user restart switchroom-${agentName}.service switchroom-${agentName}-gateway.service`,
-          ],
-          { detached: true, stdio: 'ignore' },
-        ).unref();
-        process.stderr.write(`telegram gateway: scheduled immediate restart of ${agentName} (agent + gateway)\n`);
+        triggerSelfRestart(agentName, 'schedule-restart-immediate');
       } catch (err) {
         client.send({
           type: 'schedule_restart_result',
@@ -2090,6 +2334,31 @@ const ipcServer: IpcServer = createIpcServer({
     handlePtyPartial(msg.text)
   },
+  /**
+   * Phase 2 cron-fold-in handler: relay an inbound envelope synthesized
+   * by the in-agent scheduler sibling to the bridge registered for
+   * `msg.agentName`. The envelope is passed to `sendToAgent` untouched;
+   * this handler does no validation of its own (per the original note,
+   * wire-shape validation lives in ipc-server.ts:validateClientMessage,
+   * and the scheduler builds the message via `dispatchAsInbound` in
+   * `src/scheduler/dispatch.ts`).
+   *
+   * One stderr line is written per call, carrying `source`, `prompt_key`
+   * and the delivery result, so an operator can match the agent's
+   * transcript turn to the scheduler's audit row. Either meta field is
+   * logged as `unknown` when it is absent or not a string.
+   */
+  onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
+    const { agentName, inbound } = msg
+    const rawPromptKey = inbound.meta?.prompt_key
+    const rawSource = inbound.meta?.source
+    const promptKey = typeof rawPromptKey === 'string' ? rawPromptKey : 'unknown'
+    const source = typeof rawSource === 'string' ? rawSource : 'unknown'
+    const delivered = ipcServer.sendToAgent(agentName, inbound)
+    process.stderr.write(
+      `telegram gateway: inject_inbound agent=${agentName} source=${source} prompt_key=${promptKey} delivered=${delivered}\n`,
+    )
+  },
   log: (msg) => process.stderr.write(`telegram gateway: ipc — ${msg}\n`),
 })
@@ -4702,12 +4971,18 @@ async function handleInbound(
         // the post-context flow stays seamless.
         const dKey = deferredKey(chat_id, msgId ?? 0)
         const cachedBranchDetection = detectSecrets(effectiveText).find((d) => d.confidence === 'high' && !d.suppressed)
+        const cachedBranchSlug = cachedBranchDetection?.suggested_slug ?? (isAuthFlowContext ? 'anthropic_oauth_code' : 'secret')
+        const cachedBranchKernelId = await mintDeferredSecretKernelRequest(
+          cachedBranchSlug,
+          loadAccess().allowFrom,
+        )
         deferredSecrets.set(dKey, {
           chat_id,
           original_message_id: msgId ?? 0,
           text: effectiveText,
           staged_at: Date.now(),
-          suggested_slug: cachedBranchDetection?.suggested_slug ?? (isAuthFlowContext ? 'anthropic_oauth_code' : 'secret'),
+          suggested_slug: cachedBranchSlug,
+          kernel_request_id: cachedBranchKernelId ?? undefined,
         })
         await switchroomReply(
           ctx,
@@ -4748,12 +5023,17 @@ async function handleInbound(
           highConfDetection?.suggested_slug
           ?? (isAuthFlowContext ? 'anthropic_oauth_code' : 'secret')
         const dKey = deferredKey(chat_id, msgId ?? 0)
+        const noPassKernelId = await mintDeferredSecretKernelRequest(
+          suggestedSlug,
+          loadAccess().allowFrom,
+        )
         deferredSecrets.set(dKey, {
           chat_id,
           original_message_id: msgId ?? 0,
           text: effectiveText,
           staged_at: Date.now(),
           suggested_slug: suggestedSlug,
+          kernel_request_id: noPassKernelId ?? undefined,
         })
         if (msgId != null) {
           try { await bot.api.deleteMessage(chat_id, msgId) } catch {}
@@ -5336,6 +5616,47 @@ function resolveSystemdRunPath(): string | null {
   return _systemdRunPath
 }
+/**
+ * Report whether `docker` can be invoked from this process, which
+ * `switchroom update` needs for its pull-images and recreate-containers
+ * steps.
+ *
+ * The gateway runs INSIDE the agent container (cron-fold-in / Phase 4
+ * docker model), which by design ships neither a docker binary nor a
+ * socket mount. Two things are checked: the socket file at
+ * `/var/run/docker.sock` and a working `docker --version`. The result
+ * is true only when both succeed.
+ *
+ * The answer is computed once and memoized in `_dockerReachable`:
+ * docker availability doesn't change at runtime within a single
+ * container generation.
+ */
+let _dockerReachable: boolean | undefined
+function isDockerReachable(): boolean {
+  if (_dockerReachable === undefined) {
+    // Socket check comes first because it is cheap; the execSync cost
+    // is only paid when the mount exists.
+    let reachable = existsSync('/var/run/docker.sock')
+    if (reachable) {
+      try {
+        // `--version` only proves the binary runs; no daemon roundtrip.
+        // The timeout guards against a binary that exists but blocks.
+        execSync('docker --version', { stdio: 'ignore', timeout: 2000 })
+      } catch {
+        reachable = false
+      }
+    }
+    _dockerReachable = reachable
+  }
+  return _dockerReachable
+}
+// @internal exported for tests — clears the memoized result held in
+// `_dockerReachable` so the next isDockerReachable() call re-runs the
+// socket and binary probes. Lets a test swap the underlying state and
+// observe the new probe result.
+export function _resetDockerReachableCache(): void {
+  _dockerReachable = undefined
+}
 function spawnSwitchroomDetached(
   args: string[],
   onFailure?: (info: { code: number; tail: string }) => void,
@@ -5748,28 +6069,35 @@ async function runSwitchroomCommandFormatted(ctx: Context, args: string[], label
 }
 // ─── Admin-command gating middleware ─────────────────────────────────────
-// When AGENT_ADMIN=false (default), admin slash commands like /agents, /logs,
-// /restart etc. should fall through to Claude rather than being executed
-// locally. Grammy's bot.command() handlers fire BEFORE bot.on('message:text'),
-// so without this middleware the commands would silently execute (or no-op
-// due to isAuthorizedSender) and never reach handleInboundCoalesced.
+// When AGENT_ADMIN=false (default), admin slash commands (/agents, /logs,
+// /grant, etc.) must NOT execute locally — this agent isn't admin-flagged
+// and routing them through Claude burns tokens for no benefit. Reply with a
+// concise "admin required" warning instead.
 //
-// Middleware registered BEFORE bot.command() calls intercepts text messages
-// first. If admin gating is off and the command is in ADMIN_COMMAND_NAMES, we
-// redirect to handleInboundCoalesced so Claude sees the message.
+// Special case: `/restart` with no arg, or `/restart <my-agent-name>`, is
+// allowed to fall through to the local bot.command('restart', …) handler so
+// every agent can self-restart without admin privilege. `/restart <other>`
+// is blocked just like any other admin verb.
 //
 // Invariant: when AGENT_ADMIN=true, this middleware is a no-op — bot.command()
-// handlers run normally and Claude never sees admin commands.
+// handlers run normally for all admin verbs and Claude never sees them.
 bot.use(async (ctx, next) => {
   if (!AGENT_ADMIN && ctx.message?.text) {
-    const cmd = parseCommandName(ctx.message.text)
-    if (cmd !== null && ADMIN_COMMAND_NAMES.has(cmd)) {
-      // Redirect admin command text to Claude via the normal inbound path.
-      // We intentionally do NOT call next() so bot.command() never fires.
+    const myName = getMyAgentName()
+    const decision = classifyAdminGate(ctx.message.text, myName)
+    if (decision.action === 'block') {
+      // Block admin commands the LLM should never see. Reply with a concise
+      // "admin required" warning instead of forwarding to Claude.
       process.stderr.write(
-        `telegram gateway: admin-gate redirect cmd=/${cmd} agent=${process.env.SWITCHROOM_AGENT_NAME ?? '-'} (AGENT_ADMIN=false)\n`,
+        `telegram gateway: admin-gate blocked cmd=/${decision.cmd} agent=${process.env.SWITCHROOM_AGENT_NAME ?? '-'} reason=${decision.reason} (AGENT_ADMIN=false)\n`,
       )
-      await handleInboundCoalesced(ctx, ctx.message.text, undefined)
+      const cmdHtml = escapeHtmlForTg(`/${decision.cmd}`)
+      const nameHtml = escapeHtmlForTg(myName)
+      const text =
+        decision.reason === 'other-agent'
+          ? `⚠️ <code>${cmdHtml}</code> targeting another agent is an admin operation — this agent (<code>${nameHtml}</code>) isn't admin-flagged. Run it from an admin agent, or set <code>admin: true</code> for this agent in switchroom.yaml. (Self-restart is allowed: send <code>/restart</code> with no arg.)`
+          : `⚠️ <code>${cmdHtml}</code> is an admin command — this agent (<code>${nameHtml}</code>) isn't admin-flagged. Run it from an admin agent, or set <code>admin: true</code> for this agent in switchroom.yaml.`
+      await switchroomReply(ctx, text, { html: true })
       return
     }
   }
@@ -5848,7 +6176,7 @@ function buildAgentAudit(agentName: string): AgentAudit | undefined {
 // to `switchroom agent list --json` and `switchroom auth status --json`.
 // Best-effort — any missing piece renders as a placeholder in the text
 // templates rather than blocking the reply.
-function buildAgentMetadata(agentName: string): AgentMetadata {
+async function buildAgentMetadata(agentName: string): Promise<AgentMetadata> {
   type AgentListResp = {
     agents: Array<{
       name: string; status: string; uptime: string;
@@ -5885,9 +6213,66 @@ function buildAgentMetadata(agentName: string): AgentMetadata {
     status: a?.status ?? null,
     auth: authSummary,
     audit: buildAgentAudit(agentName),
+    live: await buildLiveProbeRows(agentName),
   }
 }
+/**
+ * Run the boot-card probe set on demand for `/status`. Unlike the boot
+ * card (silent when healthy), `/status` lists every probe that returned
+ * a result, in the boot card's order. Best-effort: a probe missing from
+ * the result is skipped, and any error while loading or running the
+ * probes is logged to stderr and yields `[]` so the reply still renders.
+ * @param agentName Agent to probe; also passed as the probe `agentSlug`.
+ * @returns One `{ status, label, detail }` row per available probe.
+ */
+async function buildLiveProbeRows(agentName: string): Promise<StatusProbeRow[]> {
+  try {
+    const { runAllProbes } = await import('./boot-card.js')
+    let agentDir = resolveAgentDirFromEnv()
+    if (agentDir == null) {
+      // Dynamic import keeps this ESM-safe (no CommonJS `require`).
+      const stateDir = process.env.TELEGRAM_STATE_DIR
+      agentDir = stateDir ? (await import('node:path')).dirname(stateDir) : '/tmp'
+    }
+    const probes = await runAllProbes({
+      agentName,
+      agentSlug: agentName,
+      version: formatBootVersion(),
+      agentDir,
+      gatewayInfo: { pid: process.pid, startedAtMs: GATEWAY_STARTED_AT_MS },
+      tmuxSupervisor: process.env.SWITCHROOM_TMUX_SUPERVISOR === '1',
+      dockerMode: process.env.SWITCHROOM_RUNTIME === 'docker',
+    })
+    const rows: StatusProbeRow[] = []
+    // Render order matches the boot card's PROBE_KEYS so the two
+    // surfaces tell the same story in the same order.
+    const order = ['account', 'agent', 'gateway', 'quota', 'hindsight',
+      'scheduler', 'broker', 'kernel', 'skills'] as const
+    for (const key of order) {
+      const result = probes[key]
+      if (!result) continue
+      rows.push({ status: result.status, label: result.label, detail: result.detail })
+    }
+    return rows
+  } catch (err: unknown) {
+    const reason = err instanceof Error ? err.message : String(err)
+    process.stderr.write(`telegram gateway: /status: probe gathering failed: ${reason}\n`)
+    return []
+  }
+}
+// Wire up the `/approvals list|revoke` commands (RFC B §9). Per the RFC
+// notes, the kernel's IPC client (`src/vault/approvals/client.ts`) goes
+// through the vault broker socket, so no extra daemon is involved. A
+// sender counts as an approver when dmCommandGate does not return null.
+{
+  const { registerApprovalsCommands: wireApprovals } = await import('./approvals-commands.js')
+  wireApprovals(bot, {
+    isApprover: (ctx) => dmCommandGate(ctx) !== null,
+  })
+}
 bot.command('start', async ctx => {
   // dmCommandGate (#894 backport): silent drop on disabled or
   // non-allowlisted senders so the bot doesn't leak its existence.
@@ -5912,7 +6297,7 @@ bot.command('status', async ctx => {
   const from = ctx.from!
   if (access.allowFrom.includes(senderId)) {
     const userTag = from.username ? `@${from.username}` : senderId
-    const meta = buildAgentMetadata(getMyAgentName())
+    const meta = await buildAgentMetadata(getMyAgentName())
     await ctx.reply(buildStatusPairedText({ user: userTag, meta }), { parse_mode: 'HTML' })
     return
   }
@@ -6123,6 +6508,168 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
 bot.command('new', async ctx => handleNewOrResetCommand(ctx, 'new'))
 bot.command('reset', async ctx => handleNewOrResetCommand(ctx, 'reset'))
+// /update — host update from Telegram (#919). With no argument (or
+// `check` / `--check`) it runs the dry-run plan `switchroom update --check`.
+// `apply` / `--apply` runs the real update via spawnSwitchroomDetached so
+// the gateway can be killed mid-flight by the recreate-containers step
+// without orphaning the update. Admin-gated via ADMIN_COMMAND_NAMES.
+bot.command('update', async ctx => {
+  if (!isAuthorizedSender(ctx)) return
+  const arg = ctx.match?.trim() || ''
+  if (arg === '' || arg === 'check' || arg === '--check') {
+    await runSwitchroomCommand(ctx, ['update', '--check'], 'update --check')
+    await switchroomReply(
+      ctx,
+      'Reply with <code>/update apply</code> to execute, or <code>/update apply --skip-images</code> to skip the image pull.',
+      { html: true },
+    )
+    return
+  }
+  // Anything else must start with `apply` (or `--apply`), optionally
+  // followed by passthrough flags such as --skip-images / --rebuild;
+  // any other first token gets the usage hint below.
+  const tokens = arg.split(/\s+/)
+  if (tokens[0] !== 'apply' && tokens[0] !== '--apply') {
+    await switchroomReply(
+      ctx,
+      'Usage: <code>/update</code> (dry-run) or <code>/update apply [--skip-images] [--rebuild]</code>',
+      { html: true },
+    )
+    return
+  }
+  // Allowlist the passthrough flags. The first token outside the list
+  // aborts the command with a refusal reply — operators must not inject
+  // arbitrary CLI args via Telegram (defense in depth; already admin-gated).
+  const ALLOWED_FLAGS = new Set(['--skip-images', '--rebuild'])
+  const passthrough = tokens.slice(1)
+  for (const tok of passthrough) {
+    if (!ALLOWED_FLAGS.has(tok)) {
+      await switchroomReply(
+        ctx,
+        `Refusing to pass unknown flag: <code>${escapeHtmlForTg(tok)}</code>. ` +
+        `Allowed: <code>--skip-images</code>, <code>--rebuild</code>.`,
+        { html: true },
+      )
+      return
+    }
+  }
+  // Docker reachability guard (#926). The gateway runs INSIDE the agent
+  // container, which has the switchroom CLI baked in but no docker
+  // binary and no /var/run/docker.sock mount. So `switchroom update`'s
+  // pull-images and recreate-containers steps would fail with
+  // "docker: command not found". Without this guard, the operator
+  // sees an opaque "❌ update failed (exit 127)" via
+  // notifyDetachedFailure ~5s after the ack.
+  //
+  // Surface a clean explanation instead, pointing them at the host
+  // CLI as the working path. /update (dry-run) does NOT need docker
+  // and is unaffected — only /update apply.
+  if (!isDockerReachable()) {
+    await switchroomReply(
+      ctx,
+      `❌ <b>/update apply</b> needs docker access from inside the agent ` +
+      `container, but it's not available (no <code>docker</code> binary on ` +
+      `PATH, no <code>/var/run/docker.sock</code> mount).\n\n` +
+      `On docker installs, run <code>switchroom update</code> from the ` +
+      `host shell instead.\n\n` +
+      `<i>Tracked as #926 — host-side update daemon would close this gap.</i>`,
+      { html: true },
+    )
+    return
+  }
+  // Debounce against other self-restart commands (/restart, /new, /reset,
+  // another /update): all share ONE restart marker, so a marker younger
+  // than 15s (same window as /restart) rejects this request.
+  // NOTE(review): the marker is only written after the awaited ack send
+  // below — confirm updates are handled one at a time, or two taps can race.
+  const existing = readRestartMarker()
+  if (existing && Date.now() - existing.ts < 15_000) {
+    await switchroomReply(
+      ctx,
+      `⏳ Self-restart already in progress (started ${Math.round(
+        (Date.now() - existing.ts) / 1000,
+      )}s ago) — ignoring duplicate.`,
+      { html: true },
+    )
+    return
+  }
+  const chatId = String(ctx.chat!.id)
+  const threadId = resolveThreadId(chatId, ctx.message?.message_thread_id)
+  // Send the ack and capture its message_id so the post-restart
+  // greeting card can edit/reply into the same message. Mirrors the
+  // /restart handler (gateway.ts ~6273) so the boot-card lookup
+  // (gateway.ts ~10393) finds chat_id + ack_message_id in the marker.
+  const ackText =
+    `🚀 <b>update started</b> — running ${[
+      '<code>switchroom update</code>',
+      ...passthrough.map((t) => `<code>${escapeHtmlForTg(t)}</code>`),
+    ].join(' ')}\n` +
+    `\nThe gateway will restart as part of the recreate step; watch ` +
+    `for the post-restart greeting card to confirm completion.`
+  let ackId: number | null = null
+  try {
+    const sent = await lockedBot.api.sendMessage(chatId, ackText, {
+      parse_mode: 'HTML',
+      link_preview_options: { is_disabled: true },
+      ...(threadId != null ? { message_thread_id: threadId } : {}),
+    })
+    ackId = sent.message_id
+    if (HISTORY_ENABLED) {
+      try {
+        recordOutbound({
+          chat_id: chatId,
+          thread_id: threadId ?? null,
+          message_ids: [sent.message_id],
+          texts: [`🚀 update started`],
+          attachment_kinds: [],
+        })
+      } catch {} // best-effort: a history-write failure must not block the update
+    }
+  } catch {} // best-effort: if the ack fails, ackId stays null and the update proceeds
+  writeRestartMarker({
+    chat_id: chatId,
+    thread_id: threadId ?? null,
+    ack_message_id: ackId,
+    ts: Date.now(),
+  })
+  // Reason banner for the post-restart greeting card. Without this the
+  // banner falls back to whatever the CLI's clean-shutdown marker
+  // stamped — usually 'unknown' or a docker-compose-restart string.
+  stampUserRestartReason('user: /update from chat')
+  // Unpin progress cards + clear active reactions before we die. The
+  // pinned-progress-card surface is the headline feature per CLAUDE.md;
+  // leaving one pinned across the recreate would surprise the operator.
+  await sweepBeforeSelfRestart()
+  spawnSwitchroomDetached(
+    ['update', ...passthrough],
+    notifyDetachedFailure(chatId, threadId ?? null, 'update'),
+  )
+})
+// /upgradestatus (#927): read-only "is anything behind?" companion to
+// /update. Authorized senders get the output of `switchroom update
+// --status`, relayed through runSwitchroomCommand. Intentionally NOT
+// admin-gated: fleet metadata is safe for any allowFrom user. Named
+// without a hyphen because Telegram slash-commands forbid them; the
+// separate /upgrade handler redirects people who type /upgrade-status.
+bot.command('upgradestatus', async (ctx) => {
+  if (isAuthorizedSender(ctx)) {
+    await runSwitchroomCommand(ctx, ['update', '--status'], 'update --status')
+  }
+})
+// /upgrade — redirect for operators who type /upgrade-status. Hyphens
+// are not valid in command names (Telegram's slash-command grammar
+// excludes them), so this handler only replies with a pointer to the
+// real commands; it never runs an update itself.
+bot.command('upgrade', async (ctx) => {
+  if (isAuthorizedSender(ctx)) {
+    const hint =
+      'Did you mean <code>/upgradestatus</code> (no hyphen — Telegram slash-command grammar)? ' +
+      'Or <code>/update</code> to plan, <code>/update apply</code> to execute.'
+    await switchroomReply(ctx, hint, { html: true })
+  }
+})
 // ─── /approve, /deny, /pending ────────────────────────────────────────────
 // Slash-command alternatives to the inline-button approval flow (useful for
 // desktop-only sessions and power-users). Share pendingPermissions state
@@ -6961,6 +7508,16 @@ async function handleVaultDeferCallback(ctx: Context, data: string): Promise<voi
   const cardMessageId = ctx.callbackQuery?.message?.message_id
   if (action === 'cancel') {
+    // Kernel-side dual-dispatch (MIGRATION.md §1): record the deny decision
+    // BEFORE the legacy handler clears state, so the audit log captures it
+    // even if the editMessageText below races with another tap. Best-effort
+    // — broker unreachable falls back to legacy-only.
+    await recordDeferredSecretKernelDecision(
+      deferred.kernel_request_id,
+      'deny',
+      ctx.from?.id ?? 0,
+      access.allowFrom,
+    )
     deferredSecrets.delete(deferKey)
     await ctx.answerCallbackQuery({ text: 'Discarded.' }).catch(() => {})
     if (cardMessageId != null) {
@@ -6974,6 +7531,18 @@ async function handleVaultDeferCallback(ctx: Context, data: string): Promise<voi
   }
   if (action === 'unlock') {
+    // Kernel-side dual-dispatch (MIGRATION.md §1): record the allow_once
+    // decision when the user taps unlock. The actual passphrase capture +
+    // vault write still happens via the legacy path below — the kernel
+    // decision is for audit/state, not secret material (per RFC B). We
+    // record at tap-time rather than after passphrase entry so a kernel
+    // record exists even if the user abandons the passphrase prompt.
+    await recordDeferredSecretKernelDecision(
+      deferred.kernel_request_id,
+      'allow_once',
+      ctx.from?.id ?? 0,
+      access.allowFrom,
+    )
     // If a passphrase is already cached we can skip straight to the write.
     // Covers the case where the user had unlocked separately between
     // detection and tap.
@@ -7163,12 +7732,43 @@ async function grantWizardConfirm(ctx: Context, chatId: string, state: Extract<P
     const sent = await switchroomReply(ctx, text, { html: true, reply_markup: kb })
     state.wizardMsgId = (sent as unknown as { message_id?: number })?.message_id
   }
-  pendingVaultOps.set(chatId, { ...state, step: 'confirm', expiresLabel })
+  // Mint kernel decision row at the confirm step (MIGRATION.md §2,
+  // audit-only Phase 1). We do it here rather than at executeGrantWizard
+  // so a kernel row exists even if the user taps Cancel from the confirm
+  // card — the deny verdict on cancel is then recorded against the same
+  // request_id. If the kernel/broker is unreachable, request_id stays
+  // undefined and the wizard runs legacy-only (no behaviour change).
+  const kernelRequestId = await mintGrantWizardKernelRequest(
+    state.agent!,
+    loadAccess().allowFrom,
+    state.selectedKeys!,
+    state.ttlSeconds ?? null,
+  )
+  pendingVaultOps.set(chatId, {
+    ...state,
+    step: 'confirm',
+    expiresLabel,
+    kernel_request_id: kernelRequestId ?? state.kernel_request_id,
+  })
 }
 /** Execute the grant: call broker mint_grant, write token, reply. */
 async function executeGrantWizard(ctx: Context, chatId: string, state: Extract<PendingVaultOp, { kind: 'grant-wizard' }>): Promise<void> {
   pendingVaultOps.delete(chatId)
+  // Kernel-side dual-dispatch (MIGRATION.md §2, audit-only Phase 1):
+  // record the allow_once decision when the user taps Generate. The
+  // legacy `mintGrantViaBroker` below still drives the actual grant
+  // mint + token write — the kernel row is informational, not
+  // enforcing, in Phase 1 (issue #833 will flip to enforcing).
+  // We record at tap-time rather than after mint_grant succeeds so a
+  // kernel row exists even if the legacy mint fails (audit captures
+  // intent regardless of downstream outcome).
+  await recordGrantWizardKernelDecision(
+    state.kernel_request_id,
+    'allow_once',
+    ctx.from?.id ?? 0,
+    loadAccess().allowFrom,
+  )
   // Defence-in-depth: state.agent flows from callback_data into a path
   // join below. A crafted vg:agent:../../etc payload would produce a
   // path traversal. Validate against the same regex the rest of the
@@ -7316,6 +7916,20 @@ async function handleVaultGrantCallback(ctx: Context, data: string): Promise<voi
   // Cancel at any wizard step
   if (data === 'vg:cancel') {
+    // Kernel-side dual-dispatch (MIGRATION.md §2, audit-only Phase 1):
+    // if the user got as far as the confirm step, a kernel request_id
+    // will be on the wizard state — record the deny decision so the
+    // audit log captures the abandonment. No-op if the user cancelled
+    // before the confirm step (or if the kernel was unreachable).
+    const cancelState = pendingVaultOps.get(chatId)
+    if (cancelState && cancelState.kind === 'grant-wizard') {
+      await recordGrantWizardKernelDecision(
+        cancelState.kernel_request_id,
+        'deny',
+        ctx.from?.id ?? 0,
+        loadAccess().allowFrom,
+      )
+    }
     pendingVaultOps.delete(chatId)
     const msg = ctx.callbackQuery?.message
     if (msg && 'text' in msg) {
@@ -7556,19 +8170,23 @@ async function handleOperatorEventCallback(ctx: Context, data: string): Promise<
     }
     case 'restart': {
       await ctx.answerCallbackQuery({ text: `Restarting ${agent}…` }).catch(() => {})
-      try {
-        execFileSync('systemctl', ['--user', 'restart', `switchroom-${agent}`], {
-          encoding: 'utf-8',
-          timeout: 15000,
-          stdio: ['ignore', 'pipe', 'pipe'],
-        })
+      const ok = triggerSelfRestart(agent, 'inline-button-restart')
+      if (ok) {
         await ctx.reply(`<b>${agent}</b> restart requested.`, { parse_mode: 'HTML' })
         await ctx.editMessageReplyMarkup({ reply_markup: { inline_keyboard: [] } }).catch(() => {})
-      } catch (err) {
-        // err.message includes concatenated stderr which can contain HTML
-        // metacharacters; escape before interpolating into a <pre> block.
-        const safeMsg = escapeHtmlForTg((err as Error).message)
-        await ctx.reply(`<b>Restart failed for ${agent}:</b>\n<pre>${safeMsg}</pre>`, {
+      } else {
+        // Under docker the helper refuses cross-agent restart; surface
+        // a clear message instead of a silent no-op. Service name in
+        // the generated compose is `agent-<name>` (compose.ts:408);
+        // container_name is `switchroom-<name>` (compose.ts:410).
+        // `docker compose restart` takes a SERVICE name, so we point
+        // the operator at the service.
+        const isDocker = process.env.SWITCHROOM_RUNTIME === 'docker'
+        const detail = isDocker
+          ? `cross-agent restart is not supported under docker. ` +
+            `Restart from the host: <code>docker compose -p switchroom restart agent-${agent}</code>.`
+          : 'restart trigger failed'
+        await ctx.reply(`<b>Restart failed for ${agent}:</b> ${detail}`, {
           parse_mode: 'HTML',
         })
       }
@@ -7582,6 +8200,21 @@ async function handleOperatorEventCallback(ctx: Context, data: string): Promise<
     }
     case 'logs': {
       await ctx.answerCallbackQuery({ text: 'Fetching logs…' }).catch(() => {})
+      // Pick the right log source for the runtime. Under docker, the
+      // gateway is INSIDE the agent container — calling `docker logs`
+      // requires the host's docker socket which is deliberately not
+      // mounted into agent containers. Under systemd, journalctl
+      // works as before. v0.7.2 fixed `case 'restart'` but left this
+      // path systemd-only.
+      const isDocker = process.env.SWITCHROOM_RUNTIME === 'docker'
+      if (isDocker) {
+        await ctx.reply(
+          `<i>Inline log fetch is not available under docker mode (no docker.sock in agent containers). ` +
+            `Run from the host: <code>docker logs --since 30m --tail 30 switchroom-${agent}</code></i>`,
+          { parse_mode: 'HTML' },
+        )
+        return
+      }
       try {
         const out = execFileSync(
           'journalctl',
@@ -8231,17 +8864,11 @@ bot.command('permissions', async ctx => {
   await runSwitchroomCommand(ctx, ['agent', 'permissions', agentName], `permissions ${agentName}`)
 })
-bot.command('update', async ctx => {
-  if (!isAuthorizedSender(ctx)) return
-  await switchroomReply(ctx, '🔄 Running <b>switchroom update</b>… back in ~30 seconds.', { html: true })
-  await sweepBeforeSelfRestart()
-  const chatId = String(ctx.chat!.id)
-  const threadId = resolveThreadId(chatId, ctx.message?.message_thread_id)
-  spawnSwitchroomDetached(
-    ['update'],
-    notifyDetachedFailure(chatId, threadId ?? null, 'update'),
-  )
-})
+// Drive-by cleanup (#927): the dead /update handler that lived here
+// was a pre-#919 stub. Grammy registers in order so the comprehensive
+// /update handler at line ~6516 (added in #919, hardened in #924,
+// docker-guarded in #934) fired first and this one never ran.
+// Removed to avoid future confusion.
 bot.command('version', async ctx => {
   if (!isAuthorizedSender(ctx)) return
@@ -8293,6 +8920,16 @@ bot.on('callback_query:data', async ctx => {
     return
   }
+  // RFC B §6.1: apv:<request_id>:<choice>[:<param>] — approval kernel taps.
+  // Routed through the generic kernel handler so any surface that uses
+  // buildApprovalCard inherits consume → record → confirmation UX without
+  // each surface re-implementing it.
+  if (data.startsWith('apv:')) {
+    const { handleApprovalCallback } = await import('./approval-callback.js')
+    await handleApprovalCallback(ctx, data)
+    return
+  }
   // op:<action>:<encoded-agent> callbacks from operator-events.ts
   // renderOperatorEvent(). Agent name is URL-encoded at emit (issue #24).
   // Actions: dismiss, restart, reauth, swap-slot, add-slot, logs.
@@ -9391,8 +10028,37 @@ if (streamMode === 'checklist') {
     return { code: 0, description: msg, kind: 'transient' }
   }
+  // #842: progress-card first-render gating. Read the per-agent
+  // overrides from switchroom.yaml; fall back to driver defaults
+  // (45000 ms / 0 ms) when absent, unreadable, or not present in the
+  // cascade (defaults → profile → per-agent).
+  let progressCardDelayMs: number | undefined
+  let progressCardDelayMsBackground: number | undefined
+  try {
+    const swConfig = loadSwitchroomConfig()
+    const agentSlugForCfg = process.env.SWITCHROOM_AGENT_NAME
+    const agentCfg = agentSlugForCfg ? swConfig.agents?.[agentSlugForCfg] : undefined
+    const pc = agentCfg?.channels?.telegram?.progress_card
+    if (pc) {
+      if (typeof pc.delay_ms === 'number') progressCardDelayMs = pc.delay_ms
+      if (typeof pc.delay_ms_background === 'number') progressCardDelayMsBackground = pc.delay_ms_background
+    }
+  } catch {
+    // Best-effort — gateway may run in dirs where loadSwitchroomConfig
+    // fails. Driver defaults apply.
+  }
   progressDriver = createProgressDriver({
+    ...(progressCardDelayMs != null ? { initialDelayMs: progressCardDelayMs } : {}),
+    ...(progressCardDelayMsBackground != null ? { initialDelayMsBackground: progressCardDelayMsBackground } : {}),
     emit: ({ chatId, threadId, turnKey, html, done, isFirstEmit, replyToMessageId, agentId }) => {
+      // Tag the outbound API calls so `tg-post` log lines carry turnKey
+      // (and cardMessageId when known) — lets us audit days-old session
+      // logs for "did the card render?" / "what edit storms hit it?"
+      // without parsing free-form progress-card traces. (#card-audit-log)
+      const knownCardMessageId = pinMgr.pinnedMessageId(turnKey, agentId)
+      const tgPostTags: Record<string, string | number> = { turnKey }
+      if (knownCardMessageId != null) tgPostTags.cardMessageId = knownCardMessageId
       const args = {
         chat_id: chatId, text: html, done, message_thread_id: threadId,
         lane: 'progress', format: 'html', turnKey,
@@ -9439,7 +10105,7 @@ if (streamMode === 'checklist') {
       // default in a follow-up PR.
       const draftFlagOn = process.env.PROGRESS_CARD_DRAFT_TRANSPORT === '1'
       const draftEligible = draftFlagOn && isDmChatId(chatId) && threadId == null
-      handleStreamReply(args, { activeDraftStreams, activeDraftParseModes, suppressPtyPreview }, {
+      withTgPostTags(tgPostTags, () => handleStreamReply(args, { activeDraftStreams, activeDraftParseModes, suppressPtyPreview }, {
         // grammy Bot vs local StreamBotApi — see cast pattern above.
         bot: lockedBot as never, retry: robustApiCall, markdownToHtml, escapeMarkdownV2, repairEscapedWhitespace,
         takeHandoffPrefix: () => '', assertAllowedChat, resolveThreadId, disableLinkPreview: true,
@@ -9466,7 +10132,7 @@ if (streamMode === 'checklist') {
               ...(sendMessageDraftFn != null ? { sendMessageDraft: sendMessageDraftFn } : {}),
             }
           : {}),
-      }).then((result) => {
+      })).then((result) => {
         // Successful API call — reset the consecutive-4xx counter.
         progressDriver?.reportApiSuccess(turnKey)
         // #203: progress-card edit is a user-visible signal.
@@ -9929,7 +10595,10 @@ void (async () => {
               const cleanMarkerStale = cleanMarker
                 ? !shouldSuppressRecoveryBanner(cleanMarker, nowMs, CLEAN_SHUTDOWN_MAX_AGE_MS)
                 : false
-              const detailParts: string[] = ['gateway crashed and was auto-restarted by systemd']
+              const supervisor = process.env.SWITCHROOM_RUNTIME === 'docker'
+                ? 'docker compose'
+                : 'systemd'
+              const detailParts: string[] = [`gateway crashed and was auto-restarted by ${supervisor}`]
               if (cleanMarker?.signal) detailParts.push(`prior signal=${cleanMarker.signal}`)
               if (cleanMarkerStale) detailParts.push('clean-shutdown marker stale')
               emitGatewayOperatorEvent({
@@ -9978,6 +10647,7 @@ void (async () => {
                       restartAgeMs: markerAgeMs,
                       loadAccounts: () => loadAccountsForBootCard(agentSlug),
                       tmuxSupervisor: process.env.SWITCHROOM_TMUX_SUPERVISOR === '1',
+                      dockerMode: process.env.SWITCHROOM_RUNTIME === 'docker',
                     }, ackMsgId)
                     activeBootCard = handle
                   } catch (err) {
@@ -10051,11 +10721,23 @@ void (async () => {
         // Closes #30 task 4 and the 2026-04-21 lessons-learned loop where
         // IPC flaps falsely triggered the gateway's recovery banner.
         // SWITCHROOM_RESTART_WATCHDOG_POLL_MS=0 disables it.
+        //
+        // Disabled under SWITCHROOM_RUNTIME=docker — the watchdog reads
+        // systemd's NRestarts counter, which doesn't exist for docker
+        // containers. Reading docker's restart count would require
+        // mounting docker.sock into the agent container (a security
+        // regression we explicitly avoid). Container restart visibility
+        // comes from the boot card + gateway boot logs in docker mode.
         const RESTART_WATCHDOG_POLL_MS = Number(
           process.env.SWITCHROOM_RESTART_WATCHDOG_POLL_MS ?? 30_000,
         )
         const watchdogAgentName = process.env.SWITCHROOM_AGENT_NAME
-        if (RESTART_WATCHDOG_POLL_MS > 0 && watchdogAgentName) {
+        const watchdogDockerMode = process.env.SWITCHROOM_RUNTIME === 'docker'
+        if (watchdogDockerMode) {
+          process.stderr.write(
+            `telegram gateway: restart-watchdog disabled (SWITCHROOM_RUNTIME=docker; systemd NRestarts unavailable)\n`,
+          )
+        } else if (RESTART_WATCHDOG_POLL_MS > 0 && watchdogAgentName) {
           startRestartWatchdog({
             agentName: watchdogAgentName,
             pollIntervalMs: RESTART_WATCHDOG_POLL_MS,
@@ -10120,6 +10802,52 @@ void (async () => {
               onStall: (agentId, idleMs, description) => {
                 progressDriver?.onSubAgentStall(agentId, idleMs, description)
               },
+              // Symmetric to onStall: clear the ⚠ Stalled badge as soon
+              // as the watcher sees JSONL activity return, instead of
+              // waiting on the next render tick to recompute idle ms.
+              onUnstall: (agentId, description) => {
+                progressDriver?.onSubAgentUnstall?.(agentId, description)
+              },
+              // #card-audit-log: symmetric sub_agent_finished surface.
+              // The driver's per-chat shadow knows the parent turnKey and
+              // the registry DB carries the background flag — combine them
+              // into a single audit-log line for retrospective debugging.
+              onFinish: ({ agentId, outcome, toolCount, durationMs }) => {
+                let parentTurnKey = ''
+                let chatId = ''
+                let isBackground = false
+                try {
+                  const fleets = progressDriver?.peekAllFleets() ?? []
+                  for (const f of fleets) {
+                    if (f.fleet.has(agentId)) {
+                      parentTurnKey = f.turnKey
+                      chatId = f.chatId ?? ''
+                      break
+                    }
+                  }
+                } catch {
+                  // peek failures are non-fatal — we still emit the event.
+                }
+                if (turnsDb != null) {
+                  try {
+                    const row = turnsDb
+                      .prepare('SELECT background FROM subagents WHERE jsonl_agent_id = ?')
+                      .get(agentId) as { background: number } | undefined
+                    if (row != null) isBackground = row.background === 1
+                  } catch { /* best-effort */ }
+                }
+                const finalOutcome: 'completed' | 'orphan' | 'background' =
+                  isBackground ? 'background' : (outcome === 'completed' ? 'completed' : 'orphan')
+                emitCardEvent({
+                  agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
+                  chatId,
+                  turnKey: parentTurnKey,
+                  event: 'finalized',
+                  reason: `sub_agent_finished agentId=${agentId} outcome=${finalOutcome} tools=${toolCount}`,
+                  subagents: [agentId],
+                  durationMs,
+                })
+              },
             })
             process.stderr.write('telegram gateway: subagent-watcher active\n')
           }