switchroom 0.5.0 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +142 -121
  2. package/bin/autoaccept.exp +29 -6
  3. package/dist/agent-scheduler/index.js +12261 -0
  4. package/dist/cli/autoaccept-poll.js +10 -0
  5. package/dist/cli/switchroom.js +27250 -25324
  6. package/dist/vault/approvals/kernel-server.js +12709 -0
  7. package/dist/vault/broker/server.js +15724 -0
  8. package/package.json +4 -3
  9. package/profiles/_base/start.sh.hbs +133 -0
  10. package/profiles/_shared/telegram-style.md.hbs +3 -3
  11. package/profiles/default/CLAUDE.md +3 -3
  12. package/profiles/default/CLAUDE.md.hbs +2 -2
  13. package/profiles/default/workspace/CLAUDE.md.hbs +9 -0
  14. package/skills/docx/VENDORED.md +1 -1
  15. package/skills/mcp-builder/VENDORED.md +1 -1
  16. package/skills/pdf/VENDORED.md +1 -1
  17. package/skills/pptx/VENDORED.md +1 -1
  18. package/skills/skill-creator/VENDORED.md +1 -1
  19. package/skills/switchroom-architecture/SKILL.md +8 -7
  20. package/skills/switchroom-cli/SKILL.md +23 -15
  21. package/skills/switchroom-health/SKILL.md +7 -7
  22. package/skills/switchroom-install/SKILL.md +36 -39
  23. package/skills/switchroom-manage/SKILL.md +4 -4
  24. package/skills/switchroom-status/SKILL.md +1 -1
  25. package/skills/webapp-testing/VENDORED.md +1 -1
  26. package/skills/xlsx/VENDORED.md +1 -1
  27. package/telegram-plugin/admin-commands/dispatch.test.ts +119 -1
  28. package/telegram-plugin/admin-commands/index.ts +71 -0
  29. package/telegram-plugin/ask-user.ts +1 -0
  30. package/telegram-plugin/card-event-log.ts +138 -0
  31. package/telegram-plugin/dist/bridge/bridge.js +178 -31
  32. package/telegram-plugin/dist/foreman/foreman.js +6875 -6526
  33. package/telegram-plugin/dist/gateway/gateway.js +13862 -11834
  34. package/telegram-plugin/dist/server.js +202 -40
  35. package/telegram-plugin/fleet-state.ts +25 -10
  36. package/telegram-plugin/foreman/foreman.ts +38 -3
  37. package/telegram-plugin/gateway/approval-callback.ts +126 -0
  38. package/telegram-plugin/gateway/approval-card.test.ts +90 -0
  39. package/telegram-plugin/gateway/approval-card.ts +127 -0
  40. package/telegram-plugin/gateway/approvals-commands.ts +126 -0
  41. package/telegram-plugin/gateway/boot-card.ts +31 -6
  42. package/telegram-plugin/gateway/boot-probes.ts +510 -72
  43. package/telegram-plugin/gateway/gateway.ts +822 -94
  44. package/telegram-plugin/gateway/ipc-protocol.ts +34 -1
  45. package/telegram-plugin/gateway/ipc-server.ts +35 -0
  46. package/telegram-plugin/gateway/startup-mutex.ts +110 -2
  47. package/telegram-plugin/hooks/hooks.json +19 -0
  48. package/telegram-plugin/hooks/tool-label-pretool.mjs +216 -0
  49. package/telegram-plugin/hooks/tool-label-stop.mjs +63 -0
  50. package/telegram-plugin/package.json +4 -1
  51. package/telegram-plugin/plugin-logger.ts +20 -1
  52. package/telegram-plugin/progress-card-driver.ts +202 -13
  53. package/telegram-plugin/progress-card.ts +2 -2
  54. package/telegram-plugin/quota-check.ts +1 -0
  55. package/telegram-plugin/registry/subagents-schema.ts +37 -0
  56. package/telegram-plugin/registry/subagents.test.ts +64 -0
  57. package/telegram-plugin/session-tail.ts +58 -5
  58. package/telegram-plugin/shared/bot-runtime.ts +48 -2
  59. package/telegram-plugin/subagent-watcher.ts +139 -7
  60. package/telegram-plugin/tests/_progress-card-harness.ts +4 -0
  61. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +201 -0
  62. package/telegram-plugin/tests/boot-card-probe-target.test.ts +10 -34
  63. package/telegram-plugin/tests/boot-card-render.test.ts +6 -5
  64. package/telegram-plugin/tests/boot-probes.test.ts +564 -0
  65. package/telegram-plugin/tests/card-event-log.test.ts +145 -0
  66. package/telegram-plugin/tests/gateway-startup-mutex.test.ts +102 -0
  67. package/telegram-plugin/tests/ipc-server-validate-inject-inbound.test.ts +134 -0
  68. package/telegram-plugin/tests/progress-card-delay-842.test.ts +160 -0
  69. package/telegram-plugin/tests/quota-check.test.ts +37 -1
  70. package/telegram-plugin/tests/subagent-registry-bugs.test.ts +5 -0
  71. package/telegram-plugin/tests/subagent-watcher-stall-notification.test.ts +104 -1
  72. package/telegram-plugin/tests/subagent-watcher.test.ts +5 -0
  73. package/telegram-plugin/tests/tool-label-sidecar.test.ts +114 -0
  74. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +5 -3
  75. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +10 -0
  76. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +58 -14
  77. package/telegram-plugin/tests/welcome-text.test.ts +57 -0
  78. package/telegram-plugin/tool-label-sidecar.ts +140 -0
  79. package/telegram-plugin/tool-labels.ts +55 -0
  80. package/telegram-plugin/two-zone-card.ts +27 -7
  81. package/telegram-plugin/uat/SETUP.md +160 -0
  82. package/telegram-plugin/uat/assertions.ts +140 -0
  83. package/telegram-plugin/uat/driver.ts +174 -0
  84. package/telegram-plugin/uat/harness.ts +161 -0
  85. package/telegram-plugin/uat/login.ts +134 -0
  86. package/telegram-plugin/uat/port-allocator.ts +71 -0
  87. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +61 -0
  88. package/telegram-plugin/welcome-text.ts +44 -2
  89. package/bin/bridge-watchdog.sh +0 -967
package/telegram-plugin/gateway/boot-probes.ts

@@ -11,7 +11,7 @@
   * caller as a thrown error — only as ProbeResult{ status:'fail', ... }.
   */

- import { readFileSync, existsSync, mkdirSync, writeFileSync } from 'fs'
+ import { readFileSync, readdirSync, existsSync, mkdirSync, writeFileSync } from 'fs'
  import { join } from 'path'
  import { execFile as execFileCb } from 'child_process'
  import { promisify } from 'util'
@@ -251,6 +251,148 @@ type ExecFileFnType = (
    args: string[],
  ) => Promise<ExecFileResult>

+ /**
+  * Filesystem injection point for the docker-mode /proc walk so tests can
+  * drive synthetic `/proc/<pid>/{comm,stat,status}` strings without
+  * touching the real host fs.
+  */
+ export interface ProcFsImpl {
+   readdir: (path: string) => string[]
+   readFile: (path: string) => string
+ }
+
+ const realProcFs: ProcFsImpl = {
+   readdir: (p) => readdirSync(p),
+   readFile: (p) => readFileSync(p, 'utf-8'),
+ }
+
+ type AgentCandidate = {
+   pid: number
+   rssKb: number
+   comm: string
+   starttime: number
+ }
+
+ /**
+  * Walk `/proc` from inside the current pid-namespace and pick the
+  * heaviest claude/node process. Used for the docker-mode agent probe:
+  * inside an agent container, we share the namespace with claude, so a
+  * /proc walk replaces the systemctl-driven cgroup walk used under
+  * systemd. Skips wrappers (tmux/expect/script/bash/sh) and our own
+  * gateway PID. Exported for tests.
+  */
+ export function findAgentProcessInContainer(
+   fs: ProcFsImpl = realProcFs,
+ ): AgentCandidate | null {
+   let entries: string[]
+   try {
+     entries = fs.readdir('/proc')
+   } catch {
+     return null
+   }
+   const candidates: AgentCandidate[] = []
+   for (const entry of entries) {
+     if (!/^\d+$/.test(entry)) continue
+     const pid = Number(entry)
+     if (!Number.isFinite(pid) || pid <= 0) continue
+     if (pid === process.pid) continue
+     let comm = ''
+     try {
+       comm = fs.readFile(`/proc/${pid}/comm`).trim()
+     } catch {
+       continue
+     }
+     let rssKb = 0
+     try {
+       const status = fs.readFile(`/proc/${pid}/status`)
+       const m = status.match(/^VmRSS:\s+(\d+)/m)
+       if (m) rssKb = parseInt(m[1], 10) || 0
+     } catch {
+       continue
+     }
+     let starttime = 0
+     try {
+       const stat = fs.readFile(`/proc/${pid}/stat`)
+       // /proc/<pid>/stat format: pid (comm-with-parens) state ppid ...
+       // field 22 (1-indexed) is starttime in clock ticks since boot.
+       // comm can contain spaces/parens — use the LAST ')' as the
+       // anchor so we tokenize the remainder safely.
+       const close = stat.lastIndexOf(')')
+       const tail = close >= 0 ? stat.slice(close + 2) : stat
+       const fields = tail.trim().split(/\s+/)
+       // After the "(comm)" group, the remaining fields are state, ppid,
+       // ... with starttime at index 19 (0-indexed) of `tail` because
+       // field 3 (state) is `tail[0]`.
+       const st = Number(fields[19])
+       if (Number.isFinite(st) && st > 0) starttime = st
+     } catch {
+       continue
+     }
+     candidates.push({ pid, rssKb, comm, starttime })
+   }
+   if (candidates.length === 0) return null
+
+   const isAgent = (c: AgentCandidate): boolean => c.comm === 'claude'
+   const isWrapper = (c: AgentCandidate): boolean =>
+     c.comm === 'tmux' || c.comm.startsWith('tmux:') ||
+     c.comm === 'expect' || c.comm === 'script' ||
+     c.comm === 'bash' || c.comm === 'sh' ||
+     c.comm === 'tini' || c.comm === 'sleep'
+
+   const claudeMatches = candidates.filter(isAgent)
+   if (claudeMatches.length > 0) {
+     claudeMatches.sort((a, b) => b.rssKb - a.rssKb)
+     return claudeMatches[0]
+   }
+   // No `claude` comm — fall back to heaviest non-wrapper node process.
+   const nodeMatches = candidates
+     .filter(c => c.comm === 'node' && !isWrapper(c))
+     .sort((a, b) => b.rssKb - a.rssKb)
+   if (nodeMatches.length > 0) return nodeMatches[0]
+   return null
+ }
+
+ /**
+  * Read /proc/uptime to derive the agent process's uptime from its
+  * starttime (clock ticks since boot). Returns null on any failure.
+  *
+  * SC_CLK_TCK (the units of `starttime` in /proc/<pid>/stat) is a stable
+  * kernel ABI value, hardcoded to 100 on x86_64 across Debian/Ubuntu/
+  * Alpine/RHEL. If we ever ship on arm64 hosts where some kernels use
+  * 250, uptimes will look 2.5× too large and we'll revisit.
+  */
+ export function uptimeMsForStarttime(
+   starttimeTicks: number,
+   fs: ProcFsImpl = realProcFs,
+ ): number | null {
+   try {
+     const uptimeRaw = fs.readFile('/proc/uptime').trim()
+     const bootUptimeSec = Number(uptimeRaw.split(/\s+/)[0])
+     if (!Number.isFinite(bootUptimeSec) || bootUptimeSec <= 0) return null
+     const HZ = 100
+     const procUptimeSec = bootUptimeSec - starttimeTicks / HZ
+     if (procUptimeSec < 0) return null
+     return Math.round(procUptimeSec * 1000)
+   } catch {
+     return null
+   }
+ }
+
+ function probeAgentProcessDocker(): ProbeResult {
+   const found = findAgentProcessInContainer()
+   if (!found) {
+     return { status: 'fail', label: 'Agent', detail: 'claude process not found' }
+   }
+   const uptimeMs = uptimeMsForStarttime(found.starttime)
+   const mb = Math.round(found.rssKb / 1024)
+   const parts = [
+     `PID ${found.pid}`,
+     uptimeMs != null ? `up ${formatMs(uptimeMs)}` : '',
+     mb > 0 ? `${mb} MB` : '',
+   ].filter(Boolean)
+   return { status: 'ok', label: 'Agent', detail: parts.join(' · ') }
+ }
+
  /**
   * Resolve the "real" agent PID under tmux supervisor by walking the
   * unit's cgroup and picking the heaviest-RSS claude/node process.
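The ProcFsImpl seam above is what the new boot-probes tests lean on. A minimal sketch of driving findAgentProcessInContainer with synthetic /proc contents (fake pids, fabricated stat/status strings, and an illustrative import path; none of this is from the package's own test suite):

    import { findAgentProcessInContainer, type ProcFsImpl } from './boot-probes'

    // Two synthetic processes: a tiny tini wrapper and a heavy claude.
    // starttime is field 22 of /proc/<pid>/stat, i.e. index 19 after the ')'.
    const files: Record<string, string> = {
      '/proc/1/comm': 'tini\n',
      '/proc/1/status': 'VmRSS:\t    1024 kB\n',
      '/proc/1/stat': '1 (tini) S 0 1 1 0 -1 4194560 0 0 0 0 0 0 0 0 20 0 1 0 50 0 0',
      '/proc/42/comm': 'claude\n',
      '/proc/42/status': 'VmRSS:\t  412340 kB\n',
      '/proc/42/stat': '42 (claude) S 1 42 42 0 -1 4194304 0 0 0 0 9 3 0 0 20 0 11 0 1200 0 0',
    }
    const fakeProc: ProcFsImpl = {
      readdir: () => ['1', '42', 'self'],  // non-numeric entries are filtered out
      readFile: (p) => {
        const v = files[p]
        if (v === undefined) throw new Error('ENOENT')
        return v
      },
    }

    const agent = findAgentProcessInContainer(fakeProc)
    // → { pid: 42, rssKb: 412340, comm: 'claude', starttime: 1200 }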
@@ -371,8 +513,19 @@ export async function probeAgentProcess(
      /** When true, resolve PID via cgroup walk (heaviest claude/node) — under
       * tmux supervisor MainPID is the tmux server (~2MB) which is misleading. */
      tmuxSupervisor?: boolean
+     /** When true, skip systemctl entirely. The gateway is running INSIDE the
+      * agent container alongside claude, so we walk /proc directly. There's
+      * no "service deactivating/activating" model under docker — claude is
+      * either there or it isn't, so we return single-shot without retry. */
+     dockerMode?: boolean
+     /** Test override — defaults to the real probeAgentProcessDocker(). */
+     dockerProbeImpl?: () => ProbeResult
    } = {},
  ): Promise<ProbeResult> {
+   if (opts.dockerMode) {
+     const impl = opts.dockerProbeImpl ?? probeAgentProcessDocker
+     return withTimeout('Agent', Promise.resolve(impl()))
+   }
    const retryIntervalMs = opts.retryIntervalMs ?? AGENT_RETRY_INTERVAL_MS
    const retryMaxMs = opts.retryMaxMs ?? AGENT_RETRY_MAX_MS
    const sleep = opts.sleepImpl ?? ((ms: number) => new Promise(resolve => setTimeout(resolve, ms)))
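With dockerMode set, probeAgentProcess resolves in one shot instead of entering the systemctl retry loop. A minimal usage sketch, assuming the first parameter is the agent name as in the other probes in this file; the dockerProbeImpl stub is hypothetical:

    import { probeAgentProcess } from './boot-probes'

    const result = await probeAgentProcess('main', {
      dockerMode: true,
      // Hypothetical stub; the real default walks /proc via probeAgentProcessDocker().
      dockerProbeImpl: () => ({ status: 'ok' as const, label: 'Agent', detail: 'PID 42 · up 2m · 402 MB' }),
    })
    // Single shot: no retryIntervalMs / retryMaxMs polling, only the withTimeout guard.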
@@ -469,8 +622,18 @@ export async function* watchAgentProcess(
      nowImpl?: () => number
      /** When true, resolve PID via cgroup walk (heaviest claude/node). */
      tmuxSupervisor?: boolean
+     /** When true, skip systemctl: yield once with the current /proc-derived
+      * state and exit. Mirrors probeAgentProcess's docker-mode shortcut. */
+     dockerMode?: boolean
+     /** Test override — defaults to the real probeAgentProcessDocker(). */
+     dockerProbeImpl?: () => ProbeResult
    } = {},
  ): AsyncGenerator<ProbeResult> {
+   if (opts.dockerMode) {
+     const impl = opts.dockerProbeImpl ?? probeAgentProcessDocker
+     yield impl()
+     return
+   }
    const liveWindowMs = opts.liveWindowMs ?? AGENT_LIVE_WINDOW_MS
    const pollIntervalMs = opts.pollIntervalMs ?? AGENT_LIVE_POLL_INTERVAL_MS
    const followupRepollMs = opts.followupRepollMs ?? AGENT_LIVE_FOLLOWUP_REPOLL_MS
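watchAgentProcess mirrors that shortcut: under dockerMode the generator yields a single /proc-derived ProbeResult and completes, so consumers keep one code path. A sketch under the same assumed signature as above:

    import { watchAgentProcess } from './boot-probes'

    for await (const r of watchAgentProcess('main', { dockerMode: true })) {
      console.log(r.label, r.status, r.detail)  // exactly one iteration under dockerMode
    }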
@@ -767,97 +930,372 @@ export async function probeHindsight(
    })())
  }

- // ─── Probe: Cron timers ──────────────────────────────────────────────────────
+ // ─── Probe: Scheduler (in-container agent-scheduler since Phase 4) ───────────

- interface SystemctlTimerEntry {
-   next?: string
-   left?: string
-   last?: string
-   unit?: string
-   activates?: string
-   passed?: string
+ /**
+  * Default lock and audit-jsonl paths inside the agent container.
+  * Mirrored from src/agent-scheduler/index.ts:194-197 — kept in sync there.
+  */
+ const SCHEDULER_LOCK_PATH_DEFAULT = '/state/agent/scheduler.lock'
+ const SCHEDULER_JSONL_PATH_DEFAULT = '/state/agent/scheduler.jsonl'
+
+ /**
+  * How long after PID 1 started we treat a missing/dead scheduler as
+  * "still settling" rather than a hard fail. Boot-card already has its
+  * own 6 s settle window before probes run, so this only matters for
+  * /status hits during the first ~30 s of a container's life — long
+  * enough to cover supervisor + bun startup on a slow host without
+  * hiding a genuinely wedged scheduler.
+  */
+ const SCHEDULER_FRESH_BOOT_MS = 30_000
+
+ /**
+  * Read PID 1's start time inside the container (ms since epoch). Used
+  * to soften scheduler probe verdicts during the early-boot window.
+  * Mirrors `readContainerBootTimeMs` from src/agent-scheduler/lock.ts —
+  * we duplicate the small reader here rather than import across the
+  * src/telegram-plugin boundary, since the plugin is built standalone.
+  *
+  * Returns null on any /proc parse failure → caller skips the softening.
+  */
+ function readContainerBootTimeMsForProbe(): number | null {
+   try {
+     const stat1 = readFileSync('/proc/1/stat', 'utf8')
+     const lastParen = stat1.lastIndexOf(')')
+     if (lastParen < 0) return null
+     const after = stat1.slice(lastParen + 1).trim().split(/\s+/)
+     const starttimeTicks = Number(after[19])
+     if (!Number.isFinite(starttimeTicks)) return null
+     const procStat = readFileSync('/proc/stat', 'utf8')
+     const btimeLine = procStat.split('\n').find((l) => l.startsWith('btime '))
+     if (!btimeLine) return null
+     const btimeSec = Number(btimeLine.split(/\s+/)[1])
+     if (!Number.isFinite(btimeSec)) return null
+     const CLK_TCK = 100
+     return (btimeSec + starttimeTicks / CLK_TCK) * 1000
+   } catch {
+     return null
+   }
  }

- function parseTimerLeft(left: string | undefined): number | null {
-   if (!left) return null
-   // format: "1h 32min left" or "2min 5s left" or similar
-   let ms = 0
-   const h = left.match(/(\d+)h/)
-   const m = left.match(/(\d+)min/)
-   const s = left.match(/(\d+)s/)
-   if (h) ms += Number(h[1]) * 3600_000
-   if (m) ms += Number(m[1]) * 60_000
-   if (s) ms += Number(s[1]) * 1000
-   return ms > 0 ? ms : null
+ /**
+  * Filesystem injection point for the scheduler probe. Same shape as
+  * ProcFsImpl but read-only against arbitrary paths. Tests inject a
+  * synthetic fs to drive lockfile contents and jsonl tails without
+  * touching disk.
+  */
+ export interface SchedulerFsImpl {
+   readFile: (path: string) => string
+   /** stat-mtime, ms-since-epoch. Used to age the audit jsonl. */
+   mtimeMs: (path: string) => number
+   exists: (path: string) => boolean
  }

- export async function probeCronTimers(
-   agentName: string,
-   opts: { execFileImpl?: ExecFileFnType } = {},
+ const realSchedulerFs: SchedulerFsImpl = {
+   readFile: (p) => readFileSync(p, 'utf-8'),
+   mtimeMs: (p) => {
+     // `existsSync` shaped path keeps the probe defensive — caller checks
+     // exists() first. statSync is imported via the readdirSync chain.
+     // eslint-disable-next-line @typescript-eslint/no-require-imports
+     const { statSync } = require('fs') as typeof import('fs')
+     return statSync(p).mtimeMs
+   },
+   exists: (p) => existsSync(p),
+ }
+
+ /**
+  * Probe the in-container agent-scheduler (cron-fold-in cutover, Phase 4
+  * — see CLAUDE.md "Cron-fold-in note"). Replaces the pre-Phase-4 probe
+  * that queried `systemctl --user list-timers switchroom-<agent>-cron-*`
+  * (those timers no longer exist) and the dockerMode short-circuit that
+  * lied with "managed by switchroom-cron" (that container was retired in
+  * PR #893).
+  *
+  * The scheduler is a sibling sidecar started by start.sh's
+  * _switchroom_supervise wrapper. It writes a pidfile-with-liveness lock
+  * at /state/agent/scheduler.lock (src/agent-scheduler/lock.ts) and an
+  * audit row per fire to /state/agent/scheduler.jsonl
+  * (src/agent-scheduler/index.ts:256, src/scheduler/audit.ts).
+  *
+  * ok       — lockfile present, holder PID alive
+  * degraded — lockfile present but PID dead (supervisor mid-restart, or
+  *            sched crashed and supervisor hasn't relaunched yet)
+  * fail     — lockfile missing (sidecar never started or supervisor
+  *            gave up after restart-cap)
+  *
+  * Outside dockerMode the probe is silent (returns ok with "n/a"). Phase
+  * 4 deleted the host-side scheduler entirely; non-docker callers
+  * (legacy systemd installs, tests) have no scheduler to probe.
+  */
+ export async function probeScheduler(
+   _agentName: string,
+   opts: {
+     dockerMode?: boolean
+     fs?: SchedulerFsImpl
+     /** Override the lockfile path. Defaults to env
+      * `SWITCHROOM_AGENT_SCHEDULER_LOCK` (matches the override the
+      * scheduler itself reads at src/agent-scheduler/index.ts:196), then
+      * to `/state/agent/scheduler.lock`. */
+     lockPath?: string
+     /** Override the audit-jsonl path. Defaults to env
+      * `SWITCHROOM_AGENT_SCHEDULER_JSONL`, then to
+      * `/state/agent/scheduler.jsonl` (mirrors index.ts:194). */
+     jsonlPath?: string
+     /** Liveness check for the holder PID — defaults to process.kill(pid, 0). */
+     isAlive?: (pid: number) => boolean
+     now?: () => number
+     /** Container PID-1 start time in ms since epoch. When set AND the
+      * current time is within `SCHEDULER_FRESH_BOOT_MS` of it, scheduler
+      * fail/degraded verdicts are softened to "still settling". Pass
+      * `null` to disable the softening (e.g. unit tests pinning a hard
+      * fail). Defaults to `readContainerBootTimeMsForProbe()`. */
+     containerBootTimeMs?: number | null
+   } = {},
  ): Promise<ProbeResult> {
-   const execFileFn: ExecFileFnType = opts.execFileImpl ?? execFile
-   return withTimeout('Crons', (async (): Promise<ProbeResult> => {
-     let stdout: string
+   if (!opts.dockerMode) {
+     return { status: 'ok', label: 'Scheduler', detail: 'n/a (non-docker)' }
+   }
+   return withTimeout('Scheduler', (async (): Promise<ProbeResult> => {
+     const fs = opts.fs ?? realSchedulerFs
+     const lockPath = opts.lockPath
+       ?? process.env.SWITCHROOM_AGENT_SCHEDULER_LOCK
+       ?? SCHEDULER_LOCK_PATH_DEFAULT
+     const jsonlPath = opts.jsonlPath
+       ?? process.env.SWITCHROOM_AGENT_SCHEDULER_JSONL
+       ?? SCHEDULER_JSONL_PATH_DEFAULT
+     const now = opts.now ?? Date.now
+     const isAlive = opts.isAlive ?? ((pid: number) => {
+       try { process.kill(pid, 0); return true } catch { return false }
+     })
+     const bootTimeMs = 'containerBootTimeMs' in opts
+       ? opts.containerBootTimeMs
+       : readContainerBootTimeMsForProbe()
+     const stillSettling = bootTimeMs != null
+       && (now() - bootTimeMs) < SCHEDULER_FRESH_BOOT_MS
+     const settlingNote = stillSettling ? ' (still settling)' : ''
+
+     if (!fs.exists(lockPath)) {
+       // During the first ~30 s of a container's life, "no lockfile" is
+       // the supervisor + bun still starting up. /status hit at that
+       // moment shouldn't show 🔴 for a non-issue.
+       return {
+         status: stillSettling ? 'degraded' : 'fail',
+         label: 'Scheduler',
+         detail: `sidecar not running (no lockfile)${settlingNote}`,
+       }
+     }
+     let holderPid: number | null = null
      try {
-       const result = await execFileFn('systemctl', [
-         '--user', 'list-timers',
-         `switchroom-${agentName}-cron-*`,
-         '--output=json',
-         '--all',
-       ])
-       stdout = result.stdout.trim()
-     } catch (err: unknown) {
-       // systemctl exits non-zero when no units match
-       const msg = (err as NodeJS.ErrnoException)?.message ?? String(err)
-       // child_process exec errors have `code` typed as string in
-       // NodeJS.ErrnoException, but at runtime it's numeric for shell
-       // exit codes. Stringify to avoid the type-system mismatch and
-       // the comparison "looks unintentional" warning.
-       if (msg.includes('No timers found') || String((err as NodeJS.ErrnoException)?.code) === '1') {
-         return { status: 'ok', label: 'Crons', detail: '0 timers' }
+       const raw = fs.readFile(lockPath).trim()
+       const parsed = Number.parseInt(raw, 10)
+       if (Number.isInteger(parsed) && parsed > 0) holderPid = parsed
+     } catch {
+       return { status: 'degraded', label: 'Scheduler', detail: 'lockfile unreadable' }
+     }
+     if (holderPid == null) {
+       return { status: 'degraded', label: 'Scheduler', detail: 'lockfile contents invalid' }
+     }
+     if (!isAlive(holderPid)) {
+       return {
+         status: 'degraded',
+         label: 'Scheduler',
+         detail: `lock holder pid ${holderPid} not alive (supervisor restart in progress?)`,
        }
-       return { status: 'fail', label: 'Crons', detail: `systemctl failed: ${msg}` }
      }

-     if (!stdout || stdout === '[]' || stdout.length === 0) {
-       return { status: 'ok', label: 'Crons', detail: '0 timers' }
+     // Sidecar is up. A freshness hint from scheduler.jsonl, if present,
+     // gives the user signal that fires are actually happening, not just
+     // that the daemon is breathing. Absence is fine: a freshly booted
+     // agent or a 0-entry agent has no fires to report.
+     let detail = `running (pid ${holderPid})`
+     if (fs.exists(jsonlPath)) {
+       try {
+         const ageMs = now() - fs.mtimeMs(jsonlPath)
+         if (Number.isFinite(ageMs) && ageMs >= 0) {
+           detail += ` · last fire ${formatMs(ageMs)} ago`
+         }
+       } catch {
+         // mtime read failed — keep the basic detail; non-blocking.
+       }
      }
+     return { status: 'ok', label: 'Scheduler', detail }
+   })())
+ }
+
+ // ─── Probe: Vault broker / approval kernel reachability ──────────────────────

-   let timers: SystemctlTimerEntry[] = []
+ /**
+  * Generic UDS-reachability probe used for both vault-broker and
+  * approval-kernel. Path-as-identity invariant (CLAUDE.md "Per-agent
+  * socket model") — bind paths are mounted into each agent container at
+  * /run/switchroom/{broker,kernel}/<agent>/sock. ENOENT means the
+  * compose volume isn't mounted (broker container down or no agent dir
+  * yet); ECONNREFUSED means the bind disappeared between us and the
+  * daemon (rare, broker shutdown removes the socket).
+  *
+  * Connect-test only — we do NOT send a wire request. The probe must not
+  * authenticate as the agent or do any vault/grant work; that's the
+  * agent's job. We just want to know "is something listening on this
+  * socket". Connection is closed immediately on success.
+  */
+ async function probeUds(
+   label: string,
+   socketPath: string | undefined,
+   opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
+ ): Promise<ProbeResult> {
+   if (!opts.dockerMode) {
+     return { status: 'ok', label, detail: 'n/a (non-docker)' }
+   }
+   if (!socketPath) {
+     return { status: 'fail', label, detail: 'socket path not configured' }
+   }
+   return withTimeout(label, (async (): Promise<ProbeResult> => {
+     if (!opts.connectImpl) {
+       // Cheap pre-check: stat the file. Saves the connect round-trip on
+       // the common "broker container down → bind mount empty" case.
+       if (!existsSync(socketPath)) {
+         return { status: 'fail', label, detail: `socket missing: ${socketPath}` }
+       }
+     }
+     const connect = opts.connectImpl ?? defaultUdsConnect
      try {
-       timers = JSON.parse(stdout) as SystemctlTimerEntry[]
-     } catch {
-       // Fall back to line-count if JSON failed
-       const count = stdout.split('\n').filter(l => l.includes('cron')).length
-       return { status: 'ok', label: 'Crons', detail: `${count} timers` }
+       await connect(socketPath)
+       return { status: 'ok', label, detail: 'reachable' }
+     } catch (err: unknown) {
+       const code = (err as NodeJS.ErrnoException)?.code
+       const msg = (err as Error)?.message ?? String(err)
+       if (code === 'ENOENT') return { status: 'fail', label, detail: 'socket missing' }
+       if (code === 'ECONNREFUSED') return { status: 'fail', label, detail: 'connection refused' }
+       return { status: 'fail', label, detail: `connect failed: ${msg}` }
      }
+   })())
+ }

-   if (!Array.isArray(timers) || timers.length === 0) {
-     return { status: 'ok', label: 'Crons', detail: '0 timers' }
-   }
+ /**
+  * Default UDS connect — opens a stream, then immediately closes it.
+  * Resolves on `connect` event, rejects on `error`. 1s connect timeout
+  * is plenty for a local socket (the per-probe timeout in withTimeout
+  * is the outer guard).
+  */
+ function defaultUdsConnect(socketPath: string): Promise<void> {
+   // eslint-disable-next-line @typescript-eslint/no-require-imports
+   const net = require('net') as typeof import('net')
+   return new Promise<void>((resolve, reject) => {
+     const sock = net.createConnection({ path: socketPath })
+     const t = setTimeout(() => {
+       sock.destroy()
+       reject(new Error('connect timeout'))
+     }, 1000)
+     sock.once('connect', () => {
+       clearTimeout(t)
+       sock.end()
+       resolve()
+     })
+     sock.once('error', (err) => {
+       clearTimeout(t)
+       sock.destroy()
+       reject(err)
+     })
+   })
+ }
+
+ export async function probeBroker(
+   socketPath?: string,
+   opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
+ ): Promise<ProbeResult> {
+   // SWITCHROOM_VAULT_BROKER_SOCK is the canonical client-side env name
+   // — matches what src/vault/broker/client.ts:293 and the secret-guard
+   // hook (telegram-plugin/hooks/secret-guard-pretool.mjs:36) read.
+   // The broker SERVER reads SWITCHROOM_BROKER_SOCKET as its bind-path
+   // env (in the broker container only). Pre-fix the probe + compose
+   // both used SWITCHROOM_BROKER_SOCKET in the agent container — wrong
+   // name, fell through to dangling-symlink fallback, false-failed.
+   return probeUds('Broker', socketPath ?? process.env.SWITCHROOM_VAULT_BROKER_SOCK, opts)
+ }
+
+ export async function probeKernel(
+   socketPath?: string,
+   opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
+ ): Promise<ProbeResult> {
+   return probeUds('Kernel', socketPath ?? process.env.SWITCHROOM_KERNEL_SOCKET, opts)
+ }

-   // Find the timer that fires soonest
-   let earliest: { name: string; leftMs: number } | null = null
-   for (const t of timers) {
-     const ms = parseTimerLeft(t.left)
-     const name = (t.unit ?? t.activates ?? '').replace(/^switchroom-[^-]+-cron-/, '').replace(/\.timer$/, '')
-     if (ms != null && (earliest == null || ms < earliest.leftMs)) {
-       earliest = { name, leftMs: ms }
+ // ─── Probe: Skills (symlink validity) ────────────────────────────────────────
+
+ /**
+  * Validate that every entry under <agentDir>/.claude/skills/ resolves
+  * to a readable file. Skills are normally symlinks into the global pool
+  * `~/.switchroom/skills/` (src/agents/scaffold.ts:639); a renamed or
+  * deleted skill in the pool dangles silently — claude won't surface the
+  * skill, the user wonders why /<skill> doesn't work.
+  *
+  * ok       — every entry resolves OR the dir doesn't exist (no skills
+  *            configured is a normal state, not a failure)
+  * degraded — at least one symlink dangles; rendered detail names them
+  *            up to a cap so the row doesn't wrap forever
+  */
+ export async function probeSkills(
+   agentDir: string,
+   opts: { fs?: SkillsFsImpl; maxNamesShown?: number } = {},
+ ): Promise<ProbeResult> {
+   return withTimeout('Skills', (async (): Promise<ProbeResult> => {
+     const fs = opts.fs ?? realSkillsFs
+     const max = opts.maxNamesShown ?? 3
+     const skillsDir = join(agentDir, '.claude', 'skills')
+     if (!fs.exists(skillsDir)) {
+       return { status: 'ok', label: 'Skills', detail: 'no skills dir' }
+     }
+     let entries: string[]
+     try {
+       entries = fs.readdir(skillsDir)
+     } catch {
+       return { status: 'degraded', label: 'Skills', detail: 'skills dir unreadable' }
+     }
+     if (entries.length === 0) {
+       return { status: 'ok', label: 'Skills', detail: '0 skills' }
+     }
+     const dangling: string[] = []
+     for (const name of entries) {
+       // Skills are dirs containing a SKILL.md (claude convention). The
+       // dangle case we worry about is a symlink whose target was
+       // removed — readability of <name>/SKILL.md is the simplest proxy
+       // and matches what claude itself would discover.
+       const skillPath = join(skillsDir, name)
+       if (!fs.exists(skillPath)) {
+         dangling.push(name)
+         continue
+       }
+       // Single-file skills exist (rare but allowed); accept them too.
+       const skillMd = join(skillPath, 'SKILL.md')
+       if (!fs.exists(skillMd) && !fs.exists(skillPath + '.md')) {
+         // Only flag as dangling if the entry IS a symlink (a real dir
+         // without SKILL.md is weird but not necessarily broken — could
+         // be an in-progress local skill). We have no symlink-test in
+         // SkillsFsImpl by design; conservatively don't flag as dangling.
+         // The user's main risk is removed-pool-target, which existsSync
+         // catches above.
+         continue
        }
      }
-
-     const count = timers.length
-     if (!earliest) {
-       return { status: 'ok', label: 'Crons', detail: `${count} timers` }
+     if (dangling.length === 0) {
+       return { status: 'ok', label: 'Skills', detail: `${entries.length} resolved` }
      }
-
-     const h = Math.floor(earliest.leftMs / 3600_000)
-     const m = Math.round((earliest.leftMs % 3600_000) / 60_000)
-     const timeStr = h > 0 ? `${h}h ${m}m` : `${m}m`
+     const named = dangling.slice(0, max).join(', ')
+     const more = dangling.length > max ? ` +${dangling.length - max} more` : ''
      return {
-       status: 'ok',
-       label: 'Crons',
-       detail: `${count} timers · next: ${earliest.name} in ${timeStr}`,
+       status: 'degraded',
+       label: 'Skills',
+       detail: `${dangling.length}/${entries.length} dangling: ${named}${more}`,
      }
    })())
  }
+
+ export interface SkillsFsImpl {
+   readdir: (p: string) => string[]
+   exists: (p: string) => boolean
+ }
+
+ const realSkillsFs: SkillsFsImpl = {
+   readdir: (p) => readdirSync(p),
+   exists: (p) => existsSync(p),
+ }
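The SchedulerFsImpl, isAlive, now, and containerBootTimeMs seams make probeScheduler fully drivable without a container. A minimal happy-path sketch (all values illustrative, not taken from the package's tests):

    import { probeScheduler, type SchedulerFsImpl } from './boot-probes'

    const t0 = 1_700_000_000_000
    const fakeFs: SchedulerFsImpl = {
      exists: () => true,                 // lockfile and jsonl both present
      readFile: () => '4321\n',           // scheduler.lock: holder pid
      mtimeMs: () => t0 - 90_000,         // scheduler.jsonl last touched 90 s ago
    }

    const res = await probeScheduler('main', {
      dockerMode: true,
      fs: fakeFs,
      isAlive: () => true,                // pretend pid 4321 is alive
      now: () => t0,
      containerBootTimeMs: null,          // opt out of the fresh-boot softening
    })
    // res.status === 'ok' and res.detail starts with 'running (pid 4321)',
    // with a '· last fire … ago' suffix derived from the jsonl mtime.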