switchroom 0.5.0 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +142 -121
  2. package/bin/autoaccept.exp +29 -6
  3. package/dist/agent-scheduler/index.js +12261 -0
  4. package/dist/cli/autoaccept-poll.js +10 -0
  5. package/dist/cli/switchroom.js +27250 -25324
  6. package/dist/vault/approvals/kernel-server.js +12709 -0
  7. package/dist/vault/broker/server.js +15724 -0
  8. package/package.json +4 -3
  9. package/profiles/_base/start.sh.hbs +133 -0
  10. package/profiles/_shared/telegram-style.md.hbs +3 -3
  11. package/profiles/default/CLAUDE.md +3 -3
  12. package/profiles/default/CLAUDE.md.hbs +2 -2
  13. package/profiles/default/workspace/CLAUDE.md.hbs +9 -0
  14. package/skills/docx/VENDORED.md +1 -1
  15. package/skills/mcp-builder/VENDORED.md +1 -1
  16. package/skills/pdf/VENDORED.md +1 -1
  17. package/skills/pptx/VENDORED.md +1 -1
  18. package/skills/skill-creator/VENDORED.md +1 -1
  19. package/skills/switchroom-architecture/SKILL.md +8 -7
  20. package/skills/switchroom-cli/SKILL.md +23 -15
  21. package/skills/switchroom-health/SKILL.md +7 -7
  22. package/skills/switchroom-install/SKILL.md +36 -39
  23. package/skills/switchroom-manage/SKILL.md +4 -4
  24. package/skills/switchroom-status/SKILL.md +1 -1
  25. package/skills/webapp-testing/VENDORED.md +1 -1
  26. package/skills/xlsx/VENDORED.md +1 -1
  27. package/telegram-plugin/admin-commands/dispatch.test.ts +119 -1
  28. package/telegram-plugin/admin-commands/index.ts +71 -0
  29. package/telegram-plugin/ask-user.ts +1 -0
  30. package/telegram-plugin/card-event-log.ts +138 -0
  31. package/telegram-plugin/dist/bridge/bridge.js +178 -31
  32. package/telegram-plugin/dist/foreman/foreman.js +6875 -6526
  33. package/telegram-plugin/dist/gateway/gateway.js +13862 -11834
  34. package/telegram-plugin/dist/server.js +202 -40
  35. package/telegram-plugin/fleet-state.ts +25 -10
  36. package/telegram-plugin/foreman/foreman.ts +38 -3
  37. package/telegram-plugin/gateway/approval-callback.ts +126 -0
  38. package/telegram-plugin/gateway/approval-card.test.ts +90 -0
  39. package/telegram-plugin/gateway/approval-card.ts +127 -0
  40. package/telegram-plugin/gateway/approvals-commands.ts +126 -0
  41. package/telegram-plugin/gateway/boot-card.ts +31 -6
  42. package/telegram-plugin/gateway/boot-probes.ts +510 -72
  43. package/telegram-plugin/gateway/gateway.ts +822 -94
  44. package/telegram-plugin/gateway/ipc-protocol.ts +34 -1
  45. package/telegram-plugin/gateway/ipc-server.ts +35 -0
  46. package/telegram-plugin/gateway/startup-mutex.ts +110 -2
  47. package/telegram-plugin/hooks/hooks.json +19 -0
  48. package/telegram-plugin/hooks/tool-label-pretool.mjs +216 -0
  49. package/telegram-plugin/hooks/tool-label-stop.mjs +63 -0
  50. package/telegram-plugin/package.json +4 -1
  51. package/telegram-plugin/plugin-logger.ts +20 -1
  52. package/telegram-plugin/progress-card-driver.ts +202 -13
  53. package/telegram-plugin/progress-card.ts +2 -2
  54. package/telegram-plugin/quota-check.ts +1 -0
  55. package/telegram-plugin/registry/subagents-schema.ts +37 -0
  56. package/telegram-plugin/registry/subagents.test.ts +64 -0
  57. package/telegram-plugin/session-tail.ts +58 -5
  58. package/telegram-plugin/shared/bot-runtime.ts +48 -2
  59. package/telegram-plugin/subagent-watcher.ts +139 -7
  60. package/telegram-plugin/tests/_progress-card-harness.ts +4 -0
  61. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +201 -0
  62. package/telegram-plugin/tests/boot-card-probe-target.test.ts +10 -34
  63. package/telegram-plugin/tests/boot-card-render.test.ts +6 -5
  64. package/telegram-plugin/tests/boot-probes.test.ts +564 -0
  65. package/telegram-plugin/tests/card-event-log.test.ts +145 -0
  66. package/telegram-plugin/tests/gateway-startup-mutex.test.ts +102 -0
  67. package/telegram-plugin/tests/ipc-server-validate-inject-inbound.test.ts +134 -0
  68. package/telegram-plugin/tests/progress-card-delay-842.test.ts +160 -0
  69. package/telegram-plugin/tests/quota-check.test.ts +37 -1
  70. package/telegram-plugin/tests/subagent-registry-bugs.test.ts +5 -0
  71. package/telegram-plugin/tests/subagent-watcher-stall-notification.test.ts +104 -1
  72. package/telegram-plugin/tests/subagent-watcher.test.ts +5 -0
  73. package/telegram-plugin/tests/tool-label-sidecar.test.ts +114 -0
  74. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +5 -3
  75. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +10 -0
  76. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +58 -14
  77. package/telegram-plugin/tests/welcome-text.test.ts +57 -0
  78. package/telegram-plugin/tool-label-sidecar.ts +140 -0
  79. package/telegram-plugin/tool-labels.ts +55 -0
  80. package/telegram-plugin/two-zone-card.ts +27 -7
  81. package/telegram-plugin/uat/SETUP.md +160 -0
  82. package/telegram-plugin/uat/assertions.ts +140 -0
  83. package/telegram-plugin/uat/driver.ts +174 -0
  84. package/telegram-plugin/uat/harness.ts +161 -0
  85. package/telegram-plugin/uat/login.ts +134 -0
  86. package/telegram-plugin/uat/port-allocator.ts +71 -0
  87. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +61 -0
  88. package/telegram-plugin/welcome-text.ts +44 -2
  89. package/bin/bridge-watchdog.sh +0 -967
package/telegram-plugin/gateway/boot-probes.ts

@@ -11,7 +11,7 @@
   * caller as a thrown error — only as ProbeResult{ status:'fail', ... }.
   */

- import { readFileSync, existsSync, mkdirSync, writeFileSync } from 'fs'
+ import { readFileSync, readdirSync, existsSync, mkdirSync, writeFileSync } from 'fs'
  import { join } from 'path'
  import { execFile as execFileCb } from 'child_process'
  import { promisify } from 'util'
@@ -251,6 +251,148 @@ type ExecFileFnType = (
    args: string[],
  ) => Promise<ExecFileResult>

+ /**
+  * Filesystem injection point for the docker-mode /proc walk so tests can
+  * drive synthetic `/proc/<pid>/{comm,stat,status}` strings without
+  * touching the real host fs.
+  */
+ export interface ProcFsImpl {
+   readdir: (path: string) => string[]
+   readFile: (path: string) => string
+ }
+
+ const realProcFs: ProcFsImpl = {
+   readdir: (p) => readdirSync(p),
+   readFile: (p) => readFileSync(p, 'utf-8'),
+ }
+
+ type AgentCandidate = {
+   pid: number
+   rssKb: number
+   comm: string
+   starttime: number
+ }
+
+ /**
+  * Walk `/proc` from inside the current pid-namespace and pick the
+  * heaviest claude/node process. Used for the docker-mode agent probe:
+  * inside an agent container, we share the namespace with claude, so a
+  * /proc walk replaces the systemctl-driven cgroup walk used under
+  * systemd. Skips wrappers (tmux/expect/script/bash/sh) and our own
+  * gateway PID. Exported for tests.
+  */
+ export function findAgentProcessInContainer(
+   fs: ProcFsImpl = realProcFs,
+ ): AgentCandidate | null {
+   let entries: string[]
+   try {
+     entries = fs.readdir('/proc')
+   } catch {
+     return null
+   }
+   const candidates: AgentCandidate[] = []
+   for (const entry of entries) {
+     if (!/^\d+$/.test(entry)) continue
+     const pid = Number(entry)
+     if (!Number.isFinite(pid) || pid <= 0) continue
+     if (pid === process.pid) continue
+     let comm = ''
+     try {
+       comm = fs.readFile(`/proc/${pid}/comm`).trim()
+     } catch {
+       continue
+     }
+     let rssKb = 0
+     try {
+       const status = fs.readFile(`/proc/${pid}/status`)
+       const m = status.match(/^VmRSS:\s+(\d+)/m)
+       if (m) rssKb = parseInt(m[1], 10) || 0
+     } catch {
+       continue
+     }
+     let starttime = 0
+     try {
+       const stat = fs.readFile(`/proc/${pid}/stat`)
+       // /proc/<pid>/stat format: pid (comm-with-parens) state ppid ...
+       // field 22 (1-indexed) is starttime in clock ticks since boot.
+       // comm can contain spaces/parens — use the LAST ')' as the
+       // anchor so we tokenize the remainder safely.
+       const close = stat.lastIndexOf(')')
+       const tail = close >= 0 ? stat.slice(close + 2) : stat
+       const fields = tail.trim().split(/\s+/)
+       // After the "(comm)" group, the remaining fields are state, ppid,
+       // ... with starttime at index 19 (0-indexed) of `tail` because
+       // field 3 (state) is `tail[0]`.
+       const st = Number(fields[19])
+       if (Number.isFinite(st) && st > 0) starttime = st
+     } catch {
+       continue
+     }
+     candidates.push({ pid, rssKb, comm, starttime })
+   }
+   if (candidates.length === 0) return null
+
+   const isAgent = (c: AgentCandidate): boolean => c.comm === 'claude'
+   const isWrapper = (c: AgentCandidate): boolean =>
+     c.comm === 'tmux' || c.comm.startsWith('tmux:') ||
+     c.comm === 'expect' || c.comm === 'script' ||
+     c.comm === 'bash' || c.comm === 'sh' ||
+     c.comm === 'tini' || c.comm === 'sleep'
+
+   const claudeMatches = candidates.filter(isAgent)
+   if (claudeMatches.length > 0) {
+     claudeMatches.sort((a, b) => b.rssKb - a.rssKb)
+     return claudeMatches[0]
+   }
+   // No `claude` comm — fall back to heaviest non-wrapper node process.
+   const nodeMatches = candidates
+     .filter(c => c.comm === 'node' && !isWrapper(c))
+     .sort((a, b) => b.rssKb - a.rssKb)
+   if (nodeMatches.length > 0) return nodeMatches[0]
+   return null
+ }
+
+ /**
+  * Read /proc/uptime to derive the agent process's uptime from its
+  * starttime (clock ticks since boot). Returns null on any failure.
+  *
+  * SC_CLK_TCK (the units of `starttime` in /proc/<pid>/stat) is a stable
+  * kernel ABI value, hardcoded to 100 on x86_64 across Debian/Ubuntu/
+  * Alpine/RHEL. If we ever ship on arm64 hosts where some kernels use
+  * 250, uptimes will look 2.5× too large and we'll revisit.
+  */
+ export function uptimeMsForStarttime(
+   starttimeTicks: number,
+   fs: ProcFsImpl = realProcFs,
+ ): number | null {
+   try {
+     const uptimeRaw = fs.readFile('/proc/uptime').trim()
+     const bootUptimeSec = Number(uptimeRaw.split(/\s+/)[0])
+     if (!Number.isFinite(bootUptimeSec) || bootUptimeSec <= 0) return null
+     const HZ = 100
+     const procUptimeSec = bootUptimeSec - starttimeTicks / HZ
+     if (procUptimeSec < 0) return null
+     return Math.round(procUptimeSec * 1000)
+   } catch {
+     return null
+   }
+ }
+
+ function probeAgentProcessDocker(): ProbeResult {
+   const found = findAgentProcessInContainer()
+   if (!found) {
+     return { status: 'fail', label: 'Agent', detail: 'claude process not found' }
+   }
+   const uptimeMs = uptimeMsForStarttime(found.starttime)
+   const mb = Math.round(found.rssKb / 1024)
+   const parts = [
+     `PID ${found.pid}`,
+     uptimeMs != null ? `up ${formatMs(uptimeMs)}` : '',
+     mb > 0 ? `${mb} MB` : '',
+   ].filter(Boolean)
+   return { status: 'ok', label: 'Agent', detail: parts.join(' · ') }
+ }
+
  /**
   * Resolve the "real" agent PID under tmux supervisor by walking the
   * unit's cgroup and picking the heaviest-RSS claude/node process.
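The ProcFsImpl seam above is what the new boot-probes tests lean on. A minimal sketch of driving findAgentProcessInContainer with synthetic /proc contents (fake pids, fabricated stat/status strings, and an illustrative import path; none of this is from the package's own test suite):

    import { findAgentProcessInContainer, type ProcFsImpl } from './boot-probes'

    // Two synthetic processes: a tiny tini wrapper and a heavy claude.
    // starttime is field 22 of /proc/<pid>/stat, i.e. index 19 after the ')'.
    const files: Record<string, string> = {
      '/proc/1/comm': 'tini\n',
      '/proc/1/status': 'VmRSS:\t    1024 kB\n',
      '/proc/1/stat': '1 (tini) S 0 1 1 0 -1 4194560 0 0 0 0 0 0 0 0 20 0 1 0 50 0 0',
      '/proc/42/comm': 'claude\n',
      '/proc/42/status': 'VmRSS:\t  412340 kB\n',
      '/proc/42/stat': '42 (claude) S 1 42 42 0 -1 4194304 0 0 0 0 9 3 0 0 20 0 11 0 1200 0 0',
    }
    const fakeProc: ProcFsImpl = {
      readdir: () => ['1', '42', 'self'],  // non-numeric entries are filtered out
      readFile: (p) => {
        const v = files[p]
        if (v === undefined) throw new Error('ENOENT')
        return v
      },
    }

    const agent = findAgentProcessInContainer(fakeProc)
    // → { pid: 42, rssKb: 412340, comm: 'claude', starttime: 1200 }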
@@ -371,8 +513,19 @@ export async function probeAgentProcess(
      /** When true, resolve PID via cgroup walk (heaviest claude/node) — under
       * tmux supervisor MainPID is the tmux server (~2MB) which is misleading. */
      tmuxSupervisor?: boolean
+     /** When true, skip systemctl entirely. The gateway is running INSIDE the
+      * agent container alongside claude, so we walk /proc directly. There's
+      * no "service deactivating/activating" model under docker — claude is
+      * either there or it isn't, so we return single-shot without retry. */
+     dockerMode?: boolean
+     /** Test override — defaults to the real probeAgentProcessDocker(). */
+     dockerProbeImpl?: () => ProbeResult
    } = {},
  ): Promise<ProbeResult> {
+   if (opts.dockerMode) {
+     const impl = opts.dockerProbeImpl ?? probeAgentProcessDocker
+     return withTimeout('Agent', Promise.resolve(impl()))
+   }
    const retryIntervalMs = opts.retryIntervalMs ?? AGENT_RETRY_INTERVAL_MS
    const retryMaxMs = opts.retryMaxMs ?? AGENT_RETRY_MAX_MS
    const sleep = opts.sleepImpl ?? ((ms: number) => new Promise(resolve => setTimeout(resolve, ms)))
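With dockerMode set, probeAgentProcess resolves in one shot instead of entering the systemctl retry loop. A minimal usage sketch, assuming the first parameter is the agent name as in the other probes in this file; the dockerProbeImpl stub is hypothetical:

    import { probeAgentProcess } from './boot-probes'

    const result = await probeAgentProcess('main', {
      dockerMode: true,
      // Hypothetical stub; the real default walks /proc via probeAgentProcessDocker().
      dockerProbeImpl: () => ({ status: 'ok' as const, label: 'Agent', detail: 'PID 42 · up 2m · 402 MB' }),
    })
    // Single shot: no retryIntervalMs / retryMaxMs polling, only the withTimeout guard.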
@@ -469,8 +622,18 @@ export async function* watchAgentProcess(
      nowImpl?: () => number
      /** When true, resolve PID via cgroup walk (heaviest claude/node). */
      tmuxSupervisor?: boolean
+     /** When true, skip systemctl: yield once with the current /proc-derived
+      * state and exit. Mirrors probeAgentProcess's docker-mode shortcut. */
+     dockerMode?: boolean
+     /** Test override — defaults to the real probeAgentProcessDocker(). */
+     dockerProbeImpl?: () => ProbeResult
    } = {},
  ): AsyncGenerator<ProbeResult> {
+   if (opts.dockerMode) {
+     const impl = opts.dockerProbeImpl ?? probeAgentProcessDocker
+     yield impl()
+     return
+   }
    const liveWindowMs = opts.liveWindowMs ?? AGENT_LIVE_WINDOW_MS
    const pollIntervalMs = opts.pollIntervalMs ?? AGENT_LIVE_POLL_INTERVAL_MS
    const followupRepollMs = opts.followupRepollMs ?? AGENT_LIVE_FOLLOWUP_REPOLL_MS
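watchAgentProcess mirrors that shortcut: under dockerMode the generator yields a single /proc-derived ProbeResult and completes, so consumers keep one code path. A sketch under the same assumed signature as above:

    import { watchAgentProcess } from './boot-probes'

    for await (const r of watchAgentProcess('main', { dockerMode: true })) {
      console.log(r.label, r.status, r.detail)  // exactly one iteration under dockerMode
    }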
@@ -767,97 +930,372 @@ export async function probeHindsight(
    })())
  }

- // ─── Probe: Cron timers ──────────────────────────────────────────────────────
+ // ─── Probe: Scheduler (in-container agent-scheduler since Phase 4) ───────────

- interface SystemctlTimerEntry {
-   next?: string
-   left?: string
-   last?: string
-   unit?: string
-   activates?: string
-   passed?: string
+ /**
+  * Default lock and audit-jsonl paths inside the agent container.
+  * Mirrored from src/agent-scheduler/index.ts:194-197 — kept in sync there.
+  */
+ const SCHEDULER_LOCK_PATH_DEFAULT = '/state/agent/scheduler.lock'
+ const SCHEDULER_JSONL_PATH_DEFAULT = '/state/agent/scheduler.jsonl'
+
+ /**
+  * How long after PID 1 started we treat a missing/dead scheduler as
+  * "still settling" rather than a hard fail. Boot-card already has its
+  * own 6 s settle window before probes run, so this only matters for
+  * /status hits during the first ~30 s of a container's life — long
+  * enough to cover supervisor + bun startup on a slow host without
+  * hiding a genuinely wedged scheduler.
+  */
+ const SCHEDULER_FRESH_BOOT_MS = 30_000
+
+ /**
+  * Read PID 1's start time inside the container (ms since epoch). Used
+  * to soften scheduler probe verdicts during the early-boot window.
+  * Mirrors `readContainerBootTimeMs` from src/agent-scheduler/lock.ts —
+  * we duplicate the small reader here rather than import across the
+  * src/telegram-plugin boundary, since the plugin is built standalone.
+  *
+  * Returns null on any /proc parse failure → caller skips the softening.
+  */
+ function readContainerBootTimeMsForProbe(): number | null {
+   try {
+     const stat1 = readFileSync('/proc/1/stat', 'utf8')
+     const lastParen = stat1.lastIndexOf(')')
+     if (lastParen < 0) return null
+     const after = stat1.slice(lastParen + 1).trim().split(/\s+/)
+     const starttimeTicks = Number(after[19])
+     if (!Number.isFinite(starttimeTicks)) return null
+     const procStat = readFileSync('/proc/stat', 'utf8')
+     const btimeLine = procStat.split('\n').find((l) => l.startsWith('btime '))
+     if (!btimeLine) return null
+     const btimeSec = Number(btimeLine.split(/\s+/)[1])
+     if (!Number.isFinite(btimeSec)) return null
+     const CLK_TCK = 100
+     return (btimeSec + starttimeTicks / CLK_TCK) * 1000
+   } catch {
+     return null
+   }
  }

- function parseTimerLeft(left: string | undefined): number | null {
-   if (!left) return null
-   // format: "1h 32min left" or "2min 5s left" or similar
-   let ms = 0
-   const h = left.match(/(\d+)h/)
-   const m = left.match(/(\d+)min/)
-   const s = left.match(/(\d+)s/)
-   if (h) ms += Number(h[1]) * 3600_000
-   if (m) ms += Number(m[1]) * 60_000
-   if (s) ms += Number(s[1]) * 1000
-   return ms > 0 ? ms : null
+ /**
+  * Filesystem injection point for the scheduler probe. Same shape as
+  * ProcFsImpl but read-only against arbitrary paths. Tests inject a
+  * synthetic fs to drive lockfile contents and jsonl tails without
+  * touching disk.
+  */
+ export interface SchedulerFsImpl {
+   readFile: (path: string) => string
+   /** stat-mtime, ms-since-epoch. Used to age the audit jsonl. */
+   mtimeMs: (path: string) => number
+   exists: (path: string) => boolean
  }

- export async function probeCronTimers(
-   agentName: string,
-   opts: { execFileImpl?: ExecFileFnType } = {},
+ const realSchedulerFs: SchedulerFsImpl = {
+   readFile: (p) => readFileSync(p, 'utf-8'),
+   mtimeMs: (p) => {
+     // `existsSync` shaped path keeps the probe defensive — caller checks
+     // exists() first. statSync is imported via the readdirSync chain.
+     // eslint-disable-next-line @typescript-eslint/no-require-imports
+     const { statSync } = require('fs') as typeof import('fs')
+     return statSync(p).mtimeMs
+   },
+   exists: (p) => existsSync(p),
+ }
+
+ /**
+  * Probe the in-container agent-scheduler (cron-fold-in cutover, Phase 4
+  * — see CLAUDE.md "Cron-fold-in note"). Replaces the pre-Phase-4 probe
+  * that queried `systemctl --user list-timers switchroom-<agent>-cron-*`
+  * (those timers no longer exist) and the dockerMode short-circuit that
+  * lied with "managed by switchroom-cron" (that container was retired in
+  * PR #893).
+  *
+  * The scheduler is a sibling sidecar started by start.sh's
+  * _switchroom_supervise wrapper. It writes a pidfile-with-liveness lock
+  * at /state/agent/scheduler.lock (src/agent-scheduler/lock.ts) and an
+  * audit row per fire to /state/agent/scheduler.jsonl
+  * (src/agent-scheduler/index.ts:256, src/scheduler/audit.ts).
+  *
+  * ok       — lockfile present, holder PID alive
+  * degraded — lockfile present but PID dead (supervisor mid-restart, or
+  *            sched crashed and supervisor hasn't relaunched yet)
+  * fail     — lockfile missing (sidecar never started or supervisor
+  *            gave up after restart-cap)
+  *
+  * Outside dockerMode the probe is silent (returns ok with "n/a"). Phase
+  * 4 deleted the host-side scheduler entirely; non-docker callers
+  * (legacy systemd installs, tests) have no scheduler to probe.
+  */
+ export async function probeScheduler(
+   _agentName: string,
+   opts: {
+     dockerMode?: boolean
+     fs?: SchedulerFsImpl
+     /** Override the lockfile path. Defaults to env
+      * `SWITCHROOM_AGENT_SCHEDULER_LOCK` (matches the override the
+      * scheduler itself reads at src/agent-scheduler/index.ts:196), then
+      * to `/state/agent/scheduler.lock`. */
+     lockPath?: string
+     /** Override the audit-jsonl path. Defaults to env
+      * `SWITCHROOM_AGENT_SCHEDULER_JSONL`, then to
+      * `/state/agent/scheduler.jsonl` (mirrors index.ts:194). */
+     jsonlPath?: string
+     /** Liveness check for the holder PID — defaults to process.kill(pid, 0). */
+     isAlive?: (pid: number) => boolean
+     now?: () => number
+     /** Container PID-1 start time in ms since epoch. When set AND the
+      * current time is within `SCHEDULER_FRESH_BOOT_MS` of it, scheduler
+      * fail/degraded verdicts are softened to "still settling". Pass
+      * `null` to disable the softening (e.g. unit tests pinning a hard
+      * fail). Defaults to `readContainerBootTimeMsForProbe()`. */
+     containerBootTimeMs?: number | null
+   } = {},
  ): Promise<ProbeResult> {
-   const execFileFn: ExecFileFnType = opts.execFileImpl ?? execFile
-   return withTimeout('Crons', (async (): Promise<ProbeResult> => {
-     let stdout: string
+   if (!opts.dockerMode) {
+     return { status: 'ok', label: 'Scheduler', detail: 'n/a (non-docker)' }
+   }
+   return withTimeout('Scheduler', (async (): Promise<ProbeResult> => {
+     const fs = opts.fs ?? realSchedulerFs
+     const lockPath = opts.lockPath
+       ?? process.env.SWITCHROOM_AGENT_SCHEDULER_LOCK
+       ?? SCHEDULER_LOCK_PATH_DEFAULT
+     const jsonlPath = opts.jsonlPath
+       ?? process.env.SWITCHROOM_AGENT_SCHEDULER_JSONL
+       ?? SCHEDULER_JSONL_PATH_DEFAULT
+     const now = opts.now ?? Date.now
+     const isAlive = opts.isAlive ?? ((pid: number) => {
+       try { process.kill(pid, 0); return true } catch { return false }
+     })
+     const bootTimeMs = 'containerBootTimeMs' in opts
+       ? opts.containerBootTimeMs
+       : readContainerBootTimeMsForProbe()
+     const stillSettling = bootTimeMs != null
+       && (now() - bootTimeMs) < SCHEDULER_FRESH_BOOT_MS
+     const settlingNote = stillSettling ? ' (still settling)' : ''
+
+     if (!fs.exists(lockPath)) {
+       // During the first ~30 s of a container's life, "no lockfile" is
+       // the supervisor + bun still starting up. /status hit at that
+       // moment shouldn't show 🔴 for a non-issue.
+       return {
+         status: stillSettling ? 'degraded' : 'fail',
+         label: 'Scheduler',
+         detail: `sidecar not running (no lockfile)${settlingNote}`,
+       }
+     }
+     let holderPid: number | null = null
      try {
-       const result = await execFileFn('systemctl', [
-         '--user', 'list-timers',
-         `switchroom-${agentName}-cron-*`,
-         '--output=json',
-         '--all',
-       ])
-       stdout = result.stdout.trim()
-     } catch (err: unknown) {
-       // systemctl exits non-zero when no units match
-       const msg = (err as NodeJS.ErrnoException)?.message ?? String(err)
-       // child_process exec errors have `code` typed as string in
-       // NodeJS.ErrnoException, but at runtime it's numeric for shell
-       // exit codes. Stringify to avoid the type-system mismatch and
-       // the comparison "looks unintentional" warning.
-       if (msg.includes('No timers found') || String((err as NodeJS.ErrnoException)?.code) === '1') {
-         return { status: 'ok', label: 'Crons', detail: '0 timers' }
+       const raw = fs.readFile(lockPath).trim()
+       const parsed = Number.parseInt(raw, 10)
+       if (Number.isInteger(parsed) && parsed > 0) holderPid = parsed
+     } catch {
+       return { status: 'degraded', label: 'Scheduler', detail: 'lockfile unreadable' }
+     }
+     if (holderPid == null) {
+       return { status: 'degraded', label: 'Scheduler', detail: 'lockfile contents invalid' }
+     }
+     if (!isAlive(holderPid)) {
+       return {
+         status: 'degraded',
+         label: 'Scheduler',
+         detail: `lock holder pid ${holderPid} not alive (supervisor restart in progress?)`,
        }
-       return { status: 'fail', label: 'Crons', detail: `systemctl failed: ${msg}` }
      }

-     if (!stdout || stdout === '[]' || stdout.length === 0) {
-       return { status: 'ok', label: 'Crons', detail: '0 timers' }
+     // Sidecar is up. A freshness hint from scheduler.jsonl, if present,
+     // gives the user signal that fires are actually happening, not just
+     // that the daemon is breathing. Absence is fine: a freshly booted
+     // agent or a 0-entry agent has no fires to report.
+     let detail = `running (pid ${holderPid})`
+     if (fs.exists(jsonlPath)) {
+       try {
+         const ageMs = now() - fs.mtimeMs(jsonlPath)
+         if (Number.isFinite(ageMs) && ageMs >= 0) {
+           detail += ` · last fire ${formatMs(ageMs)} ago`
+         }
+       } catch {
+         // mtime read failed — keep the basic detail; non-blocking.
+       }
      }
+     return { status: 'ok', label: 'Scheduler', detail }
+   })())
+ }
+
+ // ─── Probe: Vault broker / approval kernel reachability ──────────────────────

-   let timers: SystemctlTimerEntry[] = []
+ /**
+  * Generic UDS-reachability probe used for both vault-broker and
+  * approval-kernel. Path-as-identity invariant (CLAUDE.md "Per-agent
+  * socket model") — bind paths are mounted into each agent container at
+  * /run/switchroom/{broker,kernel}/<agent>/sock. ENOENT means the
+  * compose volume isn't mounted (broker container down or no agent dir
+  * yet); ECONNREFUSED means the bind disappeared between us and the
+  * daemon (rare, broker shutdown removes the socket).
+  *
+  * Connect-test only — we do NOT send a wire request. The probe must not
+  * authenticate as the agent or do any vault/grant work; that's the
+  * agent's job. We just want to know "is something listening on this
+  * socket". Connection is closed immediately on success.
+  */
+ async function probeUds(
+   label: string,
+   socketPath: string | undefined,
+   opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
+ ): Promise<ProbeResult> {
+   if (!opts.dockerMode) {
+     return { status: 'ok', label, detail: 'n/a (non-docker)' }
+   }
+   if (!socketPath) {
+     return { status: 'fail', label, detail: 'socket path not configured' }
+   }
+   return withTimeout(label, (async (): Promise<ProbeResult> => {
+     if (!opts.connectImpl) {
+       // Cheap pre-check: stat the file. Saves the connect round-trip on
+       // the common "broker container down → bind mount empty" case.
+       if (!existsSync(socketPath)) {
+         return { status: 'fail', label, detail: `socket missing: ${socketPath}` }
+       }
+     }
+     const connect = opts.connectImpl ?? defaultUdsConnect
      try {
-       timers = JSON.parse(stdout) as SystemctlTimerEntry[]
-     } catch {
-       // Fall back to line-count if JSON failed
-       const count = stdout.split('\n').filter(l => l.includes('cron')).length
-       return { status: 'ok', label: 'Crons', detail: `${count} timers` }
+       await connect(socketPath)
+       return { status: 'ok', label, detail: 'reachable' }
+     } catch (err: unknown) {
+       const code = (err as NodeJS.ErrnoException)?.code
+       const msg = (err as Error)?.message ?? String(err)
+       if (code === 'ENOENT') return { status: 'fail', label, detail: 'socket missing' }
+       if (code === 'ECONNREFUSED') return { status: 'fail', label, detail: 'connection refused' }
+       return { status: 'fail', label, detail: `connect failed: ${msg}` }
      }
+   })())
+ }

-   if (!Array.isArray(timers) || timers.length === 0) {
-     return { status: 'ok', label: 'Crons', detail: '0 timers' }
-   }
+ /**
+  * Default UDS connect — opens a stream, then immediately closes it.
+  * Resolves on `connect` event, rejects on `error`. 1s connect timeout
+  * is plenty for a local socket (the per-probe timeout in withTimeout
+  * is the outer guard).
+  */
+ function defaultUdsConnect(socketPath: string): Promise<void> {
+   // eslint-disable-next-line @typescript-eslint/no-require-imports
+   const net = require('net') as typeof import('net')
+   return new Promise<void>((resolve, reject) => {
+     const sock = net.createConnection({ path: socketPath })
+     const t = setTimeout(() => {
+       sock.destroy()
+       reject(new Error('connect timeout'))
+     }, 1000)
+     sock.once('connect', () => {
+       clearTimeout(t)
+       sock.end()
+       resolve()
+     })
+     sock.once('error', (err) => {
+       clearTimeout(t)
+       sock.destroy()
+       reject(err)
+     })
+   })
+ }
+
+ export async function probeBroker(
+   socketPath?: string,
+   opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
+ ): Promise<ProbeResult> {
+   // SWITCHROOM_VAULT_BROKER_SOCK is the canonical client-side env name
+   // — matches what src/vault/broker/client.ts:293 and the secret-guard
+   // hook (telegram-plugin/hooks/secret-guard-pretool.mjs:36) read.
+   // The broker SERVER reads SWITCHROOM_BROKER_SOCKET as its bind-path
+   // env (in the broker container only). Pre-fix the probe + compose
+   // both used SWITCHROOM_BROKER_SOCKET in the agent container — wrong
+   // name, fell through to dangling-symlink fallback, false-failed.
+   return probeUds('Broker', socketPath ?? process.env.SWITCHROOM_VAULT_BROKER_SOCK, opts)
+ }
+
+ export async function probeKernel(
+   socketPath?: string,
+   opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
+ ): Promise<ProbeResult> {
+   return probeUds('Kernel', socketPath ?? process.env.SWITCHROOM_KERNEL_SOCKET, opts)
+ }

-   // Find the timer that fires soonest
-   let earliest: { name: string; leftMs: number } | null = null
-   for (const t of timers) {
-     const ms = parseTimerLeft(t.left)
-     const name = (t.unit ?? t.activates ?? '').replace(/^switchroom-[^-]+-cron-/, '').replace(/\.timer$/, '')
-     if (ms != null && (earliest == null || ms < earliest.leftMs)) {
-       earliest = { name, leftMs: ms }
+ // ─── Probe: Skills (symlink validity) ────────────────────────────────────────
+
+ /**
+  * Validate that every entry under <agentDir>/.claude/skills/ resolves
+  * to a readable file. Skills are normally symlinks into the global pool
+  * `~/.switchroom/skills/` (src/agents/scaffold.ts:639); a renamed or
+  * deleted skill in the pool dangles silently — claude won't surface the
+  * skill, the user wonders why /<skill> doesn't work.
+  *
+  * ok       — every entry resolves OR the dir doesn't exist (no skills
+  *            configured is a normal state, not a failure)
+  * degraded — at least one symlink dangles; rendered detail names them
+  *            up to a cap so the row doesn't wrap forever
+  */
+ export async function probeSkills(
+   agentDir: string,
+   opts: { fs?: SkillsFsImpl; maxNamesShown?: number } = {},
+ ): Promise<ProbeResult> {
+   return withTimeout('Skills', (async (): Promise<ProbeResult> => {
+     const fs = opts.fs ?? realSkillsFs
+     const max = opts.maxNamesShown ?? 3
+     const skillsDir = join(agentDir, '.claude', 'skills')
+     if (!fs.exists(skillsDir)) {
+       return { status: 'ok', label: 'Skills', detail: 'no skills dir' }
+     }
+     let entries: string[]
+     try {
+       entries = fs.readdir(skillsDir)
+     } catch {
+       return { status: 'degraded', label: 'Skills', detail: 'skills dir unreadable' }
+     }
+     if (entries.length === 0) {
+       return { status: 'ok', label: 'Skills', detail: '0 skills' }
+     }
+     const dangling: string[] = []
+     for (const name of entries) {
+       // Skills are dirs containing a SKILL.md (claude convention). The
+       // dangle case we worry about is a symlink whose target was
+       // removed — readability of <name>/SKILL.md is the simplest proxy
+       // and matches what claude itself would discover.
+       const skillPath = join(skillsDir, name)
+       if (!fs.exists(skillPath)) {
+         dangling.push(name)
+         continue
+       }
+       // Single-file skills exist (rare but allowed); accept them too.
+       const skillMd = join(skillPath, 'SKILL.md')
+       if (!fs.exists(skillMd) && !fs.exists(skillPath + '.md')) {
+         // Only flag as dangling if the entry IS a symlink (a real dir
+         // without SKILL.md is weird but not necessarily broken — could
+         // be an in-progress local skill). We have no symlink-test in
+         // SkillsFsImpl by design; conservatively don't flag as dangling.
+         // The user's main risk is removed-pool-target, which existsSync
+         // catches above.
+         continue
        }
      }
-
-     const count = timers.length
-     if (!earliest) {
-       return { status: 'ok', label: 'Crons', detail: `${count} timers` }
+     if (dangling.length === 0) {
+       return { status: 'ok', label: 'Skills', detail: `${entries.length} resolved` }
      }
-
-     const h = Math.floor(earliest.leftMs / 3600_000)
-     const m = Math.round((earliest.leftMs % 3600_000) / 60_000)
-     const timeStr = h > 0 ? `${h}h ${m}m` : `${m}m`
+     const named = dangling.slice(0, max).join(', ')
+     const more = dangling.length > max ? ` +${dangling.length - max} more` : ''
      return {
-       status: 'ok',
-       label: 'Crons',
-       detail: `${count} timers · next: ${earliest.name} in ${timeStr}`,
+       status: 'degraded',
+       label: 'Skills',
+       detail: `${dangling.length}/${entries.length} dangling: ${named}${more}`,
      }
    })())
  }
+
+ export interface SkillsFsImpl {
+   readdir: (p: string) => string[]
+   exists: (p: string) => boolean
+ }
+
+ const realSkillsFs: SkillsFsImpl = {
+   readdir: (p) => readdirSync(p),
+   exists: (p) => existsSync(p),
+ }
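The SchedulerFsImpl, isAlive, now, and containerBootTimeMs seams make probeScheduler fully drivable without a container. A minimal happy-path sketch (all values illustrative, not taken from the package's tests):

    import { probeScheduler, type SchedulerFsImpl } from './boot-probes'

    const t0 = 1_700_000_000_000
    const fakeFs: SchedulerFsImpl = {
      exists: () => true,                 // lockfile and jsonl both present
      readFile: () => '4321\n',           // scheduler.lock: holder pid
      mtimeMs: () => t0 - 90_000,         // scheduler.jsonl last touched 90 s ago
    }

    const res = await probeScheduler('main', {
      dockerMode: true,
      fs: fakeFs,
      isAlive: () => true,                // pretend pid 4321 is alive
      now: () => t0,
      containerBootTimeMs: null,          // opt out of the fresh-boot softening
    })
    // res.status === 'ok' and res.detail starts with 'running (pid 4321)',
    // with a '· last fire … ago' suffix derived from the jsonl mtime.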