switchroom 0.5.0 → 0.7.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +142 -121
- package/bin/autoaccept.exp +29 -6
- package/dist/agent-scheduler/index.js +12261 -0
- package/dist/cli/autoaccept-poll.js +10 -0
- package/dist/cli/switchroom.js +27250 -25324
- package/dist/vault/approvals/kernel-server.js +12709 -0
- package/dist/vault/broker/server.js +15724 -0
- package/package.json +4 -3
- package/profiles/_base/start.sh.hbs +133 -0
- package/profiles/_shared/telegram-style.md.hbs +3 -3
- package/profiles/default/CLAUDE.md +3 -3
- package/profiles/default/CLAUDE.md.hbs +2 -2
- package/profiles/default/workspace/CLAUDE.md.hbs +9 -0
- package/skills/docx/VENDORED.md +1 -1
- package/skills/mcp-builder/VENDORED.md +1 -1
- package/skills/pdf/VENDORED.md +1 -1
- package/skills/pptx/VENDORED.md +1 -1
- package/skills/skill-creator/VENDORED.md +1 -1
- package/skills/switchroom-architecture/SKILL.md +8 -7
- package/skills/switchroom-cli/SKILL.md +23 -15
- package/skills/switchroom-health/SKILL.md +7 -7
- package/skills/switchroom-install/SKILL.md +36 -39
- package/skills/switchroom-manage/SKILL.md +4 -4
- package/skills/switchroom-status/SKILL.md +1 -1
- package/skills/webapp-testing/VENDORED.md +1 -1
- package/skills/xlsx/VENDORED.md +1 -1
- package/telegram-plugin/admin-commands/dispatch.test.ts +119 -1
- package/telegram-plugin/admin-commands/index.ts +71 -0
- package/telegram-plugin/ask-user.ts +1 -0
- package/telegram-plugin/card-event-log.ts +138 -0
- package/telegram-plugin/dist/bridge/bridge.js +178 -31
- package/telegram-plugin/dist/foreman/foreman.js +6875 -6526
- package/telegram-plugin/dist/gateway/gateway.js +13862 -11834
- package/telegram-plugin/dist/server.js +202 -40
- package/telegram-plugin/fleet-state.ts +25 -10
- package/telegram-plugin/foreman/foreman.ts +38 -3
- package/telegram-plugin/gateway/approval-callback.ts +126 -0
- package/telegram-plugin/gateway/approval-card.test.ts +90 -0
- package/telegram-plugin/gateway/approval-card.ts +127 -0
- package/telegram-plugin/gateway/approvals-commands.ts +126 -0
- package/telegram-plugin/gateway/boot-card.ts +31 -6
- package/telegram-plugin/gateway/boot-probes.ts +503 -72
- package/telegram-plugin/gateway/gateway.ts +822 -94
- package/telegram-plugin/gateway/ipc-protocol.ts +34 -1
- package/telegram-plugin/gateway/ipc-server.ts +35 -0
- package/telegram-plugin/gateway/startup-mutex.ts +110 -2
- package/telegram-plugin/hooks/hooks.json +19 -0
- package/telegram-plugin/hooks/tool-label-pretool.mjs +216 -0
- package/telegram-plugin/hooks/tool-label-stop.mjs +63 -0
- package/telegram-plugin/package.json +4 -1
- package/telegram-plugin/plugin-logger.ts +20 -1
- package/telegram-plugin/progress-card-driver.ts +202 -13
- package/telegram-plugin/progress-card.ts +2 -2
- package/telegram-plugin/quota-check.ts +1 -0
- package/telegram-plugin/registry/subagents-schema.ts +37 -0
- package/telegram-plugin/registry/subagents.test.ts +64 -0
- package/telegram-plugin/session-tail.ts +58 -5
- package/telegram-plugin/shared/bot-runtime.ts +48 -2
- package/telegram-plugin/subagent-watcher.ts +139 -7
- package/telegram-plugin/tests/_progress-card-harness.ts +4 -0
- package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +201 -0
- package/telegram-plugin/tests/boot-card-probe-target.test.ts +10 -34
- package/telegram-plugin/tests/boot-card-render.test.ts +6 -5
- package/telegram-plugin/tests/boot-probes.test.ts +558 -0
- package/telegram-plugin/tests/card-event-log.test.ts +145 -0
- package/telegram-plugin/tests/gateway-startup-mutex.test.ts +102 -0
- package/telegram-plugin/tests/ipc-server-validate-inject-inbound.test.ts +134 -0
- package/telegram-plugin/tests/progress-card-delay-842.test.ts +160 -0
- package/telegram-plugin/tests/quota-check.test.ts +37 -1
- package/telegram-plugin/tests/subagent-registry-bugs.test.ts +5 -0
- package/telegram-plugin/tests/subagent-watcher-stall-notification.test.ts +104 -1
- package/telegram-plugin/tests/subagent-watcher.test.ts +5 -0
- package/telegram-plugin/tests/tool-label-sidecar.test.ts +114 -0
- package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +5 -3
- package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +10 -0
- package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +58 -14
- package/telegram-plugin/tests/welcome-text.test.ts +57 -0
- package/telegram-plugin/tool-label-sidecar.ts +140 -0
- package/telegram-plugin/tool-labels.ts +55 -0
- package/telegram-plugin/two-zone-card.ts +27 -7
- package/telegram-plugin/uat/SETUP.md +160 -0
- package/telegram-plugin/uat/assertions.ts +140 -0
- package/telegram-plugin/uat/driver.ts +174 -0
- package/telegram-plugin/uat/harness.ts +161 -0
- package/telegram-plugin/uat/login.ts +134 -0
- package/telegram-plugin/uat/port-allocator.ts +71 -0
- package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +61 -0
- package/telegram-plugin/welcome-text.ts +44 -2
- package/bin/bridge-watchdog.sh +0 -967
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* caller as a thrown error — only as ProbeResult{ status:'fail', ... }.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
-
import { readFileSync, existsSync, mkdirSync, writeFileSync } from 'fs'
|
|
14
|
+
import { readFileSync, readdirSync, existsSync, mkdirSync, writeFileSync, statSync } from 'fs'
import { createConnection } from 'net'
|
|
15
15
|
import { join } from 'path'
|
|
16
16
|
import { execFile as execFileCb } from 'child_process'
|
|
17
17
|
import { promisify } from 'util'
|
|
@@ -251,6 +251,148 @@ type ExecFileFnType = (
|
|
|
251
251
|
args: string[],
|
|
252
252
|
) => Promise<ExecFileResult>
|
|
253
253
|
|
|
254
|
+
/**
 * Minimal synchronous filesystem surface used by the docker-mode /proc
 * walk. Production code passes the real `fs`-backed implementation;
 * tests can pass an object that serves synthetic
 * `/proc/<pid>/{comm,stat,status}` strings instead of touching the host
 * filesystem.
 */
export interface ProcFsImpl {
  /** List the entry names of a directory. */
  readdir: (path: string) => string[]
  /** Return the full contents of a file as text. */
  readFile: (path: string) => string
}
|
|
263
|
+
|
|
264
|
+
/** Production `ProcFsImpl`: thin synchronous wrappers over Node's `fs`. */
const realProcFs: ProcFsImpl = {
  readdir(dir) {
    return readdirSync(dir)
  },
  readFile(file) {
    return readFileSync(file, 'utf-8')
  },
}
|
|
268
|
+
|
|
269
|
+
/** One process discovered while walking /proc for the agent probe. */
type AgentCandidate = {
  /** Process id, taken from the numeric /proc directory name. */
  pid: number
  /** Resident set size in kB (`VmRSS` in /proc/<pid>/status); 0 when absent. */
  rssKb: number
  /** Trimmed contents of /proc/<pid>/comm. */
  comm: string
  /** Start time in clock ticks from /proc/<pid>/stat; 0 when not parsed. */
  starttime: number
}
|
|
275
|
+
|
|
276
|
+
/**
 * Locate the agent process by walking `/proc` from inside the current
 * pid-namespace. Used for the docker-mode agent probe: inside an agent
 * container the gateway shares the namespace with claude, so a /proc
 * walk replaces the systemctl-driven cgroup walk used under systemd.
 *
 * Selection rules:
 *  - the heaviest (largest VmRSS) process whose comm is exactly `claude`;
 *  - otherwise the heaviest process whose comm is exactly `node`;
 *  - otherwise `null`.
 * Ties keep the first match in `readdir` order. Wrappers
 * (tmux/expect/script/bash/sh/…) are never selected because only those
 * two comm values are considered, and the gateway's own PID is skipped.
 * A candidate whose comm/status/stat cannot be read is ignored (e.g. it
 * exited mid-walk).
 *
 * Exported for tests.
 *
 * @param fs Filesystem accessor; defaults to the real /proc.
 * @returns The chosen process, or `null` when /proc is unreadable or no
 *   claude/node process is visible. `starttime` is 0 when
 *   /proc/<pid>/stat could not be parsed.
 */
export function findAgentProcessInContainer(
  fs: ProcFsImpl = realProcFs,
): AgentCandidate | null {
  let entries: string[]
  try {
    entries = fs.readdir('/proc')
  } catch {
    return null
  }

  // Track the heaviest match of each kind in one pass. A strict `>`
  // comparison keeps the first-seen process on RSS ties.
  let heaviestClaude: AgentCandidate | null = null
  let heaviestNode: AgentCandidate | null = null

  for (const entry of entries) {
    if (!/^\d+$/.test(entry)) continue
    const pid = Number(entry)
    if (!Number.isFinite(pid) || pid <= 0) continue
    if (pid === process.pid) continue

    let candidate: AgentCandidate
    try {
      const comm = fs.readFile(`/proc/${pid}/comm`).trim()
      // Only these two comm values can ever be selected, so don't spend
      // two more reads on every unrelated process.
      if (comm !== 'claude' && comm !== 'node') continue

      let rssKb = 0
      const status = fs.readFile(`/proc/${pid}/status`)
      const rssMatch = status.match(/^VmRSS:\s+(\d+)/m)
      if (rssMatch) rssKb = parseInt(rssMatch[1], 10) || 0

      // /proc/<pid>/stat format: pid (comm-with-parens) state ppid ...
      // Field 22 (1-indexed) is starttime in clock ticks since boot.
      // comm can contain spaces/parens, so anchor on the LAST ')'. What
      // follows starts at field 3 (state), putting starttime at index 19.
      // Without a ')' the line is malformed and the index would point at
      // the wrong column, so starttime stays 0 ("unknown").
      let starttime = 0
      const stat = fs.readFile(`/proc/${pid}/stat`)
      const close = stat.lastIndexOf(')')
      if (close >= 0) {
        const fields = stat.slice(close + 1).trim().split(/\s+/)
        const ticks = Number(fields[19])
        if (Number.isFinite(ticks) && ticks > 0) starttime = ticks
      }

      candidate = { pid, rssKb, comm, starttime }
    } catch {
      continue
    }

    if (candidate.comm === 'claude') {
      if (heaviestClaude === null || candidate.rssKb > heaviestClaude.rssKb) {
        heaviestClaude = candidate
      }
    } else if (heaviestNode === null || candidate.rssKb > heaviestNode.rssKb) {
      heaviestNode = candidate
    }
  }

  // A `claude` comm always wins; `node` is only the fallback.
  return heaviestClaude ?? heaviestNode
}
|
|
354
|
+
|
|
355
|
+
/**
 * Derive a process's uptime from its `starttime` (clock ticks since
 * boot, as found in /proc/<pid>/stat) and the first field of
 * /proc/uptime (seconds since boot).
 *
 * The tick rate is hardcoded to 100 per second. The original authors'
 * note: that is the value on x86_64 across Debian/Ubuntu/Alpine/RHEL; if
 * a host ever reports ticks at a different rate the result will be
 * scaled wrong and this needs revisiting.
 *
 * @param starttimeTicks Process start time in clock ticks since boot.
 * @param fs Filesystem accessor; defaults to the real /proc.
 * @returns Uptime in milliseconds, or `null` on any failure: a
 *   non-finite or negative `starttimeTicks`, an unreadable or unparsable
 *   /proc/uptime, or a start time later than the current boot uptime.
 */
export function uptimeMsForStarttime(
  starttimeTicks: number,
  fs: ProcFsImpl = realProcFs,
): number | null {
  // Reject garbage up front: NaN would otherwise slip past the `< 0`
  // check below and come back as NaN rather than null.
  if (!Number.isFinite(starttimeTicks) || starttimeTicks < 0) return null
  try {
    const uptimeRaw = fs.readFile('/proc/uptime').trim()
    const bootUptimeSec = Number(uptimeRaw.split(/\s+/)[0])
    if (!Number.isFinite(bootUptimeSec) || bootUptimeSec <= 0) return null
    const HZ = 100
    const procUptimeSec = bootUptimeSec - starttimeTicks / HZ
    if (procUptimeSec < 0) return null
    return Math.round(procUptimeSec * 1000)
  } catch {
    return null
  }
}
|
|
380
|
+
|
|
381
|
+
/**
 * Docker-mode agent probe: find the agent via the /proc walk and render
 * a one-line summary (`PID n · up … · N MB`).
 *
 * @returns `fail` when no claude/node process is found, otherwise `ok`.
 *   The uptime and memory segments are omitted when unknown.
 */
function probeAgentProcessDocker(): ProbeResult {
  const agent = findAgentProcessInContainer()
  if (!agent) {
    return { status: 'fail', label: 'Agent', detail: 'claude process not found' }
  }
  // starttime === 0 means "could not be parsed". Feeding it to the
  // uptime helper would report the host's uptime as the agent's, so
  // omit the segment instead.
  const uptimeMs = agent.starttime > 0 ? uptimeMsForStarttime(agent.starttime) : null
  const rssMb = Math.round(agent.rssKb / 1024)
  const parts = [`PID ${agent.pid}`]
  if (uptimeMs != null) parts.push(`up ${formatMs(uptimeMs)}`)
  if (rssMb > 0) parts.push(`${rssMb} MB`)
  return { status: 'ok', label: 'Agent', detail: parts.join(' · ') }
}
|
|
395
|
+
|
|
254
396
|
/**
|
|
255
397
|
* Resolve the "real" agent PID under tmux supervisor by walking the
|
|
256
398
|
* unit's cgroup and picking the heaviest-RSS claude/node process.
|
|
@@ -371,8 +513,19 @@ export async function probeAgentProcess(
|
|
|
371
513
|
/** When true, resolve PID via cgroup walk (heaviest claude/node) — under
|
|
372
514
|
* tmux supervisor MainPID is the tmux server (~2MB) which is misleading. */
|
|
373
515
|
tmuxSupervisor?: boolean
|
|
516
|
+
/** When true, skip systemctl entirely. The gateway is running INSIDE the
|
|
517
|
+
* agent container alongside claude, so we walk /proc directly. There's
|
|
518
|
+
* no "service deactivating/activating" model under docker — claude is
|
|
519
|
+
* either there or it isn't, so we return single-shot without retry. */
|
|
520
|
+
dockerMode?: boolean
|
|
521
|
+
/** Test override — defaults to the real probeAgentProcessDocker(). */
|
|
522
|
+
dockerProbeImpl?: () => ProbeResult
|
|
374
523
|
} = {},
|
|
375
524
|
): Promise<ProbeResult> {
|
|
525
|
+
if (opts.dockerMode) {
|
|
526
|
+
const impl = opts.dockerProbeImpl ?? probeAgentProcessDocker
|
|
527
|
+
return withTimeout('Agent', Promise.resolve(impl()))
|
|
528
|
+
}
|
|
376
529
|
const retryIntervalMs = opts.retryIntervalMs ?? AGENT_RETRY_INTERVAL_MS
|
|
377
530
|
const retryMaxMs = opts.retryMaxMs ?? AGENT_RETRY_MAX_MS
|
|
378
531
|
const sleep = opts.sleepImpl ?? ((ms: number) => new Promise(resolve => setTimeout(resolve, ms)))
|
|
@@ -469,8 +622,18 @@ export async function* watchAgentProcess(
|
|
|
469
622
|
nowImpl?: () => number
|
|
470
623
|
/** When true, resolve PID via cgroup walk (heaviest claude/node). */
|
|
471
624
|
tmuxSupervisor?: boolean
|
|
625
|
+
/** When true, skip systemctl: yield once with the current /proc-derived
|
|
626
|
+
* state and exit. Mirrors probeAgentProcess's docker-mode shortcut. */
|
|
627
|
+
dockerMode?: boolean
|
|
628
|
+
/** Test override — defaults to the real probeAgentProcessDocker(). */
|
|
629
|
+
dockerProbeImpl?: () => ProbeResult
|
|
472
630
|
} = {},
|
|
473
631
|
): AsyncGenerator<ProbeResult> {
|
|
632
|
+
if (opts.dockerMode) {
|
|
633
|
+
const impl = opts.dockerProbeImpl ?? probeAgentProcessDocker
|
|
634
|
+
yield impl()
|
|
635
|
+
return
|
|
636
|
+
}
|
|
474
637
|
const liveWindowMs = opts.liveWindowMs ?? AGENT_LIVE_WINDOW_MS
|
|
475
638
|
const pollIntervalMs = opts.pollIntervalMs ?? AGENT_LIVE_POLL_INTERVAL_MS
|
|
476
639
|
const followupRepollMs = opts.followupRepollMs ?? AGENT_LIVE_FOLLOWUP_REPOLL_MS
|
|
@@ -767,97 +930,365 @@ export async function probeHindsight(
|
|
|
767
930
|
})())
|
|
768
931
|
}
|
|
769
932
|
|
|
770
|
-
// ─── Probe:
|
|
933
|
+
// ─── Probe: Scheduler (in-container agent-scheduler since Phase 4) ───────────
|
|
771
934
|
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
935
|
+
/**
 * Default locations of the scheduler's lockfile and audit jsonl inside
 * the agent container. Mirrored from src/agent-scheduler/index.ts:194-197;
 * keep the two in sync.
 */
const SCHEDULER_LOCK_PATH_DEFAULT = '/state/agent/scheduler.lock'
const SCHEDULER_JSONL_PATH_DEFAULT = '/state/agent/scheduler.jsonl'

/**
 * Grace period after PID 1 started during which a missing/dead
 * scheduler is reported as "still settling" rather than a hard fail.
 * The boot card already waits through its own 6 s settle window before
 * probes run, so this only matters for /status hits during the first
 * ~30 s of a container's life: long enough to cover supervisor + bun
 * startup on a slow host without hiding a genuinely wedged scheduler.
 */
const SCHEDULER_FRESH_BOOT_MS = 30 * 1000
|
|
951
|
+
|
|
952
|
+
/**
 * Compute PID 1's start time inside the container as ms since epoch:
 * host boot time (`btime` in /proc/stat, seconds) plus PID 1's
 * `starttime` (field 22 of /proc/1/stat, clock ticks at an assumed 100
 * ticks per second). Used to soften scheduler probe verdicts during the
 * early-boot window.
 *
 * Mirrors `readContainerBootTimeMs` from src/agent-scheduler/lock.ts.
 * The small reader is duplicated here rather than imported across the
 * src/telegram-plugin boundary, since the plugin is built standalone.
 *
 * @param readFile Text-file reader; defaults to `readFileSync(path,
 *   'utf8')`. Injectable so tests can supply synthetic /proc contents.
 * @returns The start time in ms since epoch, or `null` on any /proc read
 *   or parse failure, in which case the caller skips the softening.
 */
function readContainerBootTimeMsForProbe(
  readFile: (path: string) => string = (p) => readFileSync(p, 'utf8'),
): number | null {
  try {
    const pid1Stat = readFile('/proc/1/stat')
    // comm may itself contain ')' — anchor on the last one. The fields
    // after it start at "state", which puts starttime at index 19.
    const lastParen = pid1Stat.lastIndexOf(')')
    if (lastParen < 0) return null
    const fields = pid1Stat.slice(lastParen + 1).trim().split(/\s+/)
    const starttimeTicks = Number(fields[19])
    if (!Number.isFinite(starttimeTicks) || starttimeTicks < 0) return null

    const btimeLine = readFile('/proc/stat')
      .split('\n')
      .find((line) => line.startsWith('btime '))
    if (!btimeLine) return null
    const btimeSec = Number(btimeLine.split(/\s+/)[1])
    if (!Number.isFinite(btimeSec)) return null

    const CLK_TCK = 100
    return (btimeSec + starttimeTicks / CLK_TCK) * 1000
  } catch {
    return null
  }
}
|
|
780
980
|
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
981
|
+
/**
 * Filesystem injection point for the scheduler probe. Read-only access
 * to arbitrary paths, so tests can drive lockfile contents and
 * audit-jsonl age from a synthetic fs without touching disk.
 */
export interface SchedulerFsImpl {
  /** Return the full contents of a file as text. */
  readFile: (path: string) => string
  /** stat-mtime, ms-since-epoch. Used to age the audit jsonl. */
  mtimeMs: (path: string) => number
  /** Whether the path exists. */
  exists: (path: string) => boolean
}
|
|
793
993
|
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
994
|
+
/**
 * Production `SchedulerFsImpl`, backed by Node's synchronous `fs` API.
 * `readFile` and `mtimeMs` throw when the path cannot be read/stat'ed;
 * the probe checks `exists()` first and wraps both calls in try/catch.
 */
const realSchedulerFs: SchedulerFsImpl = {
  readFile: (p) => readFileSync(p, 'utf-8'),
  mtimeMs: (p) => statSync(p).mtimeMs,
  exists: (p) => existsSync(p),
}
|
|
1005
|
+
|
|
1006
|
+
/**
 * Probe the in-container agent-scheduler (cron-fold-in cutover, Phase 4
 * — see CLAUDE.md "Cron-fold-in note"). Replaces the pre-Phase-4 probe
 * that queried `systemctl --user list-timers switchroom-<agent>-cron-*`
 * (those timers no longer exist) and the dockerMode short-circuit that
 * reported "managed by switchroom-cron" (that container was retired in
 * PR #893).
 *
 * The scheduler is a sibling sidecar started by start.sh's
 * _switchroom_supervise wrapper. It writes a pidfile-with-liveness lock
 * at /state/agent/scheduler.lock (src/agent-scheduler/lock.ts) and an
 * audit row per fire to /state/agent/scheduler.jsonl
 * (src/agent-scheduler/index.ts:256, src/scheduler/audit.ts).
 *
 *   ok       — lockfile present and its holder PID is alive; the detail
 *              also carries the audit jsonl's age when that file exists
 *   degraded — lockfile unreadable, contents not a positive integer, or
 *              holder PID dead (supervisor mid-restart, or the scheduler
 *              crashed and has not been relaunched yet); also a missing
 *              lockfile while the container is still inside its
 *              fresh-boot window
 *   fail     — lockfile missing outside the fresh-boot window (sidecar
 *              never started or supervisor gave up after restart-cap)
 *
 * Outside dockerMode the probe is silent (returns ok with "n/a"). Phase
 * 4 deleted the host-side scheduler entirely; non-docker callers
 * (legacy systemd installs, tests) have no scheduler to probe.
 *
 * @param _agentName Unused; kept for signature parity with other probes.
 * @param opts Paths, clock and filesystem overrides (see field docs).
 * @returns The probe verdict, wrapped in the shared per-probe timeout.
 */
export async function probeScheduler(
  _agentName: string,
  opts: {
    dockerMode?: boolean
    fs?: SchedulerFsImpl
    /** Override the lockfile path. Defaults to env
     *  `SWITCHROOM_AGENT_SCHEDULER_LOCK` (matches the override the
     *  scheduler itself reads at src/agent-scheduler/index.ts:196), then
     *  to `/state/agent/scheduler.lock`. */
    lockPath?: string
    /** Override the audit-jsonl path. Defaults to env
     *  `SWITCHROOM_AGENT_SCHEDULER_JSONL`, then to
     *  `/state/agent/scheduler.jsonl` (mirrors index.ts:194). */
    jsonlPath?: string
    /** Liveness check for the holder PID — defaults to process.kill(pid, 0),
     *  treating EPERM (process exists, different owner) as alive. */
    isAlive?: (pid: number) => boolean
    now?: () => number
    /** Container PID-1 start time in ms since epoch. When set AND the
     *  current time is within `SCHEDULER_FRESH_BOOT_MS` of it, a missing
     *  lockfile is softened from fail to degraded and tagged "still
     *  settling". Pass `null` to disable the softening (e.g. unit tests
     *  pinning a hard fail). Defaults to
     *  `readContainerBootTimeMsForProbe()`. */
    containerBootTimeMs?: number | null
  } = {},
): Promise<ProbeResult> {
  if (!opts.dockerMode) {
    return { status: 'ok', label: 'Scheduler', detail: 'n/a (non-docker)' }
  }
  return withTimeout('Scheduler', (async (): Promise<ProbeResult> => {
    const fs = opts.fs ?? realSchedulerFs
    const lockPath = opts.lockPath
      ?? process.env.SWITCHROOM_AGENT_SCHEDULER_LOCK
      ?? SCHEDULER_LOCK_PATH_DEFAULT
    const jsonlPath = opts.jsonlPath
      ?? process.env.SWITCHROOM_AGENT_SCHEDULER_JSONL
      ?? SCHEDULER_JSONL_PATH_DEFAULT
    const now = opts.now ?? Date.now
    const isAlive = opts.isAlive ?? ((pid: number): boolean => {
      try {
        process.kill(pid, 0)
        return true
      } catch (err: unknown) {
        // Signal 0 only tests for existence. EPERM means the process
        // exists but we may not signal it (different uid), so it is
        // alive; any other error (ESRCH) means it is gone.
        return (err as NodeJS.ErrnoException)?.code === 'EPERM'
      }
    })
    // `in` rather than `??`: an explicit `null` must disable the
    // softening instead of falling back to the /proc reader.
    const bootTimeMs = 'containerBootTimeMs' in opts
      ? opts.containerBootTimeMs
      : readContainerBootTimeMsForProbe()
    const stillSettling = bootTimeMs != null
      && (now() - bootTimeMs) < SCHEDULER_FRESH_BOOT_MS
    const settlingNote = stillSettling ? ' (still settling)' : ''

    if (!fs.exists(lockPath)) {
      // During the first ~30 s of a container's life, "no lockfile" is
      // the supervisor + bun still starting up. A /status hit at that
      // moment shouldn't show 🔴 for a non-issue.
      return {
        status: stillSettling ? 'degraded' : 'fail',
        label: 'Scheduler',
        detail: `sidecar not running (no lockfile)${settlingNote}`,
      }
    }

    let holderPid: number | null = null
    try {
      const raw = fs.readFile(lockPath).trim()
      const parsed = Number.parseInt(raw, 10)
      if (Number.isInteger(parsed) && parsed > 0) holderPid = parsed
    } catch {
      return { status: 'degraded', label: 'Scheduler', detail: 'lockfile unreadable' }
    }
    if (holderPid == null) {
      return { status: 'degraded', label: 'Scheduler', detail: 'lockfile contents invalid' }
    }
    if (!isAlive(holderPid)) {
      return {
        status: 'degraded',
        label: 'Scheduler',
        detail: `lock holder pid ${holderPid} not alive (supervisor restart in progress?)`,
      }
    }

    // Sidecar is up. Add a freshness hint from scheduler.jsonl if present
    // — gives the user signal that fires are actually happening, not just
    // that the daemon is breathing. Absence is fine: a freshly booted
    // agent or a 0-entry agent has no fires to report.
    let detail = `running (pid ${holderPid})`
    if (fs.exists(jsonlPath)) {
      try {
        const ageMs = now() - fs.mtimeMs(jsonlPath)
        if (Number.isFinite(ageMs) && ageMs >= 0) {
          detail += ` · last fire ${formatMs(ageMs)} ago`
        }
      } catch {
        // mtime read failed — keep the basic detail; non-blocking.
      }
    }
    return { status: 'ok', label: 'Scheduler', detail }
  })())
}
|
|
1124
|
+
|
|
1125
|
+
// ─── Probe: Vault broker / approval kernel reachability ──────────────────────
|
|
825
1126
|
|
|
826
|
-
|
|
1127
|
+
/**
 * Shared Unix-domain-socket reachability check behind the vault-broker
 * and approval-kernel probes. Path-as-identity invariant (CLAUDE.md
 * "Per-agent socket model"): bind paths are mounted into each agent
 * container at /run/switchroom/{broker,kernel}/<agent>/sock. ENOENT
 * means the compose volume isn't mounted (broker container down or no
 * agent dir yet); ECONNREFUSED means the bind disappeared between us and
 * the daemon (rare, broker shutdown removes the socket).
 *
 * Connect-test only: no wire request is sent. The probe must not
 * authenticate as the agent or do any vault/grant work; it only answers
 * "is something listening on this socket".
 *
 * @param label Row label used in every returned result.
 * @param socketPath Socket to test; an empty/undefined path is a `fail`.
 * @param opts `dockerMode` gates the probe (non-docker → ok "n/a");
 *   `connectImpl` replaces the real connect and also disables the
 *   on-disk existence pre-check.
 */
async function probeUds(
  label: string,
  socketPath: string | undefined,
  opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
): Promise<ProbeResult> {
  const { dockerMode, connectImpl } = opts
  if (!dockerMode) return { status: 'ok', label, detail: 'n/a (non-docker)' }
  if (!socketPath) return { status: 'fail', label, detail: 'socket path not configured' }

  const target = socketPath
  const attempt = async (): Promise<ProbeResult> => {
    // Cheap pre-check with the real connector only: saves the connect
    // round-trip on the common "broker container down → bind mount
    // empty" case.
    if (!connectImpl && !existsSync(target)) {
      return { status: 'fail', label, detail: `socket missing: ${target}` }
    }
    const connect = connectImpl ?? defaultUdsConnect
    try {
      await connect(target)
    } catch (err: unknown) {
      const code = (err as NodeJS.ErrnoException)?.code
      const msg = (err as Error)?.message ?? String(err)
      switch (code) {
        case 'ENOENT':
          return { status: 'fail', label, detail: 'socket missing' }
        case 'ECONNREFUSED':
          return { status: 'fail', label, detail: 'connection refused' }
        default:
          return { status: 'fail', label, detail: `connect failed: ${msg}` }
      }
    }
    return { status: 'ok', label, detail: 'reachable' }
  }
  return withTimeout(label, attempt())
}
|
|
834
1173
|
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
1174
|
+
/**
 * Default UDS connect: opens a stream to `socketPath` and closes it as
 * soon as the connection is established.
 *
 * Resolves on the `connect` event and rejects with the socket's error on
 * `error`. A 1 s timer rejects with `Error('connect timeout')` and
 * destroys the socket if neither happens first; the per-probe timeout in
 * withTimeout is the outer guard.
 *
 * @param socketPath Filesystem path of the Unix domain socket.
 */
function defaultUdsConnect(socketPath: string): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const sock = createConnection({ path: socketPath })
    const timer = setTimeout(() => {
      sock.destroy()
      reject(new Error('connect timeout'))
    }, 1000)
    sock.once('connect', () => {
      clearTimeout(timer)
      sock.end()
      resolve()
    })
    // `on`, not `once`: the listener must stay attached for the socket's
    // whole life so a later error (e.g. while closing) can never surface
    // as an unhandled 'error' event. reject() after settle is a no-op.
    sock.on('error', (err) => {
      clearTimeout(timer)
      sock.destroy()
      reject(err)
    })
  })
}
|
|
1201
|
+
|
|
1202
|
+
/**
 * Reachability probe for the vault broker socket.
 *
 * @param socketPath Socket to test; falls back to the
 *   `SWITCHROOM_BROKER_SOCKET` environment variable when omitted.
 * @param opts Forwarded unchanged to the shared UDS probe.
 */
export async function probeBroker(
  socketPath?: string,
  opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
): Promise<ProbeResult> {
  const target = socketPath ?? process.env.SWITCHROOM_BROKER_SOCKET
  return probeUds('Broker', target, opts)
}
|
|
1208
|
+
|
|
1209
|
+
/**
 * Reachability probe for the approval kernel socket.
 *
 * @param socketPath Socket to test; falls back to the
 *   `SWITCHROOM_KERNEL_SOCKET` environment variable when omitted.
 * @param opts Forwarded unchanged to the shared UDS probe.
 */
export async function probeKernel(
  socketPath?: string,
  opts: { dockerMode?: boolean; connectImpl?: (path: string) => Promise<void> } = {},
): Promise<ProbeResult> {
  const target = socketPath ?? process.env.SWITCHROOM_KERNEL_SOCKET
  return probeUds('Kernel', target, opts)
}
|
|
838
1215
|
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
1216
|
+
// ─── Probe: Skills (symlink validity) ────────────────────────────────────────
|
|
1217
|
+
|
|
1218
|
+
/**
 * Validate that every entry under <agentDir>/.claude/skills/ still
 * exists according to `fs.exists`. Skills are normally symlinks into the
 * global pool `~/.switchroom/skills/` (src/agents/scaffold.ts:639); a
 * renamed or deleted skill in the pool dangles silently — claude won't
 * surface the skill, and the user wonders why /<skill> doesn't work.
 *
 * Only the entry itself is checked. Whether it contains a SKILL.md is
 * deliberately not inspected: SkillsFsImpl has no symlink test, and a
 * real directory without SKILL.md may just be an in-progress local
 * skill, so it is never counted as dangling.
 *
 *   ok       — every entry exists, the dir is empty, or the dir doesn't
 *              exist (no skills configured is a normal state)
 *   degraded — the dir is unreadable, or at least one entry is missing;
 *              the detail names the missing entries up to a cap so the
 *              row doesn't wrap forever
 *
 * @param agentDir Agent root; skills live in `<agentDir>/.claude/skills`.
 * @param opts `fs` is the injection point for tests. `maxNamesShown` caps
 *   how many dangling names are listed (default 3); it is floored and
 *   clamped to ≥ 0, and a non-finite value falls back to the default.
 */
export async function probeSkills(
  agentDir: string,
  opts: { fs?: SkillsFsImpl; maxNamesShown?: number } = {},
): Promise<ProbeResult> {
  return withTimeout('Skills', (async (): Promise<ProbeResult> => {
    const fs = opts.fs ?? realSkillsFs
    const requestedMax = opts.maxNamesShown ?? 3
    // A negative cap would make slice() drop names from the END and
    // inflate the "+N more" count; NaN would hide the remainder.
    const max = Number.isFinite(requestedMax) ? Math.max(0, Math.floor(requestedMax)) : 3
    const skillsDir = join(agentDir, '.claude', 'skills')
    if (!fs.exists(skillsDir)) {
      return { status: 'ok', label: 'Skills', detail: 'no skills dir' }
    }
    let entries: string[]
    try {
      entries = fs.readdir(skillsDir)
    } catch {
      return { status: 'degraded', label: 'Skills', detail: 'skills dir unreadable' }
    }
    if (entries.length === 0) {
      return { status: 'ok', label: 'Skills', detail: '0 skills' }
    }

    // An entry that readdir lists but exists() denies is the
    // removed-pool-target case this probe is after.
    const dangling = entries.filter((name) => !fs.exists(join(skillsDir, name)))
    if (dangling.length === 0) {
      return { status: 'ok', label: 'Skills', detail: `${entries.length} resolved` }
    }

    const shown = dangling.slice(0, max)
    const hidden = dangling.length - shown.length
    const summary = [
      shown.join(', '),
      hidden > 0 ? `+${hidden} more` : '',
    ].filter(Boolean).join(' ')
    return {
      status: 'degraded',
      label: 'Skills',
      detail: `${dangling.length}/${entries.length} dangling: ${summary}`,
    }
  })())
}
|
|
1285
|
+
|
|
1286
|
+
/**
 * Filesystem injection point for the skills probe, so tests can supply a
 * synthetic skills directory.
 */
export interface SkillsFsImpl {
  /** List the entry names of a directory. */
  readdir: (p: string) => string[]
  /** Whether the path exists. */
  exists: (p: string) => boolean
}
|
|
1290
|
+
|
|
1291
|
+
/** Production `SkillsFsImpl`: thin synchronous wrappers over Node's `fs`. */
const realSkillsFs: SkillsFsImpl = {
  readdir(dir) {
    return readdirSync(dir)
  },
  exists(target) {
    return existsSync(target)
  },
}
|