@os-eco/overstory-cli 0.9.4 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/agents/builder.md +19 -9
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +204 -87
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +219 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/mail-poll-detect.test.ts +153 -0
- package/src/agents/mail-poll-detect.ts +73 -0
- package/src/agents/overlay.test.ts +60 -4
- package/src/agents/overlay.ts +63 -8
- package/src/agents/scope-detect.test.ts +190 -0
- package/src/agents/scope-detect.ts +146 -0
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +2312 -0
- package/src/agents/turn-runner.ts +1383 -0
- package/src/commands/agents.ts +9 -0
- package/src/commands/clean.ts +54 -0
- package/src/commands/coordinator.test.ts +254 -0
- package/src/commands/coordinator.ts +273 -8
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +14 -4
- package/src/commands/doctor.ts +3 -1
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +187 -11
- package/src/commands/log.ts +171 -71
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +230 -1
- package/src/commands/merge.ts +68 -12
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +177 -1
- package/src/commands/sling.ts +243 -71
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +255 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/watch.test.ts +43 -0
- package/src/commands/watch.ts +153 -28
- package/src/config.ts +23 -0
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +48 -1
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +57 -6
- package/src/insights/quality-gates.test.ts +141 -0
- package/src/insights/quality-gates.ts +156 -0
- package/src/json.ts +29 -0
- package/src/logging/theme.ts +4 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/merge/predict.test.ts +387 -0
- package/src/merge/predict.ts +249 -0
- package/src/merge/resolver.ts +1 -1
- package/src/mulch/client.ts +3 -3
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +657 -29
- package/src/sessions/store.ts +286 -23
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +107 -2
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1607 -376
- package/src/watchdog/daemon.ts +462 -88
- package/src/watchdog/health.test.ts +282 -0
- package/src/watchdog/health.ts +126 -27
- package/src/worktree/manager.test.ts +218 -1
- package/src/worktree/manager.ts +55 -0
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +28 -0
- package/src/worktree/tmux.ts +27 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +5 -2
package/src/commands/watch.ts
CHANGED
|
@@ -13,7 +13,13 @@ import { jsonOutput } from "../json.ts";
|
|
|
13
13
|
import { printError, printHint, printSuccess } from "../logging/color.ts";
|
|
14
14
|
import type { HealthCheck } from "../types.ts";
|
|
15
15
|
import { resolveOverstoryBin } from "../utils/bin.ts";
|
|
16
|
-
import {
|
|
16
|
+
import {
|
|
17
|
+
type AcquirePidLockResult,
|
|
18
|
+
acquirePidLock,
|
|
19
|
+
readPidFile,
|
|
20
|
+
removePidFile,
|
|
21
|
+
} from "../utils/pid.ts";
|
|
22
|
+
import { findRunningWatchdogProcesses, type WatchdogProcess } from "../utils/process-scan.ts";
|
|
17
23
|
import { startDaemon } from "../watchdog/daemon.ts";
|
|
18
24
|
import { isProcessRunning } from "../watchdog/health.ts";
|
|
19
25
|
|
|
@@ -38,6 +44,39 @@ export function formatCheck(check: HealthCheck): string {
|
|
|
38
44
|
return line;
|
|
39
45
|
}
|
|
40
46
|
|
|
47
|
+
/**
 * Build the "lock contested" message shown when the watchdog PID file is
 * already claimed, so foreground and background start paths report it the
 * same way.
 *
 * @param existingPid - PID reported as holding the lock. Anything that is not
 *   a positive integer (0, negative, NaN, fractional) is treated as "the PID
 *   could not be read".
 * @param pidFilePath - Path of the contested PID file, echoed so the user
 *   knows which file to remove.
 * @returns A single-line, human-readable error message.
 */
function formatLockContestedError(existingPid: number, pidFilePath: string): string {
	// A bare `existingPid <= 0` check lets NaN through (NaN <= 0 is false) and
	// would print "PID: NaN"; require a real positive integer instead.
	const pidIsUsable = Number.isInteger(existingPid) && existingPid > 0;
	if (!pidIsUsable) {
		return `Watchdog PID file at ${pidFilePath} is owned by another process (could not read PID). Run 'ov watch --kill-others' or remove the file.`;
	}
	return `Watchdog already running (PID: ${existingPid}). Kill it first, run 'ov watch --kill-others', or remove ${pidFilePath}`;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Send SIGTERM to every `ov watch` process reported by
 * findRunningWatchdogProcesses, except `excludedPid` and the current process.
 *
 * The signal is only issued, not waited on: callers that need the processes
 * to be gone must poll for exit themselves.
 *
 * @param excludedPid - PID to leave untouched, or null to exclude nothing
 *   beyond the current process.
 * @returns `killed`: PIDs for which SIGTERM was delivered without error;
 *   `surveyed`: the unfiltered scan result.
 */
async function killForeignWatchdogs(
	excludedPid: number | null,
): Promise<{ killed: number[]; surveyed: WatchdogProcess[] }> {
	const surveyed = await findRunningWatchdogProcesses();
	const killed: number[] = [];
	for (const { pid } of surveyed) {
		// Never signal the excluded PID, and never signal ourselves: this
		// command is itself an `ov watch` process and may show up in the scan.
		if (pid === excludedPid || pid === process.pid) {
			continue;
		}
		// On POSIX, kill() with pid 0 or a negative pid targets a whole process
		// group rather than one process, so refuse anything but a positive integer.
		if (!Number.isInteger(pid) || pid <= 0) {
			continue;
		}
		try {
			process.kill(pid, "SIGTERM");
			killed.push(pid);
		} catch {
			// Best-effort: the process most likely exited between the scan and
			// the signal. It is left out of `killed` either way.
		}
	}
	return { killed, surveyed };
}
|
|
79
|
+
|
|
41
80
|
/**
|
|
42
81
|
* Core implementation for the watch command.
|
|
43
82
|
*/
|
|
@@ -45,6 +84,7 @@ async function runWatch(opts: {
|
|
|
45
84
|
interval?: string;
|
|
46
85
|
background?: boolean;
|
|
47
86
|
json?: boolean;
|
|
87
|
+
killOthers?: boolean;
|
|
48
88
|
}): Promise<void> {
|
|
49
89
|
const cwd = process.cwd();
|
|
50
90
|
const config = await loadConfig(cwd);
|
|
@@ -59,26 +99,46 @@ async function runWatch(opts: {
|
|
|
59
99
|
|
|
60
100
|
const useJson = opts.json ?? false;
|
|
61
101
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
102
|
+
// --kill-others: kill any pre-existing `ov watch` daemons before claiming
|
|
103
|
+
// the lock. Useful when an earlier release allowed multi-daemon state.
|
|
104
|
+
if (opts.killOthers) {
|
|
105
|
+
const { killed } = await killForeignWatchdogs(null);
|
|
106
|
+
|
|
107
|
+
// Wait for the just-killed processes to actually exit before reclaiming
|
|
108
|
+
// the PID file. Without this, the next acquirePidLock call sees a still-
|
|
109
|
+
// alive PID in the file and refuses, even though we issued SIGTERM
|
|
110
|
+
// nanoseconds earlier. Poll for up to ~2s.
|
|
111
|
+
const killedSet = new Set(killed);
|
|
112
|
+
if (killedSet.size > 0) {
|
|
113
|
+
const deadline = Date.now() + 2000;
|
|
114
|
+
while (Date.now() < deadline) {
|
|
115
|
+
const stillAlive = killed.filter((p) => isProcessRunning(p));
|
|
116
|
+
if (stillAlive.length === 0) break;
|
|
117
|
+
await Bun.sleep(50);
|
|
72
118
|
}
|
|
73
|
-
process.exitCode = 1;
|
|
74
|
-
return;
|
|
75
119
|
}
|
|
76
120
|
|
|
77
|
-
//
|
|
121
|
+
// Reclaim the PID file if it pointed at a process we just killed (it is
|
|
122
|
+
// either already dead or in flight to dead) or at any other dead PID.
|
|
123
|
+
const existingPid = await readPidFile(pidFilePath);
|
|
78
124
|
if (existingPid !== null) {
|
|
79
|
-
|
|
125
|
+
if (killedSet.has(existingPid) || !isProcessRunning(existingPid)) {
|
|
126
|
+
await removePidFile(pidFilePath);
|
|
127
|
+
}
|
|
80
128
|
}
|
|
81
129
|
|
|
130
|
+
if (killed.length > 0) {
|
|
131
|
+
if (useJson) {
|
|
132
|
+
jsonOutput("watch", { killed });
|
|
133
|
+
} else {
|
|
134
|
+
printSuccess(`Killed ${killed.length} foreign watchdog process(es): ${killed.join(", ")}`);
|
|
135
|
+
}
|
|
136
|
+
} else if (!useJson) {
|
|
137
|
+
printHint("No foreign watchdog processes found.");
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
if (opts.background) {
|
|
82
142
|
// Build the args for the child process, forwarding --interval but not --background
|
|
83
143
|
const childArgs: string[] = ["watch"];
|
|
84
144
|
if (opts.interval) {
|
|
@@ -88,7 +148,21 @@ async function runWatch(opts: {
|
|
|
88
148
|
// Resolve the overstory binary path
|
|
89
149
|
const overstoryBin = await resolveOverstoryBin();
|
|
90
150
|
|
|
91
|
-
//
|
|
151
|
+
// Pre-check: surface "already running" before paying the cost of a spawn.
|
|
152
|
+
// This is only for friendly errors — the authoritative exclusion happens
|
|
153
|
+
// in the atomic acquirePidLock call below.
|
|
154
|
+
const existingPid = await readPidFile(pidFilePath);
|
|
155
|
+
if (existingPid !== null && isProcessRunning(existingPid)) {
|
|
156
|
+
if (useJson) {
|
|
157
|
+
jsonOutput("watch", { running: true, pid: existingPid, error: "Watchdog already running" });
|
|
158
|
+
} else {
|
|
159
|
+
printError(formatLockContestedError(existingPid, pidFilePath));
|
|
160
|
+
}
|
|
161
|
+
process.exitCode = 1;
|
|
162
|
+
return;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// Spawn the detached background daemon (foreground mode in the child).
|
|
92
166
|
const child = Bun.spawn(["bun", "run", overstoryBin, ...childArgs], {
|
|
93
167
|
cwd,
|
|
94
168
|
stdout: "ignore",
|
|
@@ -96,13 +170,33 @@ async function runWatch(opts: {
|
|
|
96
170
|
stdin: "ignore",
|
|
97
171
|
});
|
|
98
172
|
|
|
99
|
-
// Unref the child so the parent can exit without waiting for it
|
|
100
|
-
child.unref();
|
|
101
|
-
|
|
102
173
|
const childPid = child.pid;
|
|
103
174
|
|
|
104
|
-
//
|
|
105
|
-
|
|
175
|
+
// Atomically acquire the lock with the child's PID. If another writer
|
|
176
|
+
// raced in between our pre-check and the spawn, we have to kill our
|
|
177
|
+
// child and report contention.
|
|
178
|
+
const lockResult = await acquirePidLock(pidFilePath, childPid, isProcessRunning);
|
|
179
|
+
if (!lockResult.acquired) {
|
|
180
|
+
try {
|
|
181
|
+
child.kill("SIGTERM");
|
|
182
|
+
} catch {
|
|
183
|
+
// Already exited — not an error.
|
|
184
|
+
}
|
|
185
|
+
if (useJson) {
|
|
186
|
+
jsonOutput("watch", {
|
|
187
|
+
running: true,
|
|
188
|
+
pid: lockResult.existingPid,
|
|
189
|
+
error: "Watchdog already running",
|
|
190
|
+
});
|
|
191
|
+
} else {
|
|
192
|
+
printError(formatLockContestedError(lockResult.existingPid, pidFilePath));
|
|
193
|
+
}
|
|
194
|
+
process.exitCode = 1;
|
|
195
|
+
return;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
// Lock is ours. Detach so this parent invocation can exit independently.
|
|
199
|
+
child.unref();
|
|
106
200
|
|
|
107
201
|
if (useJson) {
|
|
108
202
|
jsonOutput("watch", { pid: childPid, intervalMs, pidFile: pidFilePath });
|
|
@@ -113,7 +207,29 @@ async function runWatch(opts: {
|
|
|
113
207
|
return;
|
|
114
208
|
}
|
|
115
209
|
|
|
116
|
-
// Foreground mode:
|
|
210
|
+
// Foreground mode: acquire the lock atomically before announcing anything.
|
|
211
|
+
// In the background-spawn case the parent has already written this PID into
|
|
212
|
+
// the lock file; acquirePidLock detects own-PID and returns acquired=true
|
|
213
|
+
// idempotently.
|
|
214
|
+
const lockResult: AcquirePidLockResult = await acquirePidLock(
|
|
215
|
+
pidFilePath,
|
|
216
|
+
process.pid,
|
|
217
|
+
isProcessRunning,
|
|
218
|
+
);
|
|
219
|
+
if (!lockResult.acquired) {
|
|
220
|
+
if (useJson) {
|
|
221
|
+
jsonOutput("watch", {
|
|
222
|
+
running: true,
|
|
223
|
+
pid: lockResult.existingPid,
|
|
224
|
+
error: "Watchdog already running",
|
|
225
|
+
});
|
|
226
|
+
} else {
|
|
227
|
+
printError(formatLockContestedError(lockResult.existingPid, pidFilePath));
|
|
228
|
+
}
|
|
229
|
+
process.exitCode = 1;
|
|
230
|
+
return;
|
|
231
|
+
}
|
|
232
|
+
|
|
117
233
|
if (useJson) {
|
|
118
234
|
jsonOutput("watch", { pid: process.pid, intervalMs, mode: "foreground" });
|
|
119
235
|
} else {
|
|
@@ -121,9 +237,6 @@ async function runWatch(opts: {
|
|
|
121
237
|
printHint("Press Ctrl+C to stop.");
|
|
122
238
|
}
|
|
123
239
|
|
|
124
|
-
// Write PID file so `--background` check and external tools can find us
|
|
125
|
-
await writePidFile(pidFilePath, process.pid);
|
|
126
|
-
|
|
127
240
|
const { stop } = startDaemon({
|
|
128
241
|
root: config.project.root,
|
|
129
242
|
intervalMs,
|
|
@@ -131,6 +244,7 @@ async function runWatch(opts: {
|
|
|
131
244
|
zombieThresholdMs,
|
|
132
245
|
nudgeIntervalMs: config.watchdog.nudgeIntervalMs,
|
|
133
246
|
tier1Enabled: config.watchdog.tier1Enabled,
|
|
247
|
+
notifyParentOnDeath: config.watchdog.notifyParentOnDeath ?? true,
|
|
134
248
|
onHealthCheck(check) {
|
|
135
249
|
const timestamp = new Date().toISOString().slice(11, 19);
|
|
136
250
|
process.stdout.write(`[${timestamp}] ${formatCheck(check)}\n`);
|
|
@@ -156,10 +270,21 @@ export function createWatchCommand(): Command {
|
|
|
156
270
|
.description("Start Tier 0 mechanical watchdog daemon")
|
|
157
271
|
.option("--interval <ms>", "Health check interval in milliseconds")
|
|
158
272
|
.option("--background", "Daemonize (run in background)")
|
|
273
|
+
.option(
|
|
274
|
+
"--kill-others",
|
|
275
|
+
"Kill any pre-existing 'ov watch' processes before starting (for cleanup of multi-daemon state)",
|
|
276
|
+
)
|
|
159
277
|
.option("--json", "Output as JSON")
|
|
160
|
-
.action(
|
|
161
|
-
|
|
162
|
-
|
|
278
|
+
.action(
|
|
279
|
+
async (opts: {
|
|
280
|
+
interval?: string;
|
|
281
|
+
background?: boolean;
|
|
282
|
+
killOthers?: boolean;
|
|
283
|
+
json?: boolean;
|
|
284
|
+
}) => {
|
|
285
|
+
await runWatch(opts);
|
|
286
|
+
},
|
|
287
|
+
);
|
|
163
288
|
}
|
|
164
289
|
|
|
165
290
|
/**
|
package/src/config.ts
CHANGED
|
@@ -90,6 +90,7 @@ export const DEFAULT_CONFIG: OverstoryConfig = {
|
|
|
90
90
|
rpcTimeoutMs: 5_000, // 5 seconds for RPC getState() calls
|
|
91
91
|
triageTimeoutMs: 30_000, // 30 seconds for Tier 1 AI triage calls
|
|
92
92
|
maxEscalationLevel: 3, // Maximum escalation level before termination
|
|
93
|
+
notifyParentOnDeath: true, // Send worker_died mail to parent when watchdog terminates a child
|
|
93
94
|
},
|
|
94
95
|
coordinator: {
|
|
95
96
|
exitTriggers: {
|
|
@@ -633,6 +634,16 @@ function validateConfig(config: OverstoryConfig): void {
|
|
|
633
634
|
}
|
|
634
635
|
}
|
|
635
636
|
|
|
637
|
+
if (
|
|
638
|
+
config.watchdog.notifyParentOnDeath !== undefined &&
|
|
639
|
+
typeof config.watchdog.notifyParentOnDeath !== "boolean"
|
|
640
|
+
) {
|
|
641
|
+
throw new ValidationError("watchdog.notifyParentOnDeath must be a boolean", {
|
|
642
|
+
field: "watchdog.notifyParentOnDeath",
|
|
643
|
+
value: config.watchdog.notifyParentOnDeath,
|
|
644
|
+
});
|
|
645
|
+
}
|
|
646
|
+
|
|
636
647
|
// mulch.primeFormat must be one of the valid options
|
|
637
648
|
const validFormats = ["markdown", "xml", "json"] as const;
|
|
638
649
|
if (!validFormats.includes(config.mulch.primeFormat as (typeof validFormats)[number])) {
|
|
@@ -774,6 +785,18 @@ function validateConfig(config: OverstoryConfig): void {
|
|
|
774
785
|
}
|
|
775
786
|
}
|
|
776
787
|
|
|
788
|
+
// runtime.claudeHeadlessByDefault: must be a boolean if present
|
|
789
|
+
if (
|
|
790
|
+
config.runtime?.claudeHeadlessByDefault !== undefined &&
|
|
791
|
+
typeof config.runtime.claudeHeadlessByDefault !== "boolean"
|
|
792
|
+
) {
|
|
793
|
+
process.stderr.write(
|
|
794
|
+
`[overstory] WARNING: runtime.claudeHeadlessByDefault must be a boolean. Got: ${typeof config
|
|
795
|
+
.runtime.claudeHeadlessByDefault}. Ignoring.\n`,
|
|
796
|
+
);
|
|
797
|
+
config.runtime.claudeHeadlessByDefault = undefined;
|
|
798
|
+
}
|
|
799
|
+
|
|
777
800
|
if (config.runtime?.capabilities) {
|
|
778
801
|
for (const [cap, runtimeName] of Object.entries(config.runtime.capabilities)) {
|
|
779
802
|
if (runtimeName !== undefined && (typeof runtimeName !== "string" || runtimeName === "")) {
|
|
@@ -410,6 +410,112 @@ describe("checkConsistency", () => {
|
|
|
410
410
|
expect(checks.find((c) => c.name === "missing-tmux")?.status).toBe("pass");
|
|
411
411
|
});
|
|
412
412
|
|
|
413
|
+
test("orphan-spawns: terminal state with live pid is flagged", async () => {
	// Seed a session that has already reached a terminal state ("completed")
	// and has no tmux session, but still records a pid.
	const agentName = "orphaned-agent";
	const sessionStore = createSessionStore(join(overstoryDir, "sessions.db"));
	sessionStore.upsert({
		id: "session-1",
		agentName,
		capability: "builder",
		worktreePath: join(overstoryDir, "worktrees", agentName),
		branchName: `overstory/${agentName}/test-123`,
		taskId: "test-123",
		tmuxSession: "",
		state: "completed",
		pid: 4242,
		parentAgent: null,
		depth: 0,
		runId: null,
		startedAt: new Date().toISOString(),
		lastActivity: new Date().toISOString(),
		escalationLevel: 0,
		stalledSince: null,
		transcriptPath: null,
	});
	sessionStore.close();

	// The recorded pid is reported alive; tmux lists no sessions.
	mockIsProcessAlive.mockReturnValue(true);
	mockListSessions.mockResolvedValue([]);

	const results = await checkConsistency(config, overstoryDir, mockDeps);
	const orphan = results.find((check) => check.name === "orphan-spawns");

	expect(orphan).toBeDefined();
	expect(orphan?.status).toBe("warn");
	expect(orphan?.message).toContain("1 orphaned spawn");
	expect(orphan?.details?.[0]).toContain("orphaned-agent");
	expect(orphan?.fixable).toBe(true);
});
|
|
450
|
+
|
|
451
|
+
test("orphan-spawns: tmux dead but pid alive is flagged", async () => {
	// Seed a "working" session that names a tmux session and records a pid.
	const agentName = "tmux-dead-agent";
	const sessionStore = createSessionStore(join(overstoryDir, "sessions.db"));
	sessionStore.upsert({
		id: "session-1",
		agentName,
		capability: "builder",
		worktreePath: join(overstoryDir, "worktrees", agentName),
		branchName: `overstory/${agentName}/test-123`,
		taskId: "test-123",
		tmuxSession: `overstory-testproject-${agentName}`,
		state: "working",
		pid: 4242,
		parentAgent: null,
		depth: 0,
		runId: null,
		startedAt: new Date().toISOString(),
		lastActivity: new Date().toISOString(),
		escalationLevel: 0,
		stalledSince: null,
		transcriptPath: null,
	});
	sessionStore.close();

	// The pid is reported alive while the tmux server reports no matching session.
	mockIsProcessAlive.mockReturnValue(true);
	mockListSessions.mockResolvedValue([]);

	const results = await checkConsistency(config, overstoryDir, mockDeps);
	const orphan = results.find((check) => check.name === "orphan-spawns");

	expect(orphan?.status).toBe("warn");
	expect(orphan?.details?.[0]).toContain("tmux session");
});
|
|
486
|
+
|
|
487
|
+
test("orphan-spawns: passes when terminal-state pid is dead", async () => {
	// Seed a "completed" session with a recorded pid and no tmux session.
	const agentName = "clean-completed";
	const sessionStore = createSessionStore(join(overstoryDir, "sessions.db"));
	sessionStore.upsert({
		id: "session-1",
		agentName,
		capability: "builder",
		worktreePath: join(overstoryDir, "worktrees", agentName),
		branchName: `overstory/${agentName}/test-123`,
		taskId: "test-123",
		tmuxSession: "",
		state: "completed",
		pid: 4242,
		parentAgent: null,
		depth: 0,
		runId: null,
		startedAt: new Date().toISOString(),
		lastActivity: new Date().toISOString(),
		escalationLevel: 0,
		stalledSince: null,
		transcriptPath: null,
	});
	sessionStore.close();

	// The recorded pid is reported dead, so nothing should count as an orphan.
	mockIsProcessAlive.mockReturnValue(false);

	const results = await checkConsistency(config, overstoryDir, mockDeps);
	const orphan = results.find((check) => check.name === "orphan-spawns");

	expect(orphan?.status).toBe("pass");
});
|
|
518
|
+
|
|
413
519
|
test("handles tmux not installed gracefully", async () => {
|
|
414
520
|
// Mock tmux listing to throw an error
|
|
415
521
|
mockListSessions.mockRejectedValue(new Error("tmux: command not found"));
|
|
@@ -212,7 +212,9 @@ export async function checkConsistency(
|
|
|
212
212
|
|
|
213
213
|
// 8. Check for SessionStore entries with missing tmux sessions
|
|
214
214
|
const existingTmuxNames = new Set(tmuxSessions.map((s) => s.name));
|
|
215
|
-
const missingTmux = liveSessions.filter(
|
|
215
|
+
const missingTmux = liveSessions.filter(
|
|
216
|
+
(s) => s.tmuxSession.length > 0 && !existingTmuxNames.has(s.tmuxSession),
|
|
217
|
+
);
|
|
216
218
|
|
|
217
219
|
if (missingTmux.length > 0) {
|
|
218
220
|
checks.push({
|
|
@@ -232,6 +234,51 @@ export async function checkConsistency(
|
|
|
232
234
|
});
|
|
233
235
|
}
|
|
234
236
|
|
|
237
|
+
// 8b. Check for orphaned claude spawn PIDs (overstory-505d).
|
|
238
|
+
//
|
|
239
|
+
// An orphan is a session whose pid is still alive but should not be:
|
|
240
|
+
// - the session reached a terminal state (completed/zombie) yet the
|
|
241
|
+
// spawn didn't exit, or
|
|
242
|
+
// - the tmux container is gone but the claude child survived (was
|
|
243
|
+
// reparented to init when its bash wrapper got SIGHUP).
|
|
244
|
+
// Run `ov clean --all` to reap. Distinct from `dead-pids` (the inverse:
|
|
245
|
+
// session is live but its pid already died).
|
|
246
|
+
const orphanedSpawns: Array<{ session: AgentSession; reason: string }> = [];
|
|
247
|
+
for (const s of storeSessions) {
|
|
248
|
+
if (s.pid === null || !isProcessAliveFn(s.pid)) continue;
|
|
249
|
+
if (s.state === "completed" || s.state === "zombie") {
|
|
250
|
+
orphanedSpawns.push({
|
|
251
|
+
session: s,
|
|
252
|
+
reason: `state=${s.state} but pid ${s.pid} still alive`,
|
|
253
|
+
});
|
|
254
|
+
continue;
|
|
255
|
+
}
|
|
256
|
+
if (s.tmuxSession.length > 0 && !existingTmuxNames.has(s.tmuxSession)) {
|
|
257
|
+
orphanedSpawns.push({
|
|
258
|
+
session: s,
|
|
259
|
+
reason: `tmux session "${s.tmuxSession}" missing but pid ${s.pid} alive`,
|
|
260
|
+
});
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
if (orphanedSpawns.length > 0) {
|
|
265
|
+
checks.push({
|
|
266
|
+
name: "orphan-spawns",
|
|
267
|
+
category: "consistency",
|
|
268
|
+
status: "warn",
|
|
269
|
+
message: `Found ${orphanedSpawns.length} orphaned spawn process(es) — run "ov clean --all" to reap`,
|
|
270
|
+
details: orphanedSpawns.map(({ session, reason }) => `${session.agentName}: ${reason}`),
|
|
271
|
+
fixable: true,
|
|
272
|
+
});
|
|
273
|
+
} else {
|
|
274
|
+
checks.push({
|
|
275
|
+
name: "orphan-spawns",
|
|
276
|
+
category: "consistency",
|
|
277
|
+
status: "pass",
|
|
278
|
+
message: "No orphaned spawn processes detected",
|
|
279
|
+
});
|
|
280
|
+
}
|
|
281
|
+
|
|
235
282
|
// 9. Check reviewer-to-builder ratio per lead
|
|
236
283
|
const parentGroups = new Map<string, { builders: number; reviewers: number }>();
|
|
237
284
|
for (const session of storeSessions) {
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
2
|
+
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import type { OverstoryConfig } from "../types.ts";
|
|
6
|
+
import { checkServe } from "./serve.ts";
|
|
7
|
+
|
|
8
|
+
describe("checkServe", () => {
	let tempDir: string;
	let mockConfig: OverstoryConfig;

	/** Path of the ui/dist directory inside the temporary project root. */
	const distDir = (): string => join(tempDir, "ui", "dist");

	/** Run checkServe against the temp project and return the check with the given name. */
	const findCheck = async (name: string) => {
		const results = await checkServe(mockConfig, tempDir);
		return results.find((check) => check.name === name);
	};

	beforeEach(() => {
		// Every test gets a fresh, empty project root.
		tempDir = mkdtempSync(join(tmpdir(), "overstory-serve-doctor-test-"));
		mockConfig = {
			project: { name: "test", root: tempDir, canonicalBranch: "main" },
			agents: {
				manifestPath: "",
				baseDir: "",
				maxConcurrent: 5,
				staggerDelayMs: 100,
				maxDepth: 2,
				maxSessionsPerRun: 0,
				maxAgentsPerLead: 5,
			},
			worktrees: { baseDir: "" },
			taskTracker: { backend: "auto", enabled: true },
			mulch: { enabled: true, domains: [], primeFormat: "markdown" },
			merge: { aiResolveEnabled: false, reimagineEnabled: false },
			providers: {
				anthropic: { type: "native" },
			},
			watchdog: {
				tier0Enabled: false,
				tier0IntervalMs: 30000,
				tier1Enabled: false,
				tier2Enabled: false,
				staleThresholdMs: 300000,
				zombieThresholdMs: 600000,
				nudgeIntervalMs: 60000,
			},
			models: {},
			logging: { verbose: false, redactSecrets: true },
		};
	});

	afterEach(() => {
		rmSync(tempDir, { recursive: true, force: true });
	});

	test("ui/dist missing — returns warn about missing build", async () => {
		const dist = await findCheck("serve ui/dist");

		expect(dist).toBeDefined();
		expect(dist?.status).toBe("warn");
		expect(dist?.message).toContain("ui/dist not found");
		expect(dist?.details?.some((d) => d.includes("ui/dist"))).toBe(true);
	});

	test("ui/dist exists but index.html missing — returns warn about incomplete build", async () => {
		mkdirSync(distDir(), { recursive: true });

		const dist = await findCheck("serve ui/dist");

		expect(dist).toBeDefined();
		expect(dist?.status).toBe("warn");
		expect(dist?.message).toContain("index.html is missing");
	});

	test("ui/dist with index.html — returns pass", async () => {
		mkdirSync(distDir(), { recursive: true });
		writeFileSync(join(distDir(), "index.html"), "<html></html>");

		const dist = await findCheck("serve ui/dist");

		expect(dist).toBeDefined();
		expect(dist?.status).toBe("pass");
		expect(dist?.message).toContain("index.html");
	});

	test("port check included in results", async () => {
		const portCheck = await findCheck("serve port");

		expect(portCheck).toBeDefined();
		// Server not running — should warn (or pass if something happens to be on the default port)
		const status = portCheck?.status;
		expect(status === "warn" || status === "pass").toBe(true);
	});

	test("returns exactly 2 checks (ui/dist + port)", async () => {
		const results = await checkServe(mockConfig, tempDir);

		expect(results).toHaveLength(2);
		expect(results.every((check) => check.category === "serve")).toBe(true);
	});
});
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { DEFAULT_SERVE_PORT } from "../commands/serve.ts";
|
|
4
|
+
import type { DoctorCheck, DoctorCheckFn } from "./types.ts";
|
|
5
|
+
|
|
6
|
+
/**
 * Doctor checks for the `ov serve` subsystem.
 *
 * Produces exactly two checks, in this order:
 *  1. "serve ui/dist" — whether `<project root>/ui/dist` exists and contains
 *     `index.html`.
 *  2. "serve port" — whether probePort reports a server on 127.0.0.1 at
 *     DEFAULT_SERVE_PORT.
 *
 * The second argument (the overstory directory) is accepted to satisfy
 * DoctorCheckFn but is not used; paths are resolved from config.project.root.
 */
export const checkServe: DoctorCheckFn = async (config, _overstoryDir): Promise<DoctorCheck[]> => {
	// --- Check 1: built UI assets -------------------------------------------
	const distDir = join(config.project.root, "ui", "dist");

	let distCheck: DoctorCheck;
	if (!existsSync(distDir)) {
		distCheck = {
			name: "serve ui/dist",
			category: "serve",
			status: "warn",
			message: "ui/dist not found — run the UI build before starting ov serve",
			details: [`Expected: ${distDir}`],
		};
	} else if (!existsSync(join(distDir, "index.html"))) {
		// The directory is there but its entry point is not.
		distCheck = {
			name: "serve ui/dist",
			category: "serve",
			status: "warn",
			message: "ui/dist exists but index.html is missing — UI build may be incomplete",
			details: [`Expected: ${join(distDir, "index.html")}`],
		};
	} else {
		distCheck = {
			name: "serve ui/dist",
			category: "serve",
			status: "pass",
			message: "ui/dist is present with index.html",
		};
	}

	// --- Check 2: is something answering on the default port? ---------------
	const host = "127.0.0.1";
	const port = DEFAULT_SERVE_PORT;
	const isUp = await probePort(host, port);

	const portCheck: DoctorCheck = isUp
		? {
				name: "serve port",
				category: "serve",
				status: "pass",
				message: `ov serve is reachable on ${host}:${port}`,
			}
		: {
				name: "serve port",
				category: "serve",
				status: "warn",
				message: `ov serve is not running on ${host}:${port}`,
				details: [`Start with: ov serve --port ${port}`],
			};

	return [distCheck, portCheck];
};
|
|
66
|
+
|
|
67
|
+
/**
 * Probe for a serving HTTP endpoint by requesting `/healthz` on the given
 * host and port.
 *
 * The request is aborted after 1 second so the doctor run cannot hang on an
 * unresponsive port.
 *
 * @param host - Host name or address to contact.
 * @param port - TCP port to contact.
 * @returns true when a response arrives with a status below 500; false for
 *   5xx responses and for any failure (connection refused, timeout/abort,
 *   other fetch errors).
 */
async function probePort(host: string, port: number): Promise<boolean> {
	const controller = new AbortController();
	const timeout = setTimeout(() => controller.abort(), 1000);
	try {
		const res = await fetch(`http://${host}:${port}/healthz`, {
			signal: controller.signal,
		});
		// Only the status line matters. Cancel the unread body so the
		// underlying connection is released instead of lingering.
		try {
			await res.body?.cancel();
		} catch {
			// Body already closed or aborted — irrelevant to the probe result.
		}
		// `res.ok` (2xx) already implies status < 500, so one comparison suffices.
		return res.status < 500;
	} catch {
		return false;
	} finally {
		clearTimeout(timeout);
	}
}
|