@os-eco/overstory-cli 0.9.4 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/agents/builder.md +19 -9
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +204 -87
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +219 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/mail-poll-detect.test.ts +153 -0
- package/src/agents/mail-poll-detect.ts +73 -0
- package/src/agents/overlay.test.ts +60 -4
- package/src/agents/overlay.ts +63 -8
- package/src/agents/scope-detect.test.ts +190 -0
- package/src/agents/scope-detect.ts +146 -0
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +2312 -0
- package/src/agents/turn-runner.ts +1383 -0
- package/src/commands/agents.ts +9 -0
- package/src/commands/clean.ts +54 -0
- package/src/commands/coordinator.test.ts +254 -0
- package/src/commands/coordinator.ts +273 -8
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +14 -4
- package/src/commands/doctor.ts +3 -1
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +187 -11
- package/src/commands/log.ts +171 -71
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +230 -1
- package/src/commands/merge.ts +68 -12
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +177 -1
- package/src/commands/sling.ts +243 -71
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +255 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/watch.test.ts +43 -0
- package/src/commands/watch.ts +153 -28
- package/src/config.ts +23 -0
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +48 -1
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +57 -6
- package/src/insights/quality-gates.test.ts +141 -0
- package/src/insights/quality-gates.ts +156 -0
- package/src/json.ts +29 -0
- package/src/logging/theme.ts +4 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/merge/predict.test.ts +387 -0
- package/src/merge/predict.ts +249 -0
- package/src/merge/resolver.ts +1 -1
- package/src/mulch/client.ts +3 -3
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +657 -29
- package/src/sessions/store.ts +286 -23
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +107 -2
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1607 -376
- package/src/watchdog/daemon.ts +462 -88
- package/src/watchdog/health.test.ts +282 -0
- package/src/watchdog/health.ts +126 -27
- package/src/worktree/manager.test.ts +218 -1
- package/src/worktree/manager.ts +55 -0
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +28 -0
- package/src/worktree/tmux.ts +27 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +5 -2
package/src/types.ts
CHANGED
|
@@ -108,6 +108,7 @@ export interface OverstoryConfig {
|
|
|
108
108
|
rpcTimeoutMs?: number; // Timeout for RPC getState() calls (default 5_000)
|
|
109
109
|
triageTimeoutMs?: number; // Timeout for Tier 1 AI triage calls (default 30_000)
|
|
110
110
|
maxEscalationLevel?: number; // Maximum escalation level before termination (default 3)
|
|
111
|
+
notifyParentOnDeath?: boolean; // Send synthetic worker_died mail to parent on watchdog termination (default true)
|
|
111
112
|
};
|
|
112
113
|
models: Partial<Record<string, ModelRef>>;
|
|
113
114
|
logging: {
|
|
@@ -141,6 +142,13 @@ export interface OverstoryConfig {
|
|
|
141
142
|
* Default: 0 (no delay).
|
|
142
143
|
*/
|
|
143
144
|
shellInitDelayMs?: number;
|
|
145
|
+
/**
|
|
146
|
+
* Project-level default for spawning Claude Code agents in headless mode
|
|
147
|
+
* (Bun.spawn + stream-json) instead of the tmux interactive runtime.
|
|
148
|
+
* Per-spawn `--headless` / `--no-headless` flags on `ov sling` override this.
|
|
149
|
+
* Default: false (tmux).
|
|
150
|
+
*/
|
|
151
|
+
claudeHeadlessByDefault?: boolean;
|
|
144
152
|
};
|
|
145
153
|
}
|
|
146
154
|
|
|
@@ -179,7 +187,49 @@ export type Capability = (typeof SUPPORTED_CAPABILITIES)[number];
|
|
|
179
187
|
|
|
180
188
|
// === Agent Session ===
|
|
181
189
|
|
|
182
|
-
|
|
190
|
+
/**
 * Lifecycle states an agent session can be in.
 *
 * The legacy `working` state is split into two spawn-per-turn substates so the
 * UI can tell a worker that is actively executing a turn apart from one that
 * is idling between mail batches (overstory-3087):
 *
 * - `in_turn`: the turn-runner has observed at least one parser event from a
 *   live claude subprocess, i.e. the agent is mid-execution.
 * - `between_turns`: the turn-runner finished a turn without a terminal mail.
 *   The agent is alive (process gone, session pinned) and is waiting for the
 *   next mail batch to spawn a fresh turn.
 *
 * tmux / long-lived headless agents (coordinator, orchestrator, monitor,
 * sapling) have no per-turn boundary, so `working` remains their active
 * state. Spawn-per-turn workers (builder/scout/reviewer/lead/merger under the
 * headless default) move through in_turn ↔ between_turns instead.
 */
export type AgentState =
	// Start-up and the long-lived active state.
	| "booting" | "working"
	// Spawn-per-turn substates of `working`.
	| "in_turn" | "between_turns"
	// Remaining lifecycle states.
	| "completed" | "stalled" | "zombie";
|
|
217
|
+
|
|
218
|
+
/**
 * Outcome of a guarded state transition attempt
 * (`SessionStore.tryTransitionState`), discriminated by `ok`.
 *
 * Failures (`ok: false`) carry a `reason`:
 * - `not_found`: no session exists for the given name.
 * - `illegal_transition`: a session exists, but the matrix forbids
 *   prev → attempted.
 *
 * Whenever `prev` is present it is the state observed by the SQL CAS. For
 * `illegal_transition` that is the state that blocked the write, which may
 * differ from what the caller read if another writer landed first.
 */
export type TransitionOutcome =
	| {
			ok: true;
			prev: AgentState;
			next: AgentState;
	  }
	| {
			ok: false;
			reason: "not_found";
			attempted: AgentState;
	  }
	| {
			ok: false;
			reason: "illegal_transition";
			prev: AgentState;
			attempted: AgentState;
	  };
|
|
183
233
|
|
|
184
234
|
export interface AgentSession {
|
|
185
235
|
id: string; // Unique session ID
|
|
@@ -200,6 +250,7 @@ export interface AgentSession {
|
|
|
200
250
|
stalledSince: string | null; // ISO timestamp when agent first entered stalled state
|
|
201
251
|
transcriptPath: string | null; // Runtime-provided transcript JSONL path (decoupled from ~/.claude/)
|
|
202
252
|
promptVersion?: string | null; // Canopy prompt version used at sling time (e.g. "builder@17")
|
|
253
|
+
claudeSessionId?: string | null; // Runtime-provided session_id (Claude stream-json), eagerly pinned on first event
|
|
203
254
|
}
|
|
204
255
|
|
|
205
256
|
// === Agent Identity ===
|
|
@@ -225,6 +276,7 @@ export type MailSemanticType = "status" | "question" | "result" | "error";
|
|
|
225
276
|
/** Protocol message types for structured agent coordination. */
|
|
226
277
|
export type MailProtocolType =
|
|
227
278
|
| "worker_done"
|
|
279
|
+
| "worker_died"
|
|
228
280
|
| "merge_ready"
|
|
229
281
|
| "merged"
|
|
230
282
|
| "merge_failed"
|
|
@@ -244,6 +296,7 @@ export const MAIL_MESSAGE_TYPES: readonly MailMessageType[] = [
|
|
|
244
296
|
"result",
|
|
245
297
|
"error",
|
|
246
298
|
"worker_done",
|
|
299
|
+
"worker_died",
|
|
247
300
|
"merge_ready",
|
|
248
301
|
"merged",
|
|
249
302
|
"merge_failed",
|
|
@@ -278,6 +331,33 @@ export interface WorkerDonePayload {
|
|
|
278
331
|
filesModified: string[];
|
|
279
332
|
}
|
|
280
333
|
|
|
334
|
+
/**
 * Payload of the `worker_died` protocol mail: tells a parent that one of its
 * children was terminated.
 *
 * The mail is synthetic — the Tier 0 daemon injects it when it transitions a
 * worker to `zombie` (overstory-c111). Without it the parent (typically a lead
 * waiting for `worker_done` from this child) would block indefinitely on mail
 * that will never arrive. The parent picks it up on its next mail-injector
 * tick and decides whether to retry, escalate, or report up.
 */
export interface WorkerDiedPayload {
	agentName: string;
	capability: string;
	taskId: string;

	/** Why the watchdog or runner terminated the child (e.g. "Process terminated"). */
	reason: string;

	/** ISO timestamp of the child's last observed activity. */
	lastActivity: string;

	/**
	 * Which component detected the failure.
	 *
	 * - `tier0` / `tier1`: the watchdog daemon detected a dead or stuck process
	 *   out-of-band.
	 * - `runner`: the per-turn runner observed an in-band failure — either an
	 *   abort/stall that forced SIGTERM/SIGKILL, or a clean exit without the
	 *   capability's terminal mail (silent-no-op, overstory-4159 /
	 *   overstory-c772).
	 */
	terminatedBy: "tier0" | "tier1" | "runner";
}
|
|
360
|
+
|
|
281
361
|
/** Supervisor signals branch is verified and ready for merge. */
|
|
282
362
|
export interface MergeReadyPayload {
|
|
283
363
|
branch: string;
|
|
@@ -349,6 +429,7 @@ export interface DecisionGatePayload {
|
|
|
349
429
|
/** Maps protocol message types to their payload interfaces. */
|
|
350
430
|
export interface MailPayloadMap {
|
|
351
431
|
worker_done: WorkerDonePayload;
|
|
432
|
+
worker_died: WorkerDiedPayload;
|
|
352
433
|
merge_ready: MergeReadyPayload;
|
|
353
434
|
merged: MergedPayload;
|
|
354
435
|
merge_failed: MergeFailedPayload;
|
|
@@ -391,6 +472,13 @@ export interface OverlayConfig {
|
|
|
391
472
|
qualityGates?: QualityGate[];
|
|
392
473
|
/** Relative path to the instruction file within the worktree (runtime-specific). Defaults to .claude/CLAUDE.md. */
|
|
393
474
|
instructionPath?: string;
|
|
475
|
+
/**
|
|
476
|
+
* Names of sibling agents dispatched in parallel that may share file scope
|
|
477
|
+
* with this agent. When set, the overlay renders a "Parallel Siblings"
|
|
478
|
+
* section with rebase-before-merge_ready guidance (overstory-f76a). Empty
|
|
479
|
+
* or unset → no overlay section.
|
|
480
|
+
*/
|
|
481
|
+
siblings?: string[];
|
|
394
482
|
}
|
|
395
483
|
|
|
396
484
|
// === Merge Queue ===
|
|
@@ -436,6 +524,23 @@ export interface ConflictHistory {
|
|
|
436
524
|
predictedConflictFiles: string[];
|
|
437
525
|
}
|
|
438
526
|
|
|
527
|
+
/**
 * Prediction of how `ov merge` would resolve a branch, computed without side
 * effects.
 *
 * `predictConflicts` (src/merge/predict.ts) produces it without touching HEAD,
 * the working tree, or the merge lock. It is surfaced through
 * `ov merge --dry-run` so a lead/operator/greenhouse can branch on
 * `wouldRequireAgent`.
 */
export interface ConflictPrediction {
	/** Tier `ov merge` would land in if it were invoked now. */
	predictedTier: ResolutionTier;

	/** Files that would conflict; empty for a clean merge. */
	conflictFiles: string[];

	/** True iff `predictedTier` is "ai-resolve" or "reimagine" (Tier 3+). */
	wouldRequireAgent: boolean;

	/** Short, operator-readable explanation for the predicted tier. */
	reason: string;
}
|
|
543
|
+
|
|
439
544
|
// === Watchdog ===
|
|
440
545
|
|
|
441
546
|
export interface HealthCheck {
|
|
@@ -446,7 +551,7 @@ export interface HealthCheck {
|
|
|
446
551
|
pidAlive: boolean | null; // null when pid is unavailable
|
|
447
552
|
lastActivity: string;
|
|
448
553
|
state: AgentState;
|
|
449
|
-
action: "none" | "escalate" | "terminate" | "investigate";
|
|
554
|
+
action: "none" | "escalate" | "terminate" | "investigate" | "complete";
|
|
450
555
|
/** Describes any conflict between observable state and recorded state. */
|
|
451
556
|
reconciliationNote: string | null;
|
|
452
557
|
}
|
package/src/utils/pid.test.ts
CHANGED
|
@@ -3,7 +3,7 @@ import { mkdtemp } from "node:fs/promises";
|
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { join } from "node:path";
|
|
5
5
|
import { cleanupTempDir } from "../test-helpers.ts";
|
|
6
|
-
import { readPidFile, removePidFile, writePidFile } from "./pid.ts";
|
|
6
|
+
import { acquirePidLock, readPidFile, removePidFile, writePidFile } from "./pid.ts";
|
|
7
7
|
|
|
8
8
|
let tempDir: string;
|
|
9
9
|
|
|
@@ -66,3 +66,87 @@ describe("removePidFile", () => {
|
|
|
66
66
|
// No throw = pass
|
|
67
67
|
});
|
|
68
68
|
});
|
|
69
|
+
|
|
70
|
+
describe("acquirePidLock", () => {
	// Injected liveness probes: report every PID as running / as gone.
	const alwaysAlive = (_pid: number): boolean => true;
	const alwaysDead = (_pid: number): boolean => false;

	// Lock file most cases contend on. `tempDir` is assigned outside this
	// block, so the path is resolved lazily inside each test.
	const lockPath = (): string => join(tempDir, "lock.pid");

	test("acquires when no lock file exists", async () => {
		const target = lockPath();
		const outcome = await acquirePidLock(target, 1234, alwaysAlive);
		expect(outcome.acquired).toBe(true);
		expect(await readPidFile(target)).toBe(1234);
	});

	test("creates parent directory if missing", async () => {
		const target = join(tempDir, "nested", "deeper", "lock.pid");
		const outcome = await acquirePidLock(target, 555, alwaysAlive);
		expect(outcome.acquired).toBe(true);
		expect(await readPidFile(target)).toBe(555);
	});

	test("refuses when a live foreign PID owns the lock", async () => {
		const target = lockPath();
		await Bun.write(target, "9999\n");
		const outcome = await acquirePidLock(target, 1234, alwaysAlive);
		expect(outcome.acquired).toBe(false);
		if (!outcome.acquired) {
			expect(outcome.existingPid).toBe(9999);
		}
		// The foreign owner's file must be left untouched.
		expect(await readPidFile(target)).toBe(9999);
	});

	test("idempotent when file already contains caller's own PID", async () => {
		const target = lockPath();
		await Bun.write(target, "1234\n");
		// The probe would report 1234 as alive, but the own-PID check is
		// expected to win and accept the lock.
		const outcome = await acquirePidLock(target, 1234, alwaysAlive);
		expect(outcome.acquired).toBe(true);
		expect(await readPidFile(target)).toBe(1234);
	});

	test("reclaims stale lock with dead PID", async () => {
		const target = lockPath();
		await Bun.write(target, "9999\n");
		const outcome = await acquirePidLock(target, 1234, alwaysDead);
		expect(outcome.acquired).toBe(true);
		expect(await readPidFile(target)).toBe(1234);
	});

	test("reclaims unreadable/corrupted lock file", async () => {
		const target = lockPath();
		await Bun.write(target, "garbage-not-a-pid\n");
		const outcome = await acquirePidLock(target, 1234, alwaysAlive);
		expect(outcome.acquired).toBe(true);
		expect(await readPidFile(target)).toBe(1234);
	});

	test("two simultaneous acquirers — only one wins", async () => {
		const target = lockPath();
		const outcomes = await Promise.all([
			acquirePidLock(target, 1111, alwaysAlive),
			acquirePidLock(target, 2222, alwaysAlive),
		]);
		const winners = outcomes.filter((r) => r.acquired);
		const losers = outcomes.filter((r) => !r.acquired);
		expect(winners.length).toBe(1);
		expect(losers.length).toBe(1);
		const [loser] = losers;
		if (loser && !loser.acquired) {
			expect([1111, 2222]).toContain(loser.existingPid);
		}
	});

	test("two simultaneous acquirers — file content matches the winner", async () => {
		const target = lockPath();
		const [first, second] = await Promise.all([
			acquirePidLock(target, 1111, alwaysAlive),
			acquirePidLock(target, 2222, alwaysAlive),
		]);
		const fileContent = await readPidFile(target);
		// -1 can never equal a PID on disk, so "nobody won" fails the assertion.
		let winnerPid = -1;
		if (first.acquired) {
			winnerPid = 1111;
		} else if (second.acquired) {
			winnerPid = 2222;
		}
		expect(fileContent).toBe(winnerPid);
	});
});
|
package/src/utils/pid.ts
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* PID file management for daemon processes.
|
|
3
3
|
*/
|
|
4
|
-
import {
|
|
4
|
+
import { randomUUID } from "node:crypto";
|
|
5
|
+
import { link, mkdir, unlink, writeFile } from "node:fs/promises";
|
|
6
|
+
import { dirname } from "node:path";
|
|
5
7
|
|
|
6
8
|
/**
|
|
7
9
|
* Read the PID from a PID file.
|
|
@@ -43,3 +45,86 @@ export async function removePidFile(pidFilePath: string): Promise<void> {
|
|
|
43
45
|
// File may already be gone — not an error
|
|
44
46
|
}
|
|
45
47
|
}
|
|
48
|
+
|
|
49
|
+
/**
 * Outcome of `acquirePidLock`, discriminated by `acquired`.
 *
 * - `acquired: true` — the caller owns the lock and is responsible for
 *   removing the PID file on shutdown.
 * - `acquired: false` — a live foreign process already owns the lock and the
 *   caller must not start. `existingPid` is that live owner;
 *   `existingPid === -1` means the lock file existed but was unreadable and
 *   could not be reclaimed.
 */
export type AcquirePidLockResult =
	| { acquired: true }
	| {
			acquired: false;
			existingPid: number;
	  };
|
|
60
|
+
|
|
61
|
+
/**
 * Atomically acquire a PID-file lock.
 *
 * Uses the write-temp-then-link pattern: the PID is first written to a unique
 * temp file, which is then hard-linked to the lock path. The lock file
 * therefore appears at its final path with its content already present, so a
 * competing reader never observes an empty or half-written lock.
 *
 * Outcomes:
 * - Lock file does not exist → created via link(); the caller owns the lock.
 * - Lock file holds the caller's own PID → idempotent acquire (the caller
 *   already owns it, e.g. a background-mode parent wrote the child's PID
 *   before spawning it).
 * - Lock file holds a PID that `isAlive` reports as running → refused;
 *   `existingPid` is that PID.
 * - Lock file holds a dead PID or is unreadable → the file is unlinked and
 *   link() is retried once. If the retry also finds a lock it cannot take, the
 *   call returns `acquired: false` (with the live owner's PID, or -1 when no
 *   live owner could be identified) and leaves that file in place.
 *
 * The parent directory is created if missing (matches the implicit Bun.write
 * behavior the legacy writePidFile relied on).
 *
 * NOTE(review): reading a stale lock and unlinking it are separate steps, so a
 * competitor that replaces the lock in between can still have its fresh lock
 * removed. That window pre-dates this change and is not closed here.
 *
 * @param pidFilePath - Final path of the lock file.
 * @param pid - PID to record as the lock owner.
 * @param isAlive - Liveness probe used to tell a stale lock from a held one.
 * @returns Whether the lock is held by `pid` after the call.
 * @throws Any filesystem error other than `EEXIST` from link(), plus errors
 *   from creating the directory or staging the temp file.
 */
export async function acquirePidLock(
	pidFilePath: string,
	pid: number,
	isAlive: (pid: number) => boolean,
): Promise<AcquirePidLockResult> {
	await mkdir(dirname(pidFilePath), { recursive: true });

	const tempPath = `${pidFilePath}.tmp.${pid}.${randomUUID()}`;
	// First try plus exactly one retry after reclaiming a stale lock.
	const maxAttempts = 2;

	try {
		// Staged inside the try so a failed or partial write is still cleaned
		// up by the finally block instead of leaking a temp file.
		await writeFile(tempPath, `${pid}\n`);

		for (let attempt = 1; attempt <= maxAttempts; attempt++) {
			try {
				await link(tempPath, pidFilePath);
				return { acquired: true };
			} catch (err: unknown) {
				const code = (err as NodeJS.ErrnoException | undefined)?.code;
				if (code !== "EEXIST") {
					throw err;
				}
			}

			// The lock path is occupied — find out by whom.
			const existing = await readPidFile(pidFilePath);
			if (existing === pid) {
				// Idempotent: caller already owns it (parent pre-wrote child PID).
				return { acquired: true };
			}
			if (existing !== null && isAlive(existing)) {
				return { acquired: false, existingPid: existing };
			}

			// Stale (dead PID) or unreadable. Only unlink when a retry will
			// follow; on the last attempt removing the file could not benefit
			// this caller and would just destroy whatever is there.
			if (attempt === maxAttempts) {
				return { acquired: false, existingPid: -1 };
			}
			await removePidFile(pidFilePath);
		}

		// Not reachable (the loop always returns or throws); keeps every code
		// path returning a value for the type checker.
		return { acquired: false, existingPid: -1 };
	} finally {
		// Drop the temp name. On success the lock path keeps the data alive
		// through its own hard link.
		await unlink(tempPath).catch(() => {});
	}
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { findRunningWatchdogProcesses } from "./process-scan.ts";
|
|
3
|
+
|
|
4
|
+
describe("findRunningWatchdogProcesses", () => {
	test("returns an array (does not throw)", async () => {
		const found = await findRunningWatchdogProcesses();
		expect(Array.isArray(found)).toBe(true);
		// What is running on the host is unknown, so only the shape of each
		// entry is checked: a positive numeric pid and a string command.
		for (const entry of found) {
			expect(typeof entry.pid).toBe("number");
			expect(entry.pid).toBeGreaterThan(0);
			expect(typeof entry.command).toBe("string");
		}
	});

	test("excludes own process even if command matches", async () => {
		// This process runs `bun test ...`, not `ov watch`, so it would not
		// match anyway; the check still pins that our own PID never shows up.
		const self = process.pid;
		const found = await findRunningWatchdogProcesses();
		for (const entry of found) {
			expect(entry.pid).not.toBe(self);
		}
	});

	test("matches `ov watch` and `bun run ov watch` invocations", async () => {
		// Start a sleeper whose argv is rewritten (via `sh -c` + `exec -a`) to
		// contain the `ov watch` marker tokens, then look for it in the scan.
		const sleeper = Bun.spawn(["sh", "-c", "exec -a 'bun run ov watch' sleep 30"], {
			stdout: "ignore",
			stderr: "ignore",
		});
		try {
			// Give ps a moment to see the new process.
			await Bun.sleep(150);
			const found = await findRunningWatchdogProcesses();
			const hit = found.find((entry) => entry.pid === sleeper.pid);
			// On macOS BSD ps, `exec -a` may or may not change the displayed
			// argv depending on the shell version. Both outcomes are accepted:
			// a detected command must look right, an undetected one does not
			// fail the test (env-dependent).
			if (hit) {
				expect(hit.command).toMatch(/\b(ov|overstory)\b.*\bwatch\b/);
			}
		} finally {
			sleeper.kill("SIGTERM");
			await sleeper.exited.catch(() => {});
		}
	});
});
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Process-table scanning helpers.
|
|
3
|
+
*
|
|
4
|
+
* Used to detect runaway daemon processes that are not tracked by a PID file —
|
|
5
|
+
* for example, the multi-`ov watch` situation observed on 2026-04-30 where
|
|
6
|
+
* three concurrent watchdogs were running because earlier releases had no
|
|
7
|
+
* PID-file exclusion lock.
|
|
8
|
+
*
|
|
9
|
+
* Implementation note: `ps` is used directly because we only need to find
|
|
10
|
+
* processes by command-line substring, and Bun has no built-in process-table
|
|
11
|
+
* API. The `ps -o pid=,command=` form is portable across macOS (BSD) and
|
|
12
|
+
* Linux (procps) for the columns we read.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/** One process-table entry that looks like an `ov watch` daemon. */
export interface WatchdogProcess {
	/** Process ID as reported by `ps`. */
	pid: number;

	/** The full command line as reported by `ps`. */
	command: string;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Find running processes that look like an `ov watch` daemon.
 *
 * Runs `ps -A -o pid=,command=` and keeps every line whose command passes
 * `isWatchdogCommand`. The current process is excluded so callers do not
 * accidentally treat themselves as a foreign daemon.
 *
 * Best-effort by contract: if `ps` cannot be spawned, exits non-zero, or its
 * output cannot be read, an empty list is returned instead of throwing.
 * Callers must not rely on this for correctness, only for diagnostics and
 * `--kill-others`.
 *
 * @returns PID and command line of each matching process (possibly empty).
 */
export async function findRunningWatchdogProcesses(): Promise<WatchdogProcess[]> {
	let text: string;
	try {
		// Spawning is inside the try because it fails outright when `ps` is
		// not available, and the documented behavior for that case is [].
		const proc = Bun.spawn(["ps", "-A", "-o", "pid=,command="], {
			stdout: "pipe",
			stderr: "ignore",
		});
		// Drain stdout while waiting for the exit code, so a large process
		// table cannot fill the pipe and stall `ps` before it exits.
		const [output, exitCode] = await Promise.all([
			new Response(proc.stdout).text(),
			proc.exited,
		]);
		if (exitCode !== 0) {
			return [];
		}
		text = output;
	} catch {
		// Deliberate best-effort: any spawn/read failure means "nothing found".
		return [];
	}

	const ownPid = process.pid;
	const out: WatchdogProcess[] = [];

	for (const rawLine of text.split("\n")) {
		const line = rawLine.trim();
		if (line === "") continue;

		// `ps -o pid=,command=` prints ` 1234 /path/to/binary args...`:
		// optional leading whitespace (trimmed above), the PID, whitespace,
		// then the rest of the command line.
		const match = line.match(/^(\d+)\s+(.+)$/);
		if (!match) continue;
		const [, pidStr, command] = match;
		if (pidStr === undefined || command === undefined) continue;

		const pid = Number.parseInt(pidStr, 10);
		if (!Number.isFinite(pid) || pid <= 0) continue;
		if (pid === ownPid) continue;

		// Match the spawn form `bun run /path/to/ov watch`; direct invocations
		// `overstory watch` and `ov watch` are tolerated as well.
		if (!isWatchdogCommand(command)) continue;

		out.push({ pid, command });
	}

	return out;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Decide whether a `ps` command line looks like an `ov watch` daemon.
 *
 * A command qualifies when a standalone `ov` or `overstory` token is followed,
 * later on the same line, by a standalone `watch` token. Requiring that order
 * rejects unrelated `watch` invocations such as "watch ov.log".
 */
function isWatchdogCommand(command: string): boolean {
	// Cheap pre-check first: no `watch` token at all means no match.
	const mentionsWatch = /\bwatch\b/.test(command);
	return mentionsWatch && /\b(ov|overstory)\b[^\n]*\bwatch\b/.test(command);
}
|