pi-crew 0.9.4 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +592 -0
- package/README.md +55 -3
- package/docs/HARNESS_BACKLOG.md +51 -3
- package/docs/dynamic-workflows.md +315 -2
- package/docs/fix-plan-disabletools-exit-null.md +219 -0
- package/docs/troubleshooting.md +102 -0
- package/package.json +8 -2
- package/src/extension/command-completions.ts +1 -0
- package/src/extension/crew-shortcuts.ts +1 -0
- package/src/extension/register.ts +2 -0
- package/src/extension/registration/commands.ts +3 -0
- package/src/extension/team-tool/doctor.ts +14 -0
- package/src/extension/team-tool/goal.ts +1 -0
- package/src/extension/team-tool/run.ts +4 -0
- package/src/runtime/background-runner.ts +24 -2
- package/src/runtime/chain-runner.ts +1 -0
- package/src/runtime/child-pi.ts +101 -10
- package/src/runtime/crash-recovery.ts +78 -36
- package/src/runtime/deterministic-ast.ts +161 -0
- package/src/runtime/dwf-state-store.ts +97 -0
- package/src/runtime/dynamic-workflow-context.ts +381 -7
- package/src/runtime/dynamic-workflow-runner.ts +94 -2
- package/src/runtime/goal-loop-runner.ts +2 -0
- package/src/runtime/live-session-runtime.ts +1 -0
- package/src/runtime/model-scope.ts +1 -0
- package/src/runtime/peer-dep.ts +1 -0
- package/src/runtime/pi-args.ts +11 -0
- package/src/runtime/resilient-edit.ts +1 -0
- package/src/runtime/result-extractor.ts +72 -7
- package/src/runtime/task-runner.ts +1 -0
- package/src/runtime/team-runner.ts +8 -3
- package/src/runtime/zombie-scanner.ts +297 -0
- package/src/schema/team-tool-schema.ts +28 -0
- package/src/state/contracts.ts +1 -0
- package/src/state/hook-instinct-bridge.ts +3 -0
- package/src/state/state-store.ts +3 -0
- package/src/state/types.ts +9 -0
- package/src/ui/dashboard-panes/progress-pane.ts +5 -0
- package/src/ui/dwf-phase-display.ts +151 -0
- package/src/ui/run-snapshot-cache.ts +4 -0
- package/src/ui/snapshot-types.ts +3 -0
- package/src/utils/bm25-search.ts +2 -0
- package/src/workflows/workflow-config.ts +3 -0
- package/src/worktree/worktree-manager.ts +94 -0
- package/types/dwf.d.ts +187 -0
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
2
|
import * as fs from "node:fs";
|
|
3
|
+
import * as path from "node:path";
|
|
3
4
|
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
4
5
|
import { appendEvent, scanSequence } from "../state/event-log.ts";
|
|
5
6
|
import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
|
|
6
7
|
import { withRunLockSync } from "../state/locks.ts";
|
|
7
|
-
import { loadRunManifestById,
|
|
8
|
+
import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
|
|
8
9
|
import type { TeamTaskState } from "../state/types.ts";
|
|
9
10
|
import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
|
|
10
11
|
import type { ManifestCache } from "./manifest-cache.ts";
|
|
@@ -215,6 +216,43 @@ function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): voi
|
|
|
215
216
|
// NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
|
|
216
217
|
}
|
|
217
218
|
|
|
219
|
+
/**
 * Milliseconds elapsed between `now` and the last modification of the run's
 * team-level heartbeat file, `<stateRoot>/heartbeat.json`.
 *
 * A small age means the heartbeat file was touched recently, which callers use
 * as evidence that the run is still alive even when other liveness signals
 * (PID check, index entry timestamp) are inconclusive.
 *
 * @param entry - Object carrying the run's `stateRoot` directory.
 * @param now - Reference time in epoch milliseconds.
 * @returns `now - mtimeMs` of the heartbeat file, or `Infinity` when the file
 *   cannot be stat'ed or reports a non-finite mtime.
 */
function heartbeatAgeMs(entry: { stateRoot: string }, now: number): number {
	let modifiedAtMs: number;
	try {
		const heartbeatFile = path.join(entry.stateRoot, "heartbeat.json");
		modifiedAtMs = fs.statSync(heartbeatFile).mtimeMs;
	} catch {
		// Any failure to stat (missing file, unreadable directory, ...) is
		// reported as "no heartbeat at all".
		return Infinity;
	}
	if (!Number.isFinite(modifiedAtMs)) return Infinity;
	return now - modifiedAtMs;
}
|
|
234
|
+
|
|
235
|
+
/**
 * Decide whether a run shows recent signs of life and therefore must NOT be
 * purged. Either signal on its own is enough:
 *   1. the on-disk `manifest.updatedAt` is at most `staleThresholdMs` old, or
 *   2. the team-level `heartbeat.json` is at most `staleThresholdMs` old.
 *
 * The active-run-index entry's own `updatedAt` is deliberately ignored: it is
 * frozen at registration and never refreshed during execution, which used to
 * get long-running legitimate runs purged (destroying their stateRoot and
 * leaving the workflow hung with "Run not found").
 *
 * @param entry - Object carrying the run's `stateRoot` directory.
 * @param manifestUpdatedAt - `updatedAt` string read from the manifest, if any.
 * @param now - Reference time in epoch milliseconds.
 * @param staleThresholdMs - Maximum age (ms) that still counts as "recent".
 * @returns `true` when at least one signal is fresh, otherwise `false`.
 */
function hasRecentLifeEvidence(entry: { stateRoot: string }, manifestUpdatedAt: string | undefined, now: number, staleThresholdMs: number): boolean {
	// Signal 1: manifest timestamp (absent/empty/unparseable values are skipped).
	if (manifestUpdatedAt) {
		const manifestMs = Date.parse(manifestUpdatedAt);
		if (Number.isFinite(manifestMs) && now - manifestMs <= staleThresholdMs) return true;
	}

	// Signal 2: heartbeat file age (Infinity when the file is unavailable).
	const heartbeatAge = heartbeatAgeMs(entry, now);
	return Number.isFinite(heartbeatAge) && heartbeatAge <= staleThresholdMs;
}
|
|
255
|
+
|
|
218
256
|
/**
|
|
219
257
|
* Purge the global active-run-index of entries whose manifest is no longer active.
|
|
220
258
|
*
|
|
@@ -244,7 +282,7 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
244
282
|
}
|
|
245
283
|
|
|
246
284
|
// 3. Read manifest status
|
|
247
|
-
let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
|
|
285
|
+
let manifest: { status?: string; updatedAt?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
|
|
248
286
|
try {
|
|
249
287
|
manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
|
|
250
288
|
} catch {
|
|
@@ -262,46 +300,52 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
262
300
|
continue;
|
|
263
301
|
}
|
|
264
302
|
|
|
265
|
-
// 5. Still "running"
|
|
303
|
+
// 5. Still "running" with an async worker PID — only purge when the worker
|
|
304
|
+
// is actually dead AND there is no recent evidence of life. We must NOT
|
|
305
|
+
// rely solely on `entry.updatedAt` (frozen at registration) nor on a single
|
|
306
|
+
// dead-PID reading: a long-running worker (e.g. a 15-minute explorer)
|
|
307
|
+
// legitimately keeps the run "running" while periodically rewriting the
|
|
308
|
+
// on-disk manifest.updatedAt and heartbeat.json. Falsely purging such a run
|
|
309
|
+
// destroys its stateRoot, and because saveRunTasks() silently no-ops once
|
|
310
|
+
// the state dir is gone, the workflow then hangs permanently at the
|
|
311
|
+
// current task with no recoverable state ("Run not found"). When we do mark
|
|
312
|
+
// a run cancelled here, we KEEP its stateRoot so the run stays queryable/
|
|
313
|
+
// resumable and its diagnostics survive; the finished-run pruner removes
|
|
314
|
+
// the directory later on its normal schedule.
|
|
266
315
|
if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
|
|
267
316
|
const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
|
|
268
|
-
if (!pidAlive) {
|
|
269
|
-
//
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
const
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
285
|
-
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
|
|
286
|
-
saveRunManifest(fullLoaded.manifest);
|
|
287
|
-
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
288
|
-
}
|
|
289
|
-
} catch {
|
|
290
|
-
// Best-effort manifest cleanup
|
|
317
|
+
if (!pidAlive && !hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
|
|
318
|
+
// Dead PID + no recent life evidence → cancel the manifest and unregister
|
|
319
|
+
try {
|
|
320
|
+
const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
|
|
321
|
+
if (fullLoaded) {
|
|
322
|
+
const now_iso = new Date(now).toISOString();
|
|
323
|
+
const repairedTasks = fullLoaded.tasks.map((task) => {
|
|
324
|
+
if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
|
|
325
|
+
return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
|
|
326
|
+
}
|
|
327
|
+
return task;
|
|
328
|
+
});
|
|
329
|
+
saveRunTasks(fullLoaded.manifest, repairedTasks);
|
|
330
|
+
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
331
|
+
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
|
|
332
|
+
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
291
333
|
}
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
purged.push(entry.runId);
|
|
295
|
-
continue;
|
|
334
|
+
} catch {
|
|
335
|
+
// Best-effort manifest cleanup
|
|
296
336
|
}
|
|
337
|
+
unregisterActiveRun(entry.runId);
|
|
338
|
+
purged.push(entry.runId);
|
|
339
|
+
continue;
|
|
297
340
|
}
|
|
298
341
|
}
|
|
299
342
|
|
|
300
|
-
// 6. "running" but no async worker PID — possible orphaned run where
|
|
301
|
-
// was never updated
|
|
343
|
+
// 6. "running" but no async worker PID — possible orphaned run where the
|
|
344
|
+
// manifest was never updated to a terminal status after the worker exited.
|
|
345
|
+
// Uses the same life-evidence corroboration as condition 5; the stateRoot is
|
|
346
|
+
// kept on cancel so the run stays queryable/resumable with diagnostics.
|
|
302
347
|
if (manifest?.status === "running" && manifest.async === undefined) {
|
|
303
|
-
|
|
304
|
-
if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
|
|
348
|
+
if (!hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
|
|
305
349
|
try {
|
|
306
350
|
const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
|
|
307
351
|
if (fullLoaded && fullLoaded.manifest.status === "running") {
|
|
@@ -315,14 +359,12 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
315
359
|
saveRunTasks(fullLoaded.manifest, repairedTasks);
|
|
316
360
|
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
317
361
|
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: no async worker and no manifest update in over " + Math.round(staleThresholdMs / 60000) + " minutes");
|
|
318
|
-
saveRunManifest(fullLoaded.manifest);
|
|
319
362
|
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
320
363
|
}
|
|
321
364
|
} catch {
|
|
322
365
|
// Best-effort
|
|
323
366
|
}
|
|
324
367
|
unregisterActiveRun(entry.runId);
|
|
325
|
-
tryRemoveRunDirectories(entry);
|
|
326
368
|
purged.push(entry.runId);
|
|
327
369
|
continue;
|
|
328
370
|
}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* deterministic-ast.ts — AST-based determinism enforcement for dynamic-workflow scripts (round-13 P0-2).
|
|
3
|
+
*
|
|
4
|
+
* Rejects `Date.now()`, `Math.random()`, and `new Date()` at workflow-load time
|
|
5
|
+
* using a true AST walk (not regex) so that:
|
|
6
|
+
* - Prompts mentioning "Date.now()" as string literals are accepted.
|
|
7
|
+
* - Comments containing "Date.now()" are accepted.
|
|
8
|
+
* - `Date.parse()`, `Date.UTC()`, `Math.floor()`, etc. are accepted (only `now` and `random` are blocked).
|
|
9
|
+
*
|
|
10
|
+
* Adapted from pi-dynamic-workflows/src/workflow.ts (MIT) — see NOTICE.md.
|
|
11
|
+
*
|
|
12
|
+
* The walker uses acorn's parse() with permissive flags (allowAwaitOutsideFunction,
|
|
13
|
+
* allowReturnOutsideFunction) so we don't reject perfectly valid workflow scripts
|
|
14
|
+
* that contain top-level `await` or `return`.
|
|
15
|
+
*
|
|
16
|
+
* On parse error, this function returns silently: jiti will surface a clearer
|
|
17
|
+
* parse error downstream. We don't double-report parse errors.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { parse } from "acorn";
|
|
21
|
+
|
|
22
|
+
/** User-facing explanation attached to every {@link DeterminismError}. */
const NONDETERMINISM_ERROR = [
	"Workflow scripts must be deterministic: Date.now()/Math.random()/new Date() are unavailable.",
	"These introduce non-reproducible behavior across runs.",
	"Use ctx.vars for cached state, or pass a fixed seed via ctx.setArgs().",
	"To bypass this check (escape hatch), set PI_CREW_DWF_SKIP_DETERMINISM_CHECK=1.",
].join(" ");

/**
 * Raised when a workflow script contains a call that the determinism check
 * forbids. Carries a fixed message and the name `"DeterminismError"`.
 */
export class DeterminismError extends Error {
	constructor() {
		super(NONDETERMINISM_ERROR);
		// Give the error a recognizable name instead of the generic "Error".
		this.name = "DeterminismError";
	}
}
|
|
31
|
+
|
|
32
|
+
/**
 * Parse `script` with acorn and reject it if the resulting AST contains a
 * forbidden non-deterministic call.
 *
 * The parser is configured permissively (module source type, latest
 * ECMAScript, top-level `await` and `return` allowed).
 *
 * @param script - Source text of the workflow script.
 * @throws DeterminismError when the AST walk finds a forbidden call.
 *
 * If the script does not parse, the function returns without throwing: the
 * parse failure is left for the downstream loader to report, so the same
 * error is not surfaced twice.
 */
export function assertDeterministicScript(script: string): void {
	let program: AstNode | undefined;
	try {
		program = parse(script, {
			ecmaVersion: "latest",
			sourceType: "module",
			allowAwaitOutsideFunction: true,
			allowReturnOutsideFunction: true,
			ranges: false,
		}) as unknown as AstNode;
	} catch {
		// Unparseable source: nothing to walk, and not ours to report.
		program = undefined;
	}
	if (program === undefined) return;
	assertDeterministicAst(program);
}
|
|
53
|
+
|
|
54
|
+
/**
 * Report whether the determinism check should run.
 *
 * The check is on by default. Setting the environment variable
 * `PI_CREW_DWF_SKIP_DETERMINISM_CHECK` to exactly `"1"` is the escape hatch
 * that turns it off (e.g. for workflows that legitimately depend on time or
 * randomness, such as randomized benchmark scripts). Any other value, or no
 * value at all, leaves the check enabled.
 *
 * @returns `false` only when the skip variable equals `"1"`.
 */
export function isDeterminismCheckEnabled(): boolean {
	const { PI_CREW_DWF_SKIP_DETERMINISM_CHECK: skipFlag } = process.env;
	return skipFlag !== "1";
}
|
|
62
|
+
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// AST walker
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
/**
 * Minimal structural view of a parsed AST node as used by the walker: a
 * string `type` tag plus arbitrary further properties, which are treated as
 * `unknown` and must be narrowed before use.
 */
type AstNode = {
	type: string;
	[key: string]: unknown;
};
|
|
71
|
+
|
|
72
|
+
/**
 * Narrow an arbitrary value to an {@link AstNode}.
 *
 * @param value - Anything found while walking the tree.
 * @returns The same object when it is a non-null object whose `type` property
 *   is a string; `undefined` for every other value (primitives, `null`,
 *   arrays, objects without a string `type`).
 */
function asAstNode(value: unknown): AstNode | undefined {
	const looksLikeNode =
		typeof value === "object" &&
		value !== null &&
		typeof (value as Record<string, unknown>).type === "string";
	return looksLikeNode ? (value as AstNode) : undefined;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Collect the direct child nodes of `node`.
 *
 * Every property value of `node` is inspected in property order: array values
 * contribute each element that is itself a node, and non-array values
 * contribute themselves when they are a node. Everything else is ignored.
 *
 * @param node - The parent node.
 * @returns The child nodes, in the order they were encountered.
 */
function astChildren(node: AstNode): AstNode[] {
	return Object.values(node)
		.flatMap((value) => (Array.isArray(value) ? value : [value]))
		.map((candidate) => asAstNode(candidate))
		.filter((child): child is AstNode => child !== undefined);
}
|
|
94
|
+
|
|
95
|
+
/**
 * Recursively walk the tree rooted at `node` (parent before children) and
 * fail on the first forbidden construct.
 *
 * @param node - Root of the (sub)tree to check.
 * @throws DeterminismError when `node` or any descendant is a `Date.now()`
 *   call, a `Math.random()` call, or a `new Date` expression.
 */
function assertDeterministicAst(node: AstNode): void {
	const forbidden = isDateNowCall(node) || isMathRandomCall(node) || isNewDateExpression(node);
	if (forbidden) {
		throw new DeterminismError();
	}
	const children = astChildren(node);
	for (const descendant of children) {
		assertDeterministicAst(descendant);
	}
}
|
|
101
|
+
|
|
102
|
+
/** True when `node` is a call expression whose callee is `Date.now`. */
function isDateNowCall(node: AstNode): boolean {
	if (node.type !== "CallExpression") return false;
	return isMemberExpression(node, "callee", "Date", "now");
}
|
|
105
|
+
|
|
106
|
+
/** True when `node` is a call expression whose callee is `Math.random`. */
function isMathRandomCall(node: AstNode): boolean {
	if (node.type !== "CallExpression") return false;
	return isMemberExpression(node, "callee", "Math", "random");
}
|
|
109
|
+
|
|
110
|
+
/**
 * True when `node` is a `new` expression whose callee is the bare identifier
 * `Date`. The constructor arguments are not inspected, so any `new Date(...)`
 * matches regardless of what is passed.
 */
function isNewDateExpression(node: AstNode): boolean {
	const callee = node.type === "NewExpression" ? asAstNode(node.callee) : undefined;
	return callee !== undefined && callee.type === "Identifier" && callee.name === "Date";
}
|
|
115
|
+
|
|
116
|
+
/**
 * Check whether `node[childKey]` is a member access of the exact shape
 * `objectName.propertyName`.
 *
 * The object side must be a plain identifier named `objectName`. The property
 * side may be written either as a static identifier (`Date.now`) or as a
 * computed expression that resolves to a static string (`Date["now"]`).
 *
 * @param node - Node holding the candidate under `childKey`.
 * @param childKey - Property of `node` to inspect (usually `"callee"`).
 * @param objectName - Required identifier name of the accessed object.
 * @param propertyName - Required name of the accessed property.
 * @returns `true` only when every part matches.
 */
function isMemberExpression(node: AstNode, childKey: string, objectName: string, propertyName: string): boolean {
	const member = asAstNode(node[childKey]);
	if (member === undefined || member.type !== "MemberExpression") return false;

	const receiver = asAstNode(member.object);
	const receiverMatches = receiver !== undefined && receiver.type === "Identifier" && receiver.name === objectName;
	if (!receiverMatches) return false;

	return propertyNameOf(member) === propertyName;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Resolve the property name of a member-expression node.
 *
 * @param node - A member-expression node (its `property` and `computed`
 *   fields are read).
 * @returns The identifier name for a non-computed identifier property;
 *   otherwise whatever static string the property expression resolves to; or
 *   `undefined` when there is no property node or it cannot be resolved.
 */
function propertyNameOf(node: AstNode): string | undefined {
	const property = asAstNode(node.property);
	if (property === undefined) return undefined;

	const isPlainIdentifier = node.computed !== true && property.type === "Identifier";
	return isPlainIdentifier ? (property.name as string | undefined) : staticStringOf(property);
}
|
|
138
|
+
|
|
139
|
+
/**
 * Try to evaluate `node` to a string using only information present in the
 * tree itself.
 *
 * Supported forms:
 *   - a string literal,
 *   - a template literal without any `${...}` expressions,
 *   - a `+` concatenation whose two operands are both resolvable.
 *
 * @param node - Expression node to evaluate, or `undefined`.
 * @returns The resolved string, or `undefined` for anything else.
 */
function staticStringOf(node: AstNode | undefined): string | undefined {
	if (!node) return undefined;

	switch (node.type) {
		case "Literal":
			return typeof node.value === "string" ? node.value : undefined;

		case "TemplateLiteral": {
			const { expressions, quasis } = node;
			// Any interpolation makes the value dynamic.
			if (Array.isArray(expressions) && expressions.length > 0) return undefined;
			if (!Array.isArray(quasis)) return undefined;
			let text = "";
			for (const element of quasis) {
				const chunk = asAstNode(element)?.value as { cooked?: string; raw?: string } | undefined;
				// Prefer the cooked text, fall back to raw, then to nothing.
				text += chunk?.cooked ?? chunk?.raw ?? "";
			}
			return text;
		}

		case "BinaryExpression": {
			if (node.operator !== "+") return undefined;
			const left = staticStringOf(asAstNode(node.left));
			const right = staticStringOf(asAstNode(node.right));
			return left !== undefined && right !== undefined ? left + right : undefined;
		}

		default:
			return undefined;
	}
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* dwf-state-store.ts — Persistent checkpoint state for dynamic-workflow runs (P2-3, round-18).
|
|
3
|
+
*
|
|
4
|
+
* Modeled on GoalStore (goal-state-store.ts) and FileCheckpointStore (checkpoint.ts),
|
|
5
|
+
* but scoped to a single run's stateRoot (which is already <crewRoot>/state/runs/<runId>).
|
|
6
|
+
*
|
|
7
|
+
* Stores DwfCheckpointState as atomic JSON at <stateRoot>/dwf-checkpoint.json.
|
|
8
|
+
* atomicWriteJson (temp + rename + fsync) guarantees either the old or the new file,
|
|
9
|
+
* never a partial write — safe across crashes.
|
|
10
|
+
*
|
|
11
|
+
* Resume semantics (round-18): the runner loads a checkpoint on run start and hydrates
|
|
12
|
+
* ctx.vars/phases/logs from it; on clean completion the runner deletes it. A missing or
|
|
13
|
+
* corrupt checkpoint is treated as a fresh run (load() returns undefined). If a crash
|
|
14
|
+
* happens mid-agent, that agent simply re-runs from scratch on resume — agent results
|
|
15
|
+
* are expected to be idempotent-ish.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { mkdirSync, existsSync, readFileSync, unlinkSync } from "node:fs";
|
|
19
|
+
import { dirname } from "node:path";
|
|
20
|
+
import { atomicWriteJson } from "../state/atomic-write.ts";
|
|
21
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
22
|
+
|
|
23
|
+
/** Serialized checkpoint of a dynamic-workflow run, stored as JSON. */
export interface DwfCheckpointState {
	/** Identifier of the run this checkpoint belongs to. */
	runId: string;
	/** Snapshot of the workflow's variables. */
	vars: Record<string, unknown>;
	/** Phase names recorded for the run. */
	phases: string[];
	/** Phase that was current when the checkpoint was taken, if any. */
	currentPhase: string | undefined;
	/**
	 * Capped copy of the log lines (≤1000); the events log (dwf.log) is the
	 * durable source of truth.
	 */
	logs: string[];
	/** Budget accumulator (round-14 P1-2). */
	spent: number;
	/** Agent counter recorded with the checkpoint. */
	agentCount: number;
	/** Timestamp string of the last save. */
	updatedAt: string;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Structural check for a parsed checkpoint file.
 *
 * Accepts a value only when it is a non-array object with a string `runId`
 * and well-formed containers for the fields that are hydrated on resume:
 * `vars` (non-null, non-array object), `phases` (array) and `logs` (array).
 * Scalar bookkeeping fields are intentionally not enforced here.
 */
function isDwfCheckpointShape(value: unknown): value is DwfCheckpointState {
	if (typeof value !== "object" || value === null || Array.isArray(value)) return false;
	const candidate = value as Record<string, unknown>;
	if (typeof candidate.runId !== "string") return false;
	const { vars, phases, logs } = candidate;
	const varsOk = typeof vars === "object" && vars !== null && !Array.isArray(vars);
	return varsOk && Array.isArray(phases) && Array.isArray(logs);
}

/**
 * DwfStore — atomic CRUD for a single run's DWF checkpoint.
 *
 * Concurrency: writes go through atomicWriteJson. The DWF runner is the sole
 * writer during a run; `team resume` loads the checkpoint read-only before the
 * script re-executes. No file-lock is taken here because only one runner owns
 * a run's stateRoot at a time (run locks protect manifest transitions
 * elsewhere).
 *
 * Note: the constructor takes the run's stateRoot directly (NOT cwd + runId)
 * to avoid a double-nesting bug — stateRoot is already
 * <crewRoot>/state/runs/<runId>, so the checkpoint lands at
 * <crewRoot>/state/runs/<runId>/dwf-checkpoint.json. This mirrors
 * FileCheckpointStore (checkpoint.ts: constructor(stateRoot)).
 */
export class DwfStore {
	private readonly stateRoot: string;

	/** @param stateRoot - The run's own state directory. */
	constructor(stateRoot: string) {
		this.stateRoot = stateRoot;
	}

	/** Absolute location of the checkpoint file inside the run's stateRoot. */
	private get path(): string {
		return `${this.stateRoot}/dwf-checkpoint.json`;
	}

	/**
	 * Load the checkpoint for this run's stateRoot.
	 *
	 * @returns The checkpoint, or `undefined` (meaning "fresh run") when the
	 *   file is missing, unreadable, not valid JSON, or does not have the
	 *   expected structure (string `runId`, object `vars`, array `phases`,
	 *   array `logs`). Never throws.
	 */
	load(): DwfCheckpointState | undefined {
		const path = this.path;
		try {
			if (!existsSync(path)) return undefined;
			const parsed: unknown = JSON.parse(readFileSync(path, "utf-8"));
			// Corrupt-guard: reject anything that would break hydration instead
			// of handing a malformed object to the runner as a valid checkpoint.
			return isDwfCheckpointShape(parsed) ? parsed : undefined;
		} catch {
			return undefined;
		}
	}

	/**
	 * Atomically persist a checkpoint state. Stamps `updatedAt` with the
	 * current time (callers need not set it) and creates the state directory
	 * if it does not exist.
	 *
	 * @param state - Checkpoint to write.
	 * @throws Re-throws any write failure after logging it.
	 */
	save(state: DwfCheckpointState): void {
		const path = this.path;
		const next = { ...state, updatedAt: new Date().toISOString() };
		try {
			mkdirSync(dirname(path), { recursive: true });
			atomicWriteJson(path, next);
		} catch (error) {
			logInternalError("dwf-state-store.save", error, `runId=${state.runId}`);
			throw error;
		}
	}

	/** Remove the checkpoint file (after a clean completion). Best-effort; never throws. */
	delete(): void {
		const path = this.path;
		try {
			if (!existsSync(path)) return;
			unlinkSync(path);
		} catch (error) {
			// Failure to clean up is logged but must not fail the run.
			logInternalError("dwf-state-store.delete", error);
		}
	}
}
|