pi-crew 0.9.4 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +592 -0
  2. package/README.md +55 -3
  3. package/docs/HARNESS_BACKLOG.md +51 -3
  4. package/docs/dynamic-workflows.md +315 -2
  5. package/docs/fix-plan-disabletools-exit-null.md +219 -0
  6. package/docs/troubleshooting.md +102 -0
  7. package/package.json +8 -2
  8. package/src/extension/command-completions.ts +1 -0
  9. package/src/extension/crew-shortcuts.ts +1 -0
  10. package/src/extension/register.ts +2 -0
  11. package/src/extension/registration/commands.ts +3 -0
  12. package/src/extension/team-tool/doctor.ts +14 -0
  13. package/src/extension/team-tool/goal.ts +1 -0
  14. package/src/extension/team-tool/run.ts +4 -0
  15. package/src/runtime/background-runner.ts +24 -2
  16. package/src/runtime/chain-runner.ts +1 -0
  17. package/src/runtime/child-pi.ts +101 -10
  18. package/src/runtime/crash-recovery.ts +78 -36
  19. package/src/runtime/deterministic-ast.ts +161 -0
  20. package/src/runtime/dwf-state-store.ts +97 -0
  21. package/src/runtime/dynamic-workflow-context.ts +381 -7
  22. package/src/runtime/dynamic-workflow-runner.ts +94 -2
  23. package/src/runtime/goal-loop-runner.ts +2 -0
  24. package/src/runtime/live-session-runtime.ts +1 -0
  25. package/src/runtime/model-scope.ts +1 -0
  26. package/src/runtime/peer-dep.ts +1 -0
  27. package/src/runtime/pi-args.ts +11 -0
  28. package/src/runtime/resilient-edit.ts +1 -0
  29. package/src/runtime/result-extractor.ts +72 -7
  30. package/src/runtime/task-runner.ts +1 -0
  31. package/src/runtime/team-runner.ts +8 -3
  32. package/src/runtime/zombie-scanner.ts +297 -0
  33. package/src/schema/team-tool-schema.ts +28 -0
  34. package/src/state/contracts.ts +1 -0
  35. package/src/state/hook-instinct-bridge.ts +3 -0
  36. package/src/state/state-store.ts +3 -0
  37. package/src/state/types.ts +9 -0
  38. package/src/ui/dashboard-panes/progress-pane.ts +5 -0
  39. package/src/ui/dwf-phase-display.ts +151 -0
  40. package/src/ui/run-snapshot-cache.ts +4 -0
  41. package/src/ui/snapshot-types.ts +3 -0
  42. package/src/utils/bm25-search.ts +2 -0
  43. package/src/workflows/workflow-config.ts +3 -0
  44. package/src/worktree/worktree-manager.ts +94 -0
  45. package/types/dwf.d.ts +187 -0
@@ -1,10 +1,11 @@
1
1
  import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
2
2
  import * as fs from "node:fs";
3
+ import * as path from "node:path";
3
4
  import type { MetricRegistry } from "../observability/metric-registry.ts";
4
5
  import { appendEvent, scanSequence } from "../state/event-log.ts";
5
6
  import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
7
  import { withRunLockSync } from "../state/locks.ts";
7
- import { loadRunManifestById, saveRunManifest, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
8
+ import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
8
9
  import type { TeamTaskState } from "../state/types.ts";
9
10
  import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
10
11
  import type { ManifestCache } from "./manifest-cache.ts";
@@ -215,6 +216,43 @@ function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): voi
215
216
  // NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
216
217
  }
217
218
 
219
/**
 * Compute how long ago (in ms, relative to `now`) the run's team-level
 * heartbeat file `<stateRoot>/heartbeat.json` was last modified.
 *
 * The team-runner rewrites that file periodically while a workflow executes
 * (startTeamHeartbeat), so a small age is strong evidence the run is alive even
 * when the recorded PID check is inconclusive or the active-run-index entry's
 * `updatedAt` was frozen at registration.
 *
 * @param entry - Active-run-index entry; only `stateRoot` is read.
 * @param now - Reference timestamp in epoch milliseconds.
 * @returns `now - mtimeMs`, or `Infinity` when the file cannot be stat'ed or
 *   its mtime is not a finite number.
 */
function heartbeatAgeMs(entry: { stateRoot: string }, now: number): number {
  let modifiedAtMs: number;
  try {
    modifiedAtMs = fs.statSync(path.join(entry.stateRoot, "heartbeat.json")).mtimeMs;
  } catch {
    // Missing or unreadable heartbeat file: treat as "no heartbeat at all".
    return Infinity;
  }
  if (!Number.isFinite(modifiedAtMs)) return Infinity;
  return now - modifiedAtMs;
}
234
+
235
/**
 * Decide whether a run shows recent signs of life and therefore must NOT be
 * purged. Either signal alone is enough:
 *   - the on-disk `manifest.updatedAt` (rewritten on every task transition /
 *     status change) is no older than `staleThresholdMs`, or
 *   - the team-level `heartbeat.json` is no older than `staleThresholdMs`.
 *
 * `entry.updatedAt` is deliberately ignored: it is frozen at registration and
 * never refreshed during execution, which previously caused long-running
 * legitimate runs to be falsely purged — destroying their stateRoot and,
 * because saveRunTasks() silently no-ops once the state dir is gone, hanging
 * the workflow permanently at the current task ("Run not found").
 *
 * @param entry - Active-run-index entry; only `stateRoot` is read.
 * @param manifestUpdatedAt - `updatedAt` string from the on-disk manifest, if any.
 * @param now - Reference timestamp in epoch milliseconds.
 * @param staleThresholdMs - Maximum age (inclusive) that still counts as fresh.
 * @returns `true` when at least one signal is fresh, otherwise `false`.
 */
function hasRecentLifeEvidence(entry: { stateRoot: string }, manifestUpdatedAt: string | undefined, now: number, staleThresholdMs: number): boolean {
  // An absent or unparseable timestamp yields NaN and is simply not counted.
  const manifestMs = manifestUpdatedAt ? Date.parse(manifestUpdatedAt) : NaN;
  const manifestIsFresh = Number.isFinite(manifestMs) && now - manifestMs <= staleThresholdMs;
  if (manifestIsFresh) return true;

  // Only touch the filesystem when the manifest did not already prove liveness.
  const heartbeatAge = heartbeatAgeMs(entry, now);
  return Number.isFinite(heartbeatAge) && heartbeatAge <= staleThresholdMs;
}
255
+
218
256
  /**
219
257
  * Purge the global active-run-index of entries whose manifest is no longer active.
220
258
  *
@@ -244,7 +282,7 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
244
282
  }
245
283
 
246
284
  // 3. Read manifest status
247
- let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
285
+ let manifest: { status?: string; updatedAt?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
248
286
  try {
249
287
  manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
250
288
  } catch {
@@ -262,46 +300,52 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
262
300
  continue;
263
301
  }
264
302
 
265
- // 5. Still "running" check if worker PID is dead and no heartbeat
303
+ // 5. Still "running" with an async worker PID — only purge when the worker
304
+ // is actually dead AND there is no recent evidence of life. We must NOT
305
+ // rely solely on `entry.updatedAt` (frozen at registration) nor on a single
306
+ // dead-PID reading: a long-running worker (e.g. a 15-minute explorer)
307
+ // legitimately keeps the run "running" while periodically rewriting the
308
+ // on-disk manifest.updatedAt and heartbeat.json. Falsely purging such a run
309
+ // destroys its stateRoot, and because saveRunTasks() silently no-ops once
310
+ // the state dir is gone, the workflow then hangs permanently at the
311
+ // current task with no recoverable state ("Run not found"). When we do mark
312
+ // a run cancelled here, we KEEP its stateRoot so the run stays queryable/
313
+ // resumable and its diagnostics survive; the finished-run pruner removes
314
+ // the directory later on its normal schedule.
266
315
  if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
267
316
  const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
268
- if (!pidAlive) {
269
- // Check age if manifest hasn't been updated in > threshold, it's stale
270
- const updatedAt = new Date(entry.updatedAt).getTime();
271
- if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
272
- // Dead PID + stale update → cancel the manifest and unregister
273
- try {
274
- const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
275
- if (fullLoaded) {
276
- const now_iso = new Date(now).toISOString();
277
- const repairedTasks = fullLoaded.tasks.map((task) => {
278
- if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
279
- return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
280
- }
281
- return task;
282
- });
283
- saveRunTasks(fullLoaded.manifest, repairedTasks);
284
- for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
285
- updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
286
- saveRunManifest(fullLoaded.manifest);
287
- void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
288
- }
289
- } catch {
290
- // Best-effort manifest cleanup
317
+ if (!pidAlive && !hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
318
+ // Dead PID + no recent life evidence → cancel the manifest and unregister
319
+ try {
320
+ const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
321
+ if (fullLoaded) {
322
+ const now_iso = new Date(now).toISOString();
323
+ const repairedTasks = fullLoaded.tasks.map((task) => {
324
+ if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
325
+ return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
326
+ }
327
+ return task;
328
+ });
329
+ saveRunTasks(fullLoaded.manifest, repairedTasks);
330
+ for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
331
+ updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
332
+ void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
291
333
  }
292
- unregisterActiveRun(entry.runId);
293
- tryRemoveRunDirectories(entry);
294
- purged.push(entry.runId);
295
- continue;
334
+ } catch {
335
+ // Best-effort manifest cleanup
296
336
  }
337
+ unregisterActiveRun(entry.runId);
338
+ purged.push(entry.runId);
339
+ continue;
297
340
  }
298
341
  }
299
342
 
300
- // 6. "running" but no async worker PID — possible orphaned run where manifest
301
- // was never updated after worker exit. Check updatedAt age.
343
+ // 6. "running" but no async worker PID — possible orphaned run where the
344
+ // manifest was never updated to a terminal status after the worker exited.
345
+ // Uses the same life-evidence corroboration as condition 5; the stateRoot is
346
+ // kept on cancel so the run stays queryable/resumable with diagnostics.
302
347
  if (manifest?.status === "running" && manifest.async === undefined) {
303
- const updatedAt = new Date(entry.updatedAt).getTime();
304
- if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
348
+ if (!hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
305
349
  try {
306
350
  const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
307
351
  if (fullLoaded && fullLoaded.manifest.status === "running") {
@@ -315,14 +359,12 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
315
359
  saveRunTasks(fullLoaded.manifest, repairedTasks);
316
360
  for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
317
361
  updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: no async worker and no manifest update in over " + Math.round(staleThresholdMs / 60000) + " minutes");
318
- saveRunManifest(fullLoaded.manifest);
319
362
  void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
320
363
  }
321
364
  } catch {
322
365
  // Best-effort
323
366
  }
324
367
  unregisterActiveRun(entry.runId);
325
- tryRemoveRunDirectories(entry);
326
368
  purged.push(entry.runId);
327
369
  continue;
328
370
  }
@@ -0,0 +1,161 @@
1
+ /**
2
+ * deterministic-ast.ts — AST-based determinism enforcement for dynamic-workflow scripts (round-13 P0-2).
3
+ *
4
+ * Rejects `Date.now()`, `Math.random()`, and `new Date()` at workflow-load time
5
+ * using a true AST walk (not regex) so that:
6
+ * - Prompts mentioning "Date.now()" as string literals are accepted.
7
+ * - Comments containing "Date.now()" are accepted.
8
+ * - `Date.parse()`, `Date.UTC()`, `Math.floor()`, etc. are accepted (only `now` and `random` are blocked).
9
+ *
10
+ * Adapted from pi-dynamic-workflows/src/workflow.ts (MIT) — see NOTICE.md.
11
+ *
12
+ * The walker uses acorn's parse() with permissive flags (allowAwaitOutsideFunction,
13
+ * allowReturnOutsideFunction) so we don't reject perfectly valid workflow scripts
14
+ * that contain top-level `await` or `return`.
15
+ *
16
+ * On parse error, this function returns silently: jiti will surface a clearer
17
+ * parse error downstream. We don't double-report parse errors.
18
+ */
19
+
20
+ import { parse } from "acorn";
21
+
22
/** User-facing explanation attached to every DeterminismError. */
const NONDETERMINISM_ERROR = [
  "Workflow scripts must be deterministic: Date.now()/Math.random()/new Date() are unavailable.",
  "These introduce non-reproducible behavior across runs.",
  "Use ctx.vars for cached state, or pass a fixed seed via ctx.setArgs().",
  "To bypass this check (escape hatch), set PI_CREW_DWF_SKIP_DETERMINISM_CHECK=1.",
].join(" ");

/**
 * Thrown when a workflow script contains a call that the determinism check
 * rejects. Takes no arguments: the message is always NONDETERMINISM_ERROR and
 * `name` is always "DeterminismError".
 */
export class DeterminismError extends Error {
  constructor() {
    super(NONDETERMINISM_ERROR);
    this.name = "DeterminismError";
  }
}
31
+
32
/**
 * Parse `script` with acorn and reject it if the resulting AST contains a
 * non-deterministic call.
 *
 * The parser is configured permissively (module source type, top-level `await`
 * and top-level `return` allowed) so otherwise valid workflow scripts are not
 * rejected here. If parsing fails the function returns without checking
 * anything; the parse error is left for jiti to report downstream.
 *
 * NOTE(review): because a parse failure skips the check entirely, any script
 * acorn cannot parse is never inspected. Presumably that includes scripts with
 * TypeScript-only syntax — confirm whether such scripts can reach this function.
 *
 * @param script - Workflow script source text.
 * @throws DeterminismError on the first offending node found by the AST walk.
 */
export function assertDeterministicScript(script: string): void {
  let program: AstNode | undefined;
  try {
    program = parse(script, {
      ecmaVersion: "latest",
      sourceType: "module",
      allowAwaitOutsideFunction: true,
      allowReturnOutsideFunction: true,
      ranges: false,
    }) as unknown as AstNode;
  } catch {
    // Parse errors are handled by jiti downstream — don't double-report.
    program = undefined;
  }
  if (program !== undefined) assertDeterministicAst(program);
}
53
+
54
/**
 * Report whether the determinism check should run.
 *
 * Escape hatch: the check is disabled only when the environment variable
 * PI_CREW_DWF_SKIP_DETERMINISM_CHECK is exactly the string "1". Any other
 * value (or no value) leaves it enabled. Intended for power users whose
 * workflows legitimately depend on time/random (e.g. randomized benchmarks).
 *
 * @returns `false` when the skip flag is "1", otherwise `true`.
 */
export function isDeterminismCheckEnabled(): boolean {
  const skipFlag = process.env["PI_CREW_DWF_SKIP_DETERMINISM_CHECK"];
  return skipFlag !== "1";
}
62
+
63
+ // ---------------------------------------------------------------------------
64
+ // AST walker
65
+ // ---------------------------------------------------------------------------
66
+
67
/** Minimal structural view of a parsed node: a string `type` plus arbitrary other fields. */
interface AstNode {
  type: string;
  [key: string]: unknown;
}

/**
 * Treat `value` as an AST node if it is a non-null object whose `type` is a
 * string; otherwise return `undefined`. The same object is returned, not a copy.
 */
function asAstNode(value: unknown): AstNode | undefined {
  const looksLikeNode =
    Boolean(value) && typeof value === "object" && typeof (value as Record<string, unknown>).type === "string";
  return looksLikeNode ? (value as AstNode) : undefined;
}

/**
 * Collect the direct child nodes of `node`, in property order.
 *
 * Each property value is inspected: array values contribute every element that
 * passes `asAstNode`; non-array values contribute themselves if they pass.
 * Anything else (strings, numbers, null, plain objects without `type`) is skipped.
 */
function astChildren(node: AstNode): AstNode[] {
  return Object.values(node).flatMap((value) => {
    const candidates: unknown[] = Array.isArray(value) ? value : [value];
    return candidates
      .map((candidate) => asAstNode(candidate))
      .filter((child): child is AstNode => child !== undefined);
  });
}
94
+
95
/**
 * Walk the tree rooted at `node` and throw as soon as any node is a
 * `Date.now()` call, a `Math.random()` call, or a `new Date(...)` expression.
 *
 * The walk uses an explicit work stack instead of recursion so that a very
 * deeply nested tree (for example an extremely long member/call chain) cannot
 * exhaust the JavaScript call stack and surface as a RangeError instead of a
 * clean pass/fail. Visit order is irrelevant to callers: the thrown error
 * carries no node-specific information.
 *
 * @param node - Root of the (sub)tree to check.
 * @throws DeterminismError if any visited node matches one of the three patterns.
 */
function assertDeterministicAst(node: AstNode): void {
  const pending: AstNode[] = [node];
  for (let current = pending.pop(); current !== undefined; current = pending.pop()) {
    if (isDateNowCall(current) || isMathRandomCall(current) || isNewDateExpression(current)) {
      throw new DeterminismError();
    }
    // Push one at a time: spreading a huge child list into push() could itself overflow.
    for (const child of astChildren(current)) pending.push(child);
  }
}
101
+
102
/** True when `node` is a CallExpression whose callee is the member expression `Date.now`. */
function isDateNowCall(node: AstNode): boolean {
  if (node.type !== "CallExpression") return false;
  return isMemberExpression(node, "callee", "Date", "now");
}
105
+
106
/** True when `node` is a CallExpression whose callee is the member expression `Math.random`. */
function isMathRandomCall(node: AstNode): boolean {
  if (node.type !== "CallExpression") return false;
  return isMemberExpression(node, "callee", "Math", "random");
}
109
+
110
/**
 * True when `node` is a NewExpression whose callee is the bare identifier `Date`.
 * Constructor arguments are not inspected, so `new Date(0)` matches just like
 * `new Date()`.
 */
function isNewDateExpression(node: AstNode): boolean {
  const constructed = node.type === "NewExpression" ? asAstNode(node.callee) : undefined;
  if (constructed === undefined) return false;
  return constructed.type === "Identifier" && constructed.name === "Date";
}
115
+
116
/**
 * Check whether `node[childKey]` is a MemberExpression of the form
 * `objectName.propertyName`.
 *
 * The object side must be a plain Identifier named `objectName`. The property
 * side is resolved by `propertyNameOf`, so both `Date.now` and statically
 * resolvable computed forms such as `Date["now"]` match.
 *
 * @param node - Parent node to look inside.
 * @param childKey - Property of `node` holding the candidate (usually "callee").
 * @param objectName - Required identifier name on the object side.
 * @param propertyName - Required resolved property name.
 */
function isMemberExpression(node: AstNode, childKey: string, objectName: string, propertyName: string): boolean {
  const member = asAstNode(node[childKey]);
  if (member === undefined || member.type !== "MemberExpression") return false;

  const receiver = asAstNode(member.object);
  if (receiver === undefined) return false;

  return receiver.type === "Identifier" && receiver.name === objectName && propertyNameOf(member) === propertyName;
}
128
+
129
/**
 * Resolve the property name of a member-expression node.
 *
 * A non-computed Identifier property (`obj.name`) yields its `name`. Every
 * other property node is handed to `staticStringOf`, which succeeds only for
 * statically known strings (`obj["name"]`, `obj[`name`]`, `obj["na" + "me"]`).
 *
 * @returns The resolved name, or `undefined` when there is no property node or
 *   it cannot be resolved statically.
 */
function propertyNameOf(node: AstNode): string | undefined {
  const property = asAstNode(node.property);
  if (property === undefined) return undefined;

  const isPlainIdentifier = node.computed !== true && property.type === "Identifier";
  return isPlainIdentifier ? (property.name as string | undefined) : staticStringOf(property);
}
138
+
139
/**
 * Evaluate `node` to a string when its value is statically known.
 *
 * Supported shapes:
 *   - a Literal whose value is a string;
 *   - a TemplateLiteral with no `${...}` expressions (quasis are concatenated,
 *     preferring the cooked text and falling back to raw, then "");
 *   - a `+` BinaryExpression whose two operands both resolve recursively.
 *
 * @returns The resolved string, or `undefined` for anything else (including an
 *   absent node).
 */
function staticStringOf(node: AstNode | undefined): string | undefined {
  if (!node) return undefined;

  switch (node.type) {
    case "Literal":
      return typeof node.value === "string" ? node.value : undefined;

    case "TemplateLiteral": {
      const interpolations = node.expressions;
      if (Array.isArray(interpolations) && interpolations.length > 0) return undefined;
      const quasis = node.quasis;
      if (!Array.isArray(quasis)) return undefined;
      const pieces: string[] = [];
      for (const element of quasis) {
        const text = asAstNode(element)?.value as { cooked?: string; raw?: string } | undefined;
        pieces.push(text?.cooked ?? text?.raw ?? "");
      }
      return pieces.join("");
    }

    case "BinaryExpression": {
      if (node.operator !== "+") return undefined;
      const lhs = staticStringOf(asAstNode(node.left));
      const rhs = staticStringOf(asAstNode(node.right));
      return lhs !== undefined && rhs !== undefined ? lhs + rhs : undefined;
    }

    default:
      return undefined;
  }
}
@@ -0,0 +1,97 @@
1
+ /**
2
+ * dwf-state-store.ts — Persistent checkpoint state for dynamic-workflow runs (P2-3, round-18).
3
+ *
4
+ * Modeled on GoalStore (goal-state-store.ts) and FileCheckpointStore (checkpoint.ts),
5
+ * but scoped to a single run's stateRoot (which is already <crewRoot>/state/runs/<runId>).
6
+ *
7
+ * Stores DwfCheckpointState as atomic JSON at <stateRoot>/dwf-checkpoint.json.
8
+ * atomicWriteJson (temp + rename + fsync) guarantees either the old or the new file,
9
+ * never a partial write — safe across crashes.
10
+ *
11
+ * Resume semantics (round-18): the runner loads a checkpoint on run start and hydrates
12
+ * ctx.vars/phases/logs from it; on clean completion the runner deletes it. A missing or
13
+ * corrupt checkpoint is treated as a fresh run (load() returns undefined). If a crash
14
+ * happens mid-agent, that agent simply re-runs from scratch on resume — agent results
15
+ * are expected to be idempotent-ish.
16
+ */
17
+
18
+ import { mkdirSync, existsSync, readFileSync, unlinkSync } from "node:fs";
19
+ import { dirname } from "node:path";
20
+ import { atomicWriteJson } from "../state/atomic-write.ts";
21
+ import { logInternalError } from "../utils/internal-error.ts";
22
+
23
/** Shape of the JSON document persisted as a run's DWF checkpoint. */
export interface DwfCheckpointState {
  /** Identifier of the run; `DwfStore.load` requires this to be a string. */
  runId: string;
  /** Snapshot of the workflow's variables. */
  vars: Record<string, unknown>;
  /** Phase names recorded for the run. */
  phases: string[];
  /** Phase that was current when the checkpoint was taken, if any. */
  currentPhase: string | undefined;
  /** Capped copy (≤1000); the events log (dwf.log) is the durable source of truth. */
  logs: string[];
  /** Budget accumulator (round-14 P1-2). */
  spent: number;
  /** Agent counter carried across resume. */
  agentCount: number;
  /** ISO timestamp; overwritten by `DwfStore.save` on every write. */
  updatedAt: string;
}
33
+
34
/**
 * DwfStore — atomic CRUD for a single run's DWF checkpoint.
 *
 * Concurrency: writes are atomic (atomicWriteJson). The DWF runner is the sole
 * writer during a run; `team resume` loads the checkpoint read-only before the
 * script re-executes. No file-lock is needed here because only one runner owns a
 * run's stateRoot at a time (run locks protect manifest transitions elsewhere).
 *
 * Note: the constructor takes the run's stateRoot directly (NOT cwd + runId) to
 * avoid a double-nesting bug — stateRoot is already <crewRoot>/state/runs/<runId>,
 * so the checkpoint lands at <crewRoot>/state/runs/<runId>/dwf-checkpoint.json.
 * This mirrors FileCheckpointStore (checkpoint.ts: constructor(stateRoot)).
 */
export class DwfStore {
  private readonly stateRoot: string;

  /** @param stateRoot - The run's state directory; the checkpoint file lives directly inside it. */
  constructor(stateRoot: string) {
    this.stateRoot = stateRoot;
  }

  /** Absolute-or-relative location of the checkpoint file, derived from `stateRoot`. */
  private get path(): string {
    return `${this.stateRoot}/dwf-checkpoint.json`;
  }

  /**
   * Load the checkpoint for this run's stateRoot.
   *
   * A missing file is the normal "fresh run" case and returns `undefined`
   * quietly. An unreadable, unparseable, or shape-invalid file also returns
   * `undefined` (the run starts fresh), but the reason is reported through
   * `logInternalError` — matching `save()` and `delete()` — so a discarded
   * checkpoint is diagnosable instead of silently lost.
   *
   * @returns The parsed checkpoint, or `undefined` if missing or corrupt. Only
   *   `runId` is validated; other fields are returned as stored.
   */
  load(): DwfCheckpointState | undefined {
    const path = this.path;
    try {
      if (!existsSync(path)) return undefined;
      const parsed: unknown = JSON.parse(readFileSync(path, "utf-8"));
      // Corrupt-guard: a valid checkpoint must be an object with a string runId
      // (mirrors GoalStore.load's typeof parsed.goalId !== "string" check).
      const hasRunId =
        typeof parsed === "object" && parsed !== null && typeof (parsed as { runId?: unknown }).runId === "string";
      if (!hasRunId) {
        logInternalError("dwf-state-store.load", new Error("checkpoint is not an object with a string runId"), `path=${path}`);
        return undefined;
      }
      return parsed as DwfCheckpointState;
    } catch (error) {
      // Read or JSON.parse failure: still treated as a fresh run, but no longer silent.
      logInternalError("dwf-state-store.load", error, `path=${path}`);
      return undefined;
    }
  }

  /**
   * Atomically persist a checkpoint state. Stamps `updatedAt` with the current
   * time (callers need not set it) and creates the parent directory if needed.
   *
   * @throws Re-throws any directory-creation or write error after logging it.
   */
  save(state: DwfCheckpointState): void {
    const path = this.path;
    const next = { ...state, updatedAt: new Date().toISOString() };
    try {
      mkdirSync(dirname(path), { recursive: true });
      atomicWriteJson(path, next);
    } catch (error) {
      logInternalError("dwf-state-store.save", error, `runId=${state.runId}`);
      throw error;
    }
  }

  /** Remove the checkpoint file (after a clean completion). Best-effort; failures are logged, never thrown. */
  delete(): void {
    const path = this.path;
    try {
      if (!existsSync(path)) return;
      unlinkSync(path);
    } catch (error) {
      logInternalError("dwf-state-store.delete", error);
    }
  }
}
+ }