lazyclaw 3.88.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,369 @@
1
+ // LazyClaw auto-resume engine (phase 2).
2
+ // State is persisted to <dir>/<sessionId>.json before each node starts and
3
+ // after it transitions to success/failed. Re-running a successful node is a
4
+ // no-op. Timeouts retry with exponential backoff up to maxRetries.
5
+
6
+ import fs from 'node:fs';
7
+ import path from 'node:path';
8
+ import { performance } from 'node:perf_hooks';
9
+ import { topologicalLevels, retryWithBackoff, runWithTimeout, settleWithConcurrency } from './executor.mjs';
10
+
11
+ const DEFAULT_DIR = '.workflow-state';
12
+
13
+ /** @typedef {'pending'|'running'|'success'|'failed'} NodeStatus */
14
+
15
+ /**
16
+ * @typedef {Object} NodeState
17
+ * @property {NodeStatus} status
18
+ * @property {unknown} [output]
19
+ * @property {number} [attempts]
20
+ * @property {string} [error]
21
+ * @property {number} [durationMs]
22
+ */
23
+
24
+ /**
25
+ * @typedef {Object} PersistedState
26
+ * @property {string} sessionId
27
+ * @property {string[]} order
28
+ * @property {Record<string, NodeState>} nodes
29
+ * @property {number} startedAt
30
+ * @property {number} updatedAt
31
+ */
32
+
33
/**
 * Build the path of the JSON checkpoint file that belongs to a session.
 *
 * @param {string} sessionId Session identifier; used verbatim as the file's base name.
 * @param {string} [dir] Directory that holds state files (defaults to DEFAULT_DIR).
 * @returns {string} `<dir>/<sessionId>.json`, joined with `path.join`.
 */
export function statePath(sessionId, dir = DEFAULT_DIR) {
  const fileName = `${sessionId}.json`;
  return path.join(dir, fileName);
}
40
+
41
/**
 * Read a session's persisted state from disk.
 *
 * @param {string} sessionId
 * @param {string} [dir] State directory (defaults to DEFAULT_DIR).
 * @returns {PersistedState | null} The parsed state, or `null` when no state
 *   file exists for the session. A file that exists but is not valid JSON
 *   makes `JSON.parse` throw; the error is not caught here.
 */
export function loadState(sessionId, dir = DEFAULT_DIR) {
  const file = statePath(sessionId, dir);
  if (fs.existsSync(file)) {
    const raw = fs.readFileSync(file, 'utf8');
    return JSON.parse(raw);
  }
  return null;
}
51
+
52
/**
 * Persist `state` to `<dir>/<sessionId>.json`.
 *
 * The directory is created if needed, `state.updatedAt` is stamped with the
 * current time (the passed object is mutated), and the JSON is written to a
 * sibling `.tmp` file that is then renamed over the real file, so readers
 * never observe a half-written state file.
 *
 * @param {PersistedState} state
 * @param {string} [dir] State directory (defaults to DEFAULT_DIR).
 */
export function saveState(state, dir = DEFAULT_DIR) {
  fs.mkdirSync(dir, { recursive: true });
  state.updatedAt = Date.now();

  const target = statePath(state.sessionId, dir);
  const scratch = target + '.tmp';
  const serialized = JSON.stringify(state, null, 2);

  // Write-then-rename: the final path only ever holds a complete document.
  fs.writeFileSync(scratch, serialized);
  fs.renameSync(scratch, target);
}
64
+
65
/**
 * Create the in-memory state for a brand-new session: every node starts as
 * 'pending' with zero attempts, `order` records the node ids in the order
 * given, and both timestamps are set to the same "now".
 *
 * @param {string} sessionId
 * @param {{ id: string }[]} nodes
 * @returns {PersistedState}
 */
function initState(sessionId, nodes) {
  const createdAt = Date.now();
  const order = [];
  const slots = [];
  for (const node of nodes) {
    order.push(node.id);
    slots.push([node.id, { status: 'pending', attempts: 0 }]);
  }
  return {
    sessionId,
    order,
    nodes: Object.fromEntries(slots),
    startedAt: createdAt,
    updatedAt: createdAt,
  };
}
75
+
76
+ // runWithTimeout lives in executor.mjs (imported above) — single
77
+ // source of truth so the timeout shape stays identical across both
78
+ // engines and any caller that wants to reuse it.
79
+
80
/**
 * Decide whether an error represents a timeout.
 *
 * An error counts as a timeout when its `code` is exactly 'TIMEOUT', or when
 * its `message` is a string that contains "timeout" in any letter case.
 *
 * @param {any} err
 * @returns {boolean}
 */
function isTimeout(err) {
  if (!err) return false;
  if (err.code === 'TIMEOUT') return true;

  const text = err.message;
  if (typeof text !== 'string') return false;
  return text === 'TIMEOUT' || text.toLowerCase().includes('timeout');
}
87
+
88
/**
 * Run `nodes` one after another as a checkpointed pipeline. Each node's
 * output becomes the next node's input (the first node receives `null`).
 *
 * Persistence (`<dir>/<sessionId>.json`):
 *  - before every attempt the node is saved as 'running';
 *  - after the attempt it is saved as 'success' (with its output) or 'failed';
 *  - a node already recorded as 'success' is skipped and its stored output
 *    is forwarded to the next node;
 *  - nodes left 'running' by an interrupted run are demoted to 'pending'
 *    when the state is loaded.
 *
 * Retry: errors recognised by `isTimeout` are retried after
 * `baseDelayMs * 2^(attempt - 1)` ms while `attempts < maxRetries`. Any
 * other error marks the node 'failed' and ends the run.
 *
 * Abort: when `signal` is aborted the run returns `code: 'ABORT'` and the
 * current node is rolled back to 'pending' so a later call can resume it.
 * A node that is already checkpointed as 'success' is never rolled back.
 *
 * @param {import('./executor.mjs').WorkflowNode[]} nodes
 * @param {{
 *   sessionId: string,
 *   dir?: string,
 *   maxRetries?: number,
 *   baseDelayMs?: number,
 *   timeoutMs?: number,
 *   sleep?: (ms: number) => Promise<void>,
 *   signal?: AbortSignal,
 * }} opts
 * @returns {Promise<{
 *   success: boolean,
 *   state: PersistedState,
 *   retryDelays: number[],
 *   executedNodes: string[],
 *   failedAt?: string,
 *   error?: string,
 *   code?: 'ABORT',
 * }>}
 */
export async function runPersistent(nodes, opts) {
  const dir = opts.dir ?? DEFAULT_DIR;
  const maxRetries = opts.maxRetries ?? 3;
  const baseDelay = opts.baseDelayMs ?? 100;
  const sleep = opts.sleep ?? (ms => new Promise(r => setTimeout(r, ms)));
  const signal = opts.signal;

  let state = loadState(opts.sessionId, dir);
  if (!state) {
    state = initState(opts.sessionId, nodes);
    saveState(state, dir);
  } else {
    // Demote every slot left 'running' by an interrupted run. Iterating the
    // slots themselves (as runPersistentDag does) also covers nodes that were
    // added after `state.order` was first written.
    for (const id of Object.keys(state.nodes)) {
      const ns = state.nodes[id];
      if (ns && ns.status === 'running') {
        state.nodes[id] = { status: 'pending', attempts: ns.attempts ?? 0 };
      }
    }
    saveState(state, dir);
  }

  const retryDelays = [];
  const executedNodes = [];
  let input = null;

  // Result shape shared by every abort exit.
  const abortResult = (failedAt) => ({
    success: false,
    state,
    failedAt,
    error: 'aborted',
    code: 'ABORT',
    retryDelays,
    executedNodes,
  });

  // Aborted state is *resumable*, not failed: leave the current node as
  // 'pending' (decrementing attempts so the aborted attempt is not counted)
  // and let a future runPersistent() call pick up where this one stopped.
  const buildAbortReturn = (currentNodeId, attempts) => {
    if (currentNodeId) {
      state.nodes[currentNodeId] = { status: 'pending', attempts: Math.max(0, (attempts ?? 1) - 1) };
      saveState(state, dir);
    }
    return abortResult(currentNodeId);
  };

  for (const node of nodes) {
    const ns = state.nodes[node.id] ?? { status: 'pending' };

    if (signal?.aborted) {
      // A checkpointed success must survive an abort: rolling it back would
      // discard its stored output and force a completed node to run again.
      if (ns.status === 'success') return abortResult(node.id);
      return buildAbortReturn(node.id, 0);
    }

    if (ns.status === 'success') {
      input = ns.output;
      continue;
    }

    let attempts = ns.attempts ?? 0;
    while (true) {
      if (signal?.aborted) return buildAbortReturn(node.id, attempts);
      attempts++;
      state.nodes[node.id] = { status: 'running', attempts };
      saveState(state, dir);
      const t0 = performance.now();
      try {
        const output = await runWithTimeout(() => node.execute(input, { signal }), opts.timeoutMs);
        const durationMs = performance.now() - t0;
        state.nodes[node.id] = { status: 'success', output, attempts, durationMs };
        saveState(state, dir);
        executedNodes.push(node.id);
        input = output;
        break;
      } catch (err) {
        // An abort surfaced through execute() (e.g. fetch with signal) is
        // treated like the checks above: roll back to 'pending' so resume
        // retries this node, and return ABORT.
        if (signal?.aborted || err?.code === 'ABORT') {
          return buildAbortReturn(node.id, attempts);
        }
        const msg = err instanceof Error ? err.message : String(err);
        if (isTimeout(err) && attempts < maxRetries) {
          const delay = baseDelay * Math.pow(2, attempts - 1);
          retryDelays.push(delay);
          await sleep(delay);
          continue;
        }
        const durationMs = performance.now() - t0;
        state.nodes[node.id] = { status: 'failed', attempts, error: msg, durationMs };
        saveState(state, dir);
        return { success: false, state, failedAt: node.id, error: msg, retryDelays, executedNodes };
      }
    }
  }
  return { success: true, state, retryDelays, executedNodes };
}
191
+
192
/**
 * Persistent DAG executor: topological-level scheduling combined with the
 * checkpoint-and-resume behaviour of `runPersistent`.
 *
 * How it differs from `runPersistent`:
 *  - Nodes declare `deps: string[]`; their position in `nodes` is irrelevant.
 *  - The nodes of one topological level are handed to
 *    `settleWithConcurrency`, optionally capped by `opts.concurrency`.
 *  - State is saved after every node transition through `saveState`.
 *  - Resume: slots still marked 'running' are reset to 'pending', and
 *    'success' slots are skipped while their outputs stay available.
 *
 * Input of a node is `{ depId: depOutput }`; a node without deps gets `null`.
 *
 * Outcomes:
 *  - cycle / unreachable nodes → `success: false`, `state: null`;
 *  - a node fails → the level is allowed to settle, then the first failure
 *    (in level order) is reported and later levels are not scheduled;
 *  - abort → every 'running' slot is demoted to 'pending' and the result
 *    carries `code: 'ABORT'`.
 *
 * @param {Array<{
 *   id: string,
 *   deps?: string[],
 *   execute: (input: Record<string, unknown> | null, opts?: { signal?: AbortSignal }) => Promise<unknown>,
 *   cleanup?: () => (Promise<void>|void),
 *   retry?: { max: number, baseDelayMs?: number },
 *   timeoutMs?: number,
 * }>} nodes
 * @param {{
 *   sessionId: string,
 *   dir?: string,
 *   timeoutMs?: number,
 *   signal?: AbortSignal,
 *   concurrency?: number,
 * }} opts
 */
export async function runPersistentDag(nodes, opts) {
  const stateDir = opts.dir ?? DEFAULT_DIR;
  const signal = opts.signal;

  // Levels are computed up front through the statically imported helper.
  const { levels, leftover } = topologicalLevels(nodes);
  if (leftover.length > 0) {
    return {
      success: false,
      state: null,
      failedAt: leftover[0],
      error: `workflow has a cycle or unreachable nodes: ${leftover.join(', ')}`,
      executedNodes: [],
    };
  }

  let state = loadState(opts.sessionId, stateDir);

  // Flip every slot still marked 'running' back to 'pending', keeping its
  // attempt counter. Used on resume and again on abort.
  const demoteRunning = () => {
    for (const [id, slot] of Object.entries(state.nodes)) {
      if (slot && slot.status === 'running') {
        state.nodes[id] = { status: 'pending', attempts: slot.attempts ?? 0 };
      }
    }
  };

  if (state) {
    // Resume: success outputs are left alone so fan-in nodes still see them.
    demoteRunning();
  } else {
    state = initState(opts.sessionId, nodes);
  }
  saveState(state, stateDir);

  const nodeById = new Map(nodes.map(n => [n.id, n]));
  const executedNodes = [];

  // Aborted work is resumable rather than failed: demote, persist, report.
  const buildAbortReturn = (failedAtId) => {
    demoteRunning();
    saveState(state, stateDir);
    return {
      success: false,
      state,
      failedAt: failedAtId,
      error: 'aborted',
      code: 'ABORT',
      executedNodes,
    };
  };

  // Execute one node of a level and describe the outcome as a plain record.
  const runNode = async (id) => {
    const before = state.nodes[id] ?? { status: 'pending' };
    if (before.status === 'success') return { id, ok: true, skipped: true };

    const node = nodeById.get(id);
    const deps = node.deps || [];
    let input = null;
    if (deps.length !== 0) {
      input = Object.fromEntries(deps.map(d => [d, state.nodes[d]?.output]));
    }

    // Persist 'running' before starting. Every node writes the shared state
    // object, and each node only touches its own slot.
    state.nodes[id] = { status: 'running', attempts: (before.attempts ?? 0) + 1 };
    saveState(state, stateDir);
    const startedAt = performance.now();

    // A finite per-node timeout wins over the workflow-wide default.
    const timeoutMs = Number.isFinite(node.timeoutMs) ? node.timeoutMs : opts.timeoutMs;
    const attempt = () => runWithTimeout(() => node.execute(input, { signal }), timeoutMs);

    try {
      // node.retry only governs retries inside this run; resuming a failed
      // node in a later run is independent of it.
      let output;
      if (node.retry && Number.isFinite(node.retry.max) && node.retry.max > 0) {
        output = await retryWithBackoff(attempt, node.retry);
      } else {
        output = await attempt();
      }
      const durationMs = performance.now() - startedAt;
      state.nodes[id] = { status: 'success', output, attempts: state.nodes[id].attempts, durationMs };
      saveState(state, stateDir);
      return { id, ok: true };
    } catch (err) {
      // An abort seen through execute() is reported upward; the slot stays
      // 'running' here and is demoted by buildAbortReturn.
      if (signal?.aborted || err?.code === 'ABORT') {
        return { id, aborted: true };
      }
      const msg = err instanceof Error ? err.message : String(err);
      const durationMs = performance.now() - startedAt;
      state.nodes[id] = { status: 'failed', error: msg, attempts: state.nodes[id].attempts, durationMs };
      saveState(state, stateDir);
      return { id, ok: false, error: msg };
    }
  };

  for (const [levelIdx, levelIds] of levels.entries()) {
    if (signal?.aborted) return buildAbortReturn(levelIds[0]);

    const outcomes = await settleWithConcurrency(levelIds, id => runNode(id), opts.concurrency);
    const settled = outcomes.map(s => {
      if (s.status === 'fulfilled') return s.value;
      return { id: 'unknown', ok: false, error: String(s.reason) };
    });

    const firstAbort = settled.find(r => r.aborted) ?? null;
    const firstFailure = settled.find(r => !r.aborted && !r.ok) ?? null;
    for (const r of settled) {
      if (!r.aborted && r.ok && !r.skipped) executedNodes.push(r.id);
    }

    if (firstAbort || signal?.aborted) {
      // Aborted inside execute(): blame that node. Signal flipped after a
      // clean level: blame the first node of the next level, or this level's
      // first node when there is no next level.
      const fallbackId = levels[levelIdx + 1]?.[0] ?? levelIds[0];
      return buildAbortReturn(firstAbort?.id ?? fallbackId);
    }
    if (firstFailure) {
      return { success: false, state, failedAt: firstFailure.id, error: firstFailure.error, executedNodes };
    }
  }

  return { success: true, state, executedNodes };
}
@@ -0,0 +1,318 @@
1
+ // Pure transformations over persisted workflow state.
2
+ // Lifted out of the CLI so both `lazyclaw inspect` and the daemon's
3
+ // /workflows endpoint can produce the same shape — a single source
4
+ // of truth for what "workflow progress" looks like over the wire.
5
+ //
6
+ // We intentionally re-implement state-file reading here (a 3-line
7
+ // function) instead of importing from `persistent.mjs`. The daemon's
8
+ // import graph stays free of the workflow engine — under tsx/CJS
9
+ // conversion in @playwright/test, importing engine modules from the
10
+ // daemon's static graph has historically broken.
11
+
12
+ import fs from 'node:fs';
13
+ import path from 'node:path';
14
+
15
/**
 * Read `<dir>/<sessionId>.json` and return the parsed document.
 *
 * A missing file yields `null` (the session was never written). A file
 * that exists but does not parse makes `JSON.parse` throw, so corruption is
 * reported to the caller instead of looking like "no state".
 *
 * @param {string} sessionId
 * @param {string} dir
 * @returns {object | null}
 */
export function loadStateFile(sessionId, dir) {
  const file = path.join(dir, `${sessionId}.json`);
  return fs.existsSync(file)
    ? JSON.parse(fs.readFileSync(file, 'utf8'))
    : null;
}
30
+
31
+ /**
32
+ * @typedef {{ status?: 'pending'|'running'|'success'|'failed', output?: unknown, attempts?: number, error?: string, durationMs?: number }} NodeState
33
+ * @typedef {{ sessionId: string, order?: string[], nodes: Record<string, NodeState>, startedAt?: number, updatedAt?: number }} PersistedState
34
+ * @typedef {{ total: number, pending: number, running: number, success: number, failed: number, done: boolean, resumable: boolean, durationMs: number }} StateSummary
35
+ */
36
+
37
/**
 * Reduce a persisted state object to a summary block plus the list of
 * failed nodes. The same shape is produced for a single session and for
 * one element of a listing.
 *
 * Counting rules:
 *  - a node without a status counts as 'pending';
 *  - a status outside pending/running/success/failed adds to `total` only;
 *  - `done` requires at least one node and every node in 'success';
 *  - `resumable` is `!done` with no 'failed' node, so a state with zero
 *    nodes also reports `resumable: true`;
 *  - `durationMs` sums every numeric per-node `durationMs`.
 *
 * @param {PersistedState} state
 * @returns {{ summary: StateSummary, failedNodes: Array<{ id: string, error?: string, attempts?: number }> }}
 */
export function summarizeState(state) {
  const counts = { pending: 0, running: 0, success: 0, failed: 0 };
  const failedNodes = [];
  let totalDurationMs = 0;

  const nodes = state?.nodes || {};
  const ids = Object.keys(nodes);
  for (const id of ids) {
    const n = nodes[id];
    const status = n?.status || 'pending';
    // Own-property check: a stray status such as 'toString' must not match
    // an inherited Object.prototype member and leak a NaN key into the
    // summary through the spread below.
    if (Object.prototype.hasOwnProperty.call(counts, status)) counts[status]++;
    if (status === 'failed') failedNodes.push({ id, error: n.error, attempts: n.attempts });
    if (typeof n?.durationMs === 'number') totalDurationMs += n.durationMs;
  }

  const total = ids.length;
  const allDone = total > 0 && counts.success === total;
  const hasFailure = counts.failed > 0;
  return {
    summary: {
      total,
      ...counts,
      done: allDone,
      // Pending/running leftovers from an interrupted run count as work that
      // can be resumed; a failed node makes the session non-resumable.
      resumable: !allDone && !hasFailure,
      durationMs: totalDurationMs,
    },
    failedNodes,
  };
}
74
+
75
/**
 * Aggregate per-node statistics across every persisted session in a state
 * directory. For each node id seen in any session this reports how often
 * the node appears, how its statuses are distributed, and the
 * min / max / avg / p50 / p95 / p99 / total of its recorded durations.
 *
 * Files that are unreadable, are not valid JSON, or lack `sessionId` or an
 * object-valued `nodes` are skipped. A node without a status counts as
 * 'pending'. Only finite `durationMs` values feed the duration figures; a
 * node with none reports 0 for all of them.
 *
 * @param {string} dir
 * @param {{ filter?: string }} [opts] Optional case-insensitive sessionId
 *   substring filter; only matching sessions contribute to the aggregate.
 * @returns {{ sessionCount: number, nodeStats: Record<string, {
 *   count: number,
 *   successCount: number,
 *   failedCount: number,
 *   pendingCount: number,
 *   runningCount: number,
 *   minDurationMs: number,
 *   maxDurationMs: number,
 *   avgDurationMs: number,
 *   p50DurationMs: number,
 *   p95DurationMs: number,
 *   p99DurationMs: number,
 *   totalDurationMs: number,
 * }> }}
 * @throws {Error} With `code === 'ENOENT'` when `dir` does not exist.
 */
export function aggregateNodeStats(dir, opts = {}) {
  if (!fs.existsSync(dir)) {
    throw Object.assign(new Error(`State directory ${dir} does not exist`), { code: 'ENOENT' });
  }

  const filterLower = opts?.filter ? String(opts.filter).toLowerCase() : null;
  // A Map keeps node ids such as 'constructor' or 'toString' from resolving
  // to inherited Object.prototype members, as they would on a plain object.
  const accumulator = new Map();
  let sessionCount = 0;

  for (const f of fs.readdirSync(dir)) {
    if (!f.endsWith('.json')) continue;
    let state;
    try {
      state = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
    } catch { continue; }
    if (!state?.sessionId || !state?.nodes || typeof state.nodes !== 'object') continue;
    // String() keeps a non-string sessionId (e.g. a number) from throwing.
    if (filterLower && !String(state.sessionId).toLowerCase().includes(filterLower)) continue;
    sessionCount++;

    for (const id of Object.keys(state.nodes)) {
      const ns = state.nodes[id];
      const status = ns?.status || 'pending';
      let slot = accumulator.get(id);
      if (!slot) {
        slot = {
          count: 0, successCount: 0, failedCount: 0, pendingCount: 0, runningCount: 0,
          durations: [],
        };
        accumulator.set(id, slot);
      }
      slot.count++;
      if (status === 'success') slot.successCount++;
      else if (status === 'failed') slot.failedCount++;
      else if (status === 'pending') slot.pendingCount++;
      else if (status === 'running') slot.runningCount++;
      if (Number.isFinite(ns?.durationMs)) slot.durations.push(ns.durationMs);
    }
  }

  // Nearest-rank percentile: ceil(p * n) is the 1-indexed position, minus 1
  // for array access. An empty list yields 0 by convention.
  const percentile = (sorted, p) => {
    if (sorted.length === 0) return 0;
    const idx = Math.max(0, Math.ceil(p * sorted.length) - 1);
    return sorted[Math.min(idx, sorted.length - 1)];
  };
  // Round to two decimals.
  const r2 = (n) => Math.round(n * 100) / 100;

  // Object.fromEntries defines own data properties, so any node id can be
  // used as a key of the returned record.
  const nodeStats = Object.fromEntries([...accumulator].map(([id, slot]) => {
    const sorted = [...slot.durations].sort((a, b) => a - b);
    const total = slot.durations.reduce((s, x) => s + x, 0);
    return [id, {
      count: slot.count,
      successCount: slot.successCount,
      failedCount: slot.failedCount,
      pendingCount: slot.pendingCount,
      runningCount: slot.runningCount,
      minDurationMs: sorted.length ? sorted[0] : 0,
      maxDurationMs: sorted.length ? sorted[sorted.length - 1] : 0,
      avgDurationMs: sorted.length ? r2(total / sorted.length) : 0,
      p50DurationMs: r2(percentile(sorted, 0.5)),
      p95DurationMs: r2(percentile(sorted, 0.95)),
      p99DurationMs: r2(percentile(sorted, 0.99)),
      totalDurationMs: r2(total),
    }];
  }));

  return { sessionCount, nodeStats };
}
173
+
174
/**
 * Compute the critical path (longest weighted path) through a DAG.
 *
 * The caller supplies the workflow shape (`graphNodes`, with deps) and the
 * persisted per-node state (`stateNodes`, with `durationMs`). Nodes are
 * visited in topological order (Kahn's algorithm) and a DP keeps, per node,
 * the heaviest chain ending there:
 *
 *   bestFinish[node] = max over deps (bestFinish[dep], else 0) + duration[node]
 *
 * Ties on weight are broken by chain length (more nodes wins), so a graph
 * whose durations are all 0 still reports its deepest dependency chain.
 * The path is recovered by walking the predecessor links back from the
 * node with the best finish.
 *
 * Edge cases:
 *  - deps naming ids that are not in `graphNodes` are ignored;
 *  - a dep listed more than once counts as one edge;
 *  - nodes on a cycle never reach indegree 0 and are left out of the path;
 *    the result covers the acyclic portion instead of throwing;
 *  - an empty graph yields `{ path: [], totalMs: 0, perNodeMs: {} }`.
 *
 * @param {{ id: string, deps?: string[] }[]} graphNodes Workflow shape (deps = id[])
 * @param {Record<string, { durationMs?: number, status?: string }>} stateNodes Persisted state (durationMs)
 * @returns {{ path: string[], totalMs: number, perNodeMs: Record<string, number> }}
 *   - path: ordered list of node ids on the critical path
 *   - totalMs: sum of durationMs across the path
 *   - perNodeMs: durationMs lookup for every node (0 if missing or non-finite)
 */
export function criticalPath(graphNodes, stateNodes) {
  const ids = graphNodes.map(n => n.id);
  // Deduplicated deps: a repeated dep is one edge, so the indegree counted
  // here always matches the number of decrements performed below.
  const uniqueDeps = (n) => [...new Set(n.deps || [])];
  const idToDeps = new Map(graphNodes.map(n => [n.id, uniqueDeps(n)]));

  // Kahn's algorithm over adjacency lists (dep -> dependents): O(V + E).
  const indegree = new Map(ids.map(id => [id, 0]));
  const dependents = new Map(ids.map(id => [id, []]));
  for (const n of graphNodes) {
    for (const d of uniqueDeps(n)) {
      if (!indegree.has(d)) continue; // dep outside the graph: not an edge
      indegree.set(n.id, indegree.get(n.id) + 1);
      dependents.get(d).push(n.id);
    }
  }
  // `topo` doubles as the work queue; `head` replaces repeated shift() calls.
  const topo = ids.filter(id => indegree.get(id) === 0);
  for (let head = 0; head < topo.length; head++) {
    for (const m of dependents.get(topo[head])) {
      const next = indegree.get(m) - 1;
      indegree.set(m, next);
      if (next === 0) topo.push(m);
    }
  }

  const perNodeMs = {};
  for (const id of ids) {
    const ns = stateNodes?.[id];
    perNodeMs[id] = (ns && Number.isFinite(ns.durationMs)) ? ns.durationMs : 0;
  }

  const bestFinish = new Map();
  const chainLen = new Map(); // node count of the best chain ending at id
  const prev = new Map();
  let bestEnd = null;
  let bestEndFinish = -Infinity;
  let bestEndChainLen = 0;

  for (const id of topo) {
    let bestPred = null;
    let bestPredFinish = 0;
    let bestPredChainLen = 0;
    for (const d of idToDeps.get(id) || []) {
      const f = bestFinish.get(d);
      if (typeof f !== 'number') continue; // dep unknown or never scheduled
      const cl = chainLen.get(d) || 0;
      // Weight first, then chain length.
      if (f > bestPredFinish || (f === bestPredFinish && cl > bestPredChainLen)) {
        bestPredFinish = f;
        bestPredChainLen = cl;
        bestPred = d;
      }
    }
    const finish = bestPredFinish + perNodeMs[id];
    const length = bestPredChainLen + 1;
    bestFinish.set(id, finish);
    chainLen.set(id, length);
    prev.set(id, bestPred);
    if (finish > bestEndFinish || (finish === bestEndFinish && length > bestEndChainLen)) {
      bestEndFinish = finish;
      bestEndChainLen = length;
      bestEnd = id;
    }
  }

  // Walk the predecessor links back from the best end node, then flip.
  const chain = [];
  for (let cur = bestEnd; cur != null; cur = prev.get(cur)) chain.push(cur);
  chain.reverse();

  return {
    path: chain,
    totalMs: Math.max(0, bestEndFinish),
    perNodeMs,
  };
}
276
+
277
/**
 * Read every `*.json` state file in `dir` and return a sorted listing.
 *
 * Ordering: newest activity first (by `updatedAt`), then by sessionId so
 * ties are deterministic. A session without a finite `updatedAt` is treated
 * as the oldest, which keeps the comparator consistent instead of letting a
 * NaN difference fall through to the name comparison.
 *
 * Files that cannot be read or parsed, or that carry no `sessionId`, are
 * silently skipped; files not ending in `.json` (such as a left-over
 * `.tmp`) are never read.
 *
 * @param {string} dir
 * @returns {Array<{ sessionId: string, summary: StateSummary, failedNodes: Array<{ id: string, error?: string, attempts?: number }>, startedAt?: number, updatedAt?: number }>}
 * @throws {Error} With `code === 'ENOENT'` when `dir` does not exist; the
 *   caller decides whether that is an error or an empty result.
 */
export function listSessions(dir) {
  if (!fs.existsSync(dir)) {
    throw Object.assign(new Error(`State directory ${dir} does not exist`), { code: 'ENOENT' });
  }

  const sessions = [];
  for (const f of fs.readdirSync(dir)) {
    if (!f.endsWith('.json')) continue;
    try {
      const state = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
      if (!state?.sessionId) continue;
      const { summary, failedNodes } = summarizeState(state);
      sessions.push({
        sessionId: state.sessionId,
        summary,
        failedNodes,
        startedAt: state.startedAt,
        updatedAt: state.updatedAt,
      });
    } catch {
      // Best-effort listing: skip non-state JSON and corrupt files.
    }
  }

  // Missing or non-finite timestamps sort as 0 (oldest). String() keeps a
  // non-string sessionId from throwing inside the comparator.
  const stamp = (s) => (Number.isFinite(s.updatedAt) ? s.updatedAt : 0);
  sessions.sort((a, b) =>
    (stamp(b) - stamp(a)) || String(a.sessionId).localeCompare(String(b.sessionId)));
  return sessions;
}