npm - lazyclaw - Versions diffs - 3.88.0 - Mend

lazyclaw 3.88.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/workflow/persistent.mjs ADDED Viewed

@@ -0,0 +1,369 @@
+// LazyClaw auto-resume engine (phase 2).
+// State is persisted to <dir>/<sessionId>.json before each node starts and
+// after it transitions to success/failed. Re-running a successful node is a
+// no-op. Timeouts retry with exponential backoff up to maxRetries.
+import fs from 'node:fs';
+import path from 'node:path';
+import { performance } from 'node:perf_hooks';
+import { topologicalLevels, retryWithBackoff, runWithTimeout, settleWithConcurrency } from './executor.mjs';
+const DEFAULT_DIR = '.workflow-state';
+/** @typedef {'pending'|'running'|'success'|'failed'} NodeStatus */
+/**
+ * @typedef {Object} NodeState
+ * @property {NodeStatus} status
+ * @property {unknown} [output]
+ * @property {number} [attempts]
+ * @property {string} [error]
+ * @property {number} [durationMs]
+ */
+/**
+ * @typedef {Object} PersistedState
+ * @property {string} sessionId
+ * @property {string[]} order
+ * @property {Record<string, NodeState>} nodes
+ * @property {number} startedAt
+ * @property {number} updatedAt
+ */
+/**
+ * Resolve the on-disk location of a session's checkpoint file.
+ *
+ * @param {string} sessionId  Session identifier; used verbatim as the file's base name.
+ * @param {string} [dir]      Directory holding state files (defaults to DEFAULT_DIR).
+ * @returns {string} `<dir>/<sessionId>.json`, joined by `path.join`.
+ */
+export function statePath(sessionId, dir = DEFAULT_DIR) {
+  const fileName = `${sessionId}.json`;
+  return path.join(dir, fileName);
+}
+/**
+ * Read and parse the checkpoint for `sessionId`.
+ *
+ * The file is read directly rather than probed with `fs.existsSync` first:
+ * check-then-read can race with another process removing the file between
+ * the two calls, turning "no state" into an unexpected throw.
+ *
+ * @param {string} sessionId
+ * @param {string} [dir]
+ * @returns {PersistedState | null} Parsed state, or null when no file exists.
+ * @throws {SyntaxError} When the file exists but does not contain valid JSON.
+ * @throws {Error} Any other read error (e.g. EACCES, EISDIR) is propagated.
+ */
+export function loadState(sessionId, dir = DEFAULT_DIR) {
+  let raw;
+  try {
+    raw = fs.readFileSync(statePath(sessionId, dir), 'utf8');
+  } catch (err) {
+    // A missing file, or a parent that is missing / not a directory, means
+    // "this session has never been written".
+    if (err?.code === 'ENOENT' || err?.code === 'ENOTDIR') return null;
+    throw err;
+  }
+  return JSON.parse(raw);
+}
+/**
+ * Persist `state` to `<dir>/<sessionId>.json`.
+ *
+ * The JSON is written to a sibling `.tmp` file and then renamed over the
+ * target, so a reader never observes a half-written state file. If the
+ * write or the rename fails, the temporary file is removed (best effort)
+ * before the original error is rethrown, so failed saves do not leave
+ * orphaned `.tmp` files behind.
+ *
+ * Side effect: `state.updatedAt` is overwritten with the current time.
+ *
+ * @param {PersistedState} state
+ * @param {string} [dir]
+ * @throws {Error} Directory creation, serialization, write or rename errors.
+ */
+export function saveState(state, dir = DEFAULT_DIR) {
+  fs.mkdirSync(dir, { recursive: true });
+  state.updatedAt = Date.now();
+  const p = statePath(state.sessionId, dir);
+  const tmp = `${p}.tmp`;
+  try {
+    fs.writeFileSync(tmp, JSON.stringify(state, null, 2));
+    fs.renameSync(tmp, p);
+  } catch (err) {
+    try {
+      fs.rmSync(tmp, { force: true });
+    } catch {
+      // Cleanup is best effort; the original failure is the one worth reporting.
+    }
+    throw err;
+  }
+}
+/**
+ * Build the initial checkpoint for a brand-new session: every node starts
+ * as `pending` with zero attempts, and both timestamps come from a single
+ * clock read.
+ *
+ * @param {string} sessionId
+ * @param {{ id: string }[]} nodes  Workflow nodes; only `id` is read here.
+ * @returns {PersistedState}
+ */
+function initState(sessionId, nodes) {
+  const order = [];
+  const entries = [];
+  for (const { id } of nodes) {
+    order.push(id);
+    entries.push([id, { status: 'pending', attempts: 0 }]);
+  }
+  const createdAt = Date.now();
+  return {
+    sessionId,
+    order,
+    nodes: Object.fromEntries(entries),
+    startedAt: createdAt,
+    updatedAt: createdAt,
+  };
+}
+// runWithTimeout lives in executor.mjs (imported above) — single
+// source of truth so the timeout shape stays identical across both
+// engines and any caller that wants to reuse it.
+/**
+ * Classify an error as a timeout. An error counts when its `code` is
+ * exactly 'TIMEOUT', or when its `message` is a string containing
+ * "timeout" in any letter case (which also covers a message of exactly
+ * 'TIMEOUT').
+ *
+ * @param {unknown} err
+ * @returns {boolean}
+ */
+function isTimeout(err) {
+  if (!err) return false;
+  const { code, message } = err;
+  const mentionsTimeout = typeof message === 'string' && message.toLowerCase().includes('timeout');
+  return code === 'TIMEOUT' || mentionsTimeout;
+}
+/**
+ * Run `nodes` one after another, checkpointing to disk around every node so
+ * an interrupted run can be resumed with the same `sessionId`.
+ *
+ * Behaviour:
+ *   - State is loaded from `<dir>/<sessionId>.json` when present; nodes left
+ *     as 'running' by an interrupted run are demoted to 'pending'.
+ *   - Nodes already recorded as 'success' are skipped and their stored
+ *     output becomes the next node's input.
+ *   - Each node receives the previous node's output (`null` for the first).
+ *   - Errors recognised by `isTimeout` are retried with exponential backoff
+ *     (`baseDelayMs * 2^(attempt-1)`) while `attempts < maxRetries`; any
+ *     other error marks the node 'failed' and ends the run.
+ *   - An abort is resumable, not a failure: a node that was started is rolled
+ *     back to 'pending'; a node that was never started in this run keeps
+ *     whatever record it already had (in particular a stored 'success' and
+ *     its output are never discarded by an abort).
+ *
+ * @param {import('./executor.mjs').WorkflowNode[]} nodes
+ * @param {{
+ *   sessionId: string,
+ *   dir?: string,
+ *   maxRetries?: number,
+ *   baseDelayMs?: number,
+ *   timeoutMs?: number,
+ *   sleep?: (ms: number) => Promise<void>,
+ *   signal?: AbortSignal,
+ * }} opts
+ * @returns {Promise<{
+ *   success: boolean,
+ *   state: PersistedState,
+ *   failedAt?: string,
+ *   error?: string,
+ *   code?: 'ABORT',
+ *   retryDelays: number[],
+ *   executedNodes: string[],
+ * }>}
+ */
+export async function runPersistent(nodes, opts) {
+  const dir = opts.dir ?? DEFAULT_DIR;
+  const maxRetries = opts.maxRetries ?? 3;
+  const baseDelay = opts.baseDelayMs ?? 100;
+  const sleep = opts.sleep ?? (ms => new Promise(r => setTimeout(r, ms)));
+  const signal = opts.signal;
+  let state = loadState(opts.sessionId, dir);
+  if (!state) {
+    state = initState(opts.sessionId, nodes);
+    saveState(state, dir);
+  } else {
+    // A 'running' record can only come from a run that died mid-node.
+    for (const id of state.order) {
+      const ns = state.nodes[id];
+      if (ns && ns.status === 'running') {
+        state.nodes[id] = { status: 'pending', attempts: ns.attempts ?? 0 };
+      }
+    }
+    saveState(state, dir);
+  }
+  const retryDelays = [];
+  const executedNodes = [];
+  let input = null;
+  // Result shape shared by every abort path.
+  const abortResult = (currentNodeId) => ({
+    success: false,
+    state,
+    failedAt: currentNodeId,
+    error: 'aborted',
+    code: 'ABORT',
+    retryDelays,
+    executedNodes,
+  });
+  // Aborted state is *resumable*, not failed: leave the current
+  // node as 'pending' (decrementing attempts so resume retries it)
+  // and let a future runPersistent() call pick up where this one
+  // stopped. That's the same teardown path as a SIGKILL'd run, so
+  // resume-by-abort and resume-by-crash converge to the same shape.
+  // Only call this for a node that has been started in this run.
+  const buildAbortReturn = (currentNodeId, attempts) => {
+    if (currentNodeId) {
+      state.nodes[currentNodeId] = { status: 'pending', attempts: Math.max(0, (attempts ?? 1) - 1) };
+      saveState(state, dir);
+    }
+    return abortResult(currentNodeId);
+  };
+  for (const node of nodes) {
+    // Abort observed between nodes: this node has not been touched in this
+    // run, so its persisted record must stay as it is. Rewriting it here
+    // would erase a stored 'success' (and its output) or reset the attempt
+    // counter of a node that never ran.
+    if (signal?.aborted) return abortResult(node.id);
+    const ns = state.nodes[node.id] ?? { status: 'pending' };
+    if (ns.status === 'success') {
+      input = ns.output;
+      continue;
+    }
+    let attempts = ns.attempts ?? 0;
+    while (true) {
+      if (signal?.aborted) return buildAbortReturn(node.id, attempts);
+      attempts++;
+      // Persist 'running' before executing so a crash is visible on resume.
+      state.nodes[node.id] = { status: 'running', attempts };
+      saveState(state, dir);
+      const t0 = performance.now();
+      try {
+        const output = await runWithTimeout(() => node.execute(input, { signal }), opts.timeoutMs);
+        const durationMs = performance.now() - t0;
+        state.nodes[node.id] = { status: 'success', output, attempts, durationMs };
+        saveState(state, dir);
+        executedNodes.push(node.id);
+        input = output;
+        break;
+      } catch (err) {
+        // An abort surfaced through execute() (e.g. fetch with signal)
+        // is treated like an abort between attempts: roll back to
+        // 'pending' so resume retries this node, return ABORT.
+        if (signal?.aborted || err?.code === 'ABORT') {
+          return buildAbortReturn(node.id, attempts);
+        }
+        const msg = err instanceof Error ? err.message : String(err);
+        if (isTimeout(err) && attempts < maxRetries) {
+          const delay = baseDelay * Math.pow(2, attempts - 1);
+          retryDelays.push(delay);
+          await sleep(delay);
+          continue;
+        }
+        // durationMs covers the final attempt only (t0 is reset per attempt).
+        const durationMs = performance.now() - t0;
+        state.nodes[node.id] = { status: 'failed', attempts, error: msg, durationMs };
+        saveState(state, dir);
+        return { success: false, state, failedAt: node.id, error: msg, retryDelays, executedNodes };
+      }
+    }
+  }
+  return { success: true, state, retryDelays, executedNodes };
+}
+/**
+ * Persistent DAG executor — combines `runParallel`'s topological-level
+ * scheduling with `runPersistent`'s checkpoint-and-resume semantics.
+ *
+ * Differences from `runPersistent`:
+ *   - Nodes declare `deps: string[]` (order in `nodes` array irrelevant).
+ *   - Each topological level is dispatched through `settleWithConcurrency`,
+ *     with `opts.concurrency` passed through as its limit argument.
+ *   - State is saved after every node transition through the same
+ *     `saveState` (tmp file + rename) that `runPersistent` uses.
+ *   - Resume: load state; running-status nodes from a prior interrupted
+ *     run get reset to pending; success nodes are skipped.
+ *   - Retries happen only for nodes that set `retry` (handed to
+ *     `retryWithBackoff`); there is no built-in timeout retry here.
+ *
+ * Each node receives `{ depId: depOutput }` as input. A node with no
+ * deps gets `null`.
+ *
+ * @param {Array<{
+ *   id: string,
+ *   deps?: string[],
+ *   execute: (input: Record<string, unknown> | null, opts?: { signal?: AbortSignal }) => Promise<unknown>,
+ *   cleanup?: () => (Promise<void>|void),
+ *   retry?: { max: number, baseDelayMs?: number },
+ *   timeoutMs?: number,
+ * }>} nodes
+ * @param {{
+ *   sessionId: string,
+ *   dir?: string,
+ *   timeoutMs?: number,
+ *   signal?: AbortSignal,
+ *   concurrency?: number,
+ * }} opts
+ * @returns {Promise<{
+ *   success: boolean,
+ *   state: PersistedState | null,
+ *   failedAt?: string,
+ *   error?: string,
+ *   code?: 'ABORT',
+ *   executedNodes: string[],
+ * }>} `state` is null only when the graph is rejected before any state is
+ *     loaded (cycle / unreachable nodes). `code: 'ABORT'` marks a resumable
+ *     abort rather than a node failure.
+ */
+export async function runPersistentDag(nodes, opts) {
+  const dir = opts.dir ?? DEFAULT_DIR;
+  const signal = opts.signal;
+  // Compute topological levels at start. (Static import at module top
+  // — a dynamic `import()` here trips the tsx loader's CJS conversion
+  // path under @playwright/test in some configurations.)
+  const { levels, leftover } = topologicalLevels(nodes);
+  // Any leftover id means the graph could not be fully levelled; bail out
+  // before touching the state file.
+  if (leftover.length > 0) {
+    return {
+      success: false,
+      state: null,
+      failedAt: leftover[0],
+      error: `workflow has a cycle or unreachable nodes: ${leftover.join(', ')}`,
+      executedNodes: [],
+    };
+  }
+  // State init / resume — same shape as runPersistent so a session id
+  // doesn't accidentally collide between modes.
+  let state = loadState(opts.sessionId, dir);
+  if (!state) {
+    state = initState(opts.sessionId, nodes);
+    saveState(state, dir);
+  } else {
+    // Demote any 'running' from a prior interrupted run back to pending.
+    // success outputs are preserved so a fan-in node sees its predecessors.
+    for (const id of Object.keys(state.nodes)) {
+      const ns = state.nodes[id];
+      if (ns && ns.status === 'running') {
+        state.nodes[id] = { status: 'pending', attempts: ns.attempts ?? 0 };
+      }
+    }
+    saveState(state, dir);
+  }
+  const idToNode = new Map(nodes.map(n => [n.id, n]));
+  const executedNodes = [];
+  // Shared abort handler — same demote-to-pending semantic as
+  // runPersistent: aborted nodes are *resumable*, not failed. After
+  // an abort, demote anything still 'running' back to 'pending' so a
+  // future runPersistentDag() picks them up. Returns the result shape.
+  // Note: the attempt counter is kept as-is here (it is not decremented).
+  const buildAbortReturn = (failedAtId) => {
+    for (const id of Object.keys(state.nodes)) {
+      const ns = state.nodes[id];
+      if (ns && ns.status === 'running') {
+        state.nodes[id] = { status: 'pending', attempts: ns.attempts ?? 0 };
+      }
+    }
+    saveState(state, dir);
+    return {
+      success: false,
+      state,
+      failedAt: failedAtId,
+      error: 'aborted',
+      code: 'ABORT',
+      executedNodes,
+    };
+  };
+  for (let levelIdx = 0; levelIdx < levels.length; levelIdx++) {
+    const levelIds = levels[levelIdx];
+    // failedAt for an abort = first node of the next level we'd
+    // schedule. If we're already past the last level, use the
+    // current level's first id (the abort caught us between final
+    // level and "all done").
+    const nextLevelFirstId = () => levels[levelIdx + 1]?.[0] ?? levelIds[0];
+    // Abort seen before this level starts: nothing in it has run yet.
+    if (signal?.aborted) return buildAbortReturn(levelIds[0]);
+    // Each node in the level is independent of its peers — run concurrently.
+    // We collect both success outputs and the first failure; on failure we
+    // stop scheduling future levels (same as runParallel) but persist the
+    // success outputs from the level that *did* finish before the throw.
+    // opts.concurrency caps how many nodes within a single level run
+    // at the same time — same semantic as runParallel. Default
+    // unbounded (every level node runs in parallel).
+    const settled = (await settleWithConcurrency(levelIds, async (id) => {
+      const ns = state.nodes[id] ?? { status: 'pending' };
+      // Already completed in an earlier run: report as ok but skipped so it
+      // is not added to executedNodes.
+      if (ns.status === 'success') return { id, ok: true, skipped: true };
+      const node = idToNode.get(id);
+      const deps = node.deps || [];
+      // Input is a { depId: output } map read from persisted state, so
+      // outputs restored from disk and outputs produced in this run look
+      // the same to the node.
+      const input = deps.length === 0 ? null : Object.fromEntries(deps.map(d => [d, state.nodes[d]?.output]));
+      // Mark running and persist before we start. Concurrent runs of
+      // different nodes will each write the state file; saveState's
+      // tmp+rename keeps each write atomic, but the final file content
+      // is the LAST writer's view — that's fine because each node only
+      // mutates its own slot.
+      state.nodes[id] = { status: 'running', attempts: (ns.attempts ?? 0) + 1 };
+      saveState(state, dir);
+      const t0 = performance.now();
+      // Wrap each execute() in retryWithBackoff when node.retry is set.
+      // The retry budget lives entirely *inside* this attempt — outer
+      // resume semantics are unchanged: a level failure still flips
+      // node status to 'failed' on disk, and a future runPersistentDag
+      // call retries it from scratch (resume-level retry, separate from
+      // node.retry). This composition gives users two distinct knobs:
+      //   - node.retry  → recover transient faults within one run
+      //   - resume      → recover catastrophic faults across runs
+      // node.timeoutMs (per-node) takes precedence over opts.timeoutMs
+      // (workflow-wide default) so a fast node with a tight cap doesn't
+      // inherit a slower node's lenient cap.
+      const effectiveTimeout = Number.isFinite(node.timeoutMs) ? node.timeoutMs : opts.timeoutMs;
+      const fn = () => runWithTimeout(() => node.execute(input, { signal }), effectiveTimeout);
+      try {
+        const output = node.retry && Number.isFinite(node.retry.max) && node.retry.max > 0
+          ? await retryWithBackoff(fn, node.retry)
+          : await fn();
+        const durationMs = performance.now() - t0;
+        state.nodes[id] = { status: 'success', output, attempts: state.nodes[id].attempts, durationMs };
+        saveState(state, dir);
+        return { id, ok: true };
+      } catch (err) {
+        // An abort surfaced through execute() is reported as aborted=true
+        // so the level loop below knows to short-circuit; the node's
+        // 'running' record is demoted to pending by buildAbortReturn.
+        if (signal?.aborted || err?.code === 'ABORT') {
+          return { id, aborted: true };
+        }
+        const msg = err instanceof Error ? err.message : String(err);
+        const durationMs = performance.now() - t0;
+        state.nodes[id] = { status: 'failed', error: msg, attempts: state.nodes[id].attempts, durationMs };
+        saveState(state, dir);
+        return { id, ok: false, error: msg };
+      }
+      // NOTE(review): a task that rejects outside the try above (e.g. a
+      // saveState error) is reported below with id 'unknown'. Presumably
+      // settleWithConcurrency returns results in input order, in which case
+      // levelIds[i] could be used instead — confirm against executor.mjs.
+    }, opts.concurrency)).map(s => s.status === 'fulfilled' ? s.value : { id: 'unknown', ok: false, error: String(s.reason) });
+    let firstFailure = null;
+    let firstAbort = null;
+    for (const r of settled) {
+      if (r.aborted) { if (!firstAbort) firstAbort = r; continue; }
+      if (r.ok && !r.skipped) executedNodes.push(r.id);
+      if (!r.ok && !firstFailure) firstFailure = r;
+    }
+    // Abort takes precedence over failure when both happened in one level.
+    if (firstAbort || signal?.aborted) {
+      // If a node aborted from inside execute(), failedAt = that node.
+      // If the signal flipped after this level finished cleanly, the
+      // next level was the one that won't run — point failedAt there.
+      return buildAbortReturn(firstAbort?.id ?? nextLevelFirstId());
+    }
+    if (firstFailure) {
+      return { success: false, state, failedAt: firstFailure.id, error: firstFailure.error, executedNodes };
+    }
+  }
+  return { success: true, state, executedNodes };
+}

package/workflow/summary.mjs ADDED Viewed

@@ -0,0 +1,318 @@
+// Pure transformations over persisted workflow state.
+// Lifted out of the CLI so both `lazyclaw inspect` and the daemon's
+// /workflows endpoint can produce the same shape — a single source
+// of truth for what "workflow progress" looks like over the wire.
+//
+// We intentionally re-implement state-file reading here (a 3-line
+// function) instead of importing from `persistent.mjs`. The daemon's
+// import graph stays free of the workflow engine — under tsx/CJS
+// conversion in @playwright/test, importing engine modules from the
+// daemon's static graph has historically broken.
+import fs from 'node:fs';
+import path from 'node:path';
+/**
+ * Load a persisted state file. Returns null when the file does not
+ * exist (a session that has never been written). Throws on JSON
+ * parse errors so callers can surface the corruption rather than
+ * masking it as "no state".
+ *
+ * The file is read directly instead of being probed with
+ * `fs.existsSync` first, so a file removed between the probe and the
+ * read cannot turn "no state" into an unexpected ENOENT throw.
+ *
+ * @param {string} sessionId
+ * @param {string} dir
+ * @returns {object | null}
+ * @throws {SyntaxError} When the file exists but is not valid JSON.
+ * @throws {Error} Any read error other than "not found" is propagated.
+ */
+export function loadStateFile(sessionId, dir) {
+  let raw;
+  try {
+    raw = fs.readFileSync(path.join(dir, `${sessionId}.json`), 'utf8');
+  } catch (err) {
+    // Missing file, or a parent that is missing / not a directory.
+    if (err?.code === 'ENOENT' || err?.code === 'ENOTDIR') return null;
+    throw err;
+  }
+  return JSON.parse(raw);
+}
+/**
+ * @typedef {{ status?: 'pending'|'running'|'success'|'failed', output?: unknown, attempts?: number, error?: string, durationMs?: number }} NodeState
+ * @typedef {{ sessionId: string, order?: string[], nodes: Record<string, NodeState>, startedAt?: number, updatedAt?: number }} PersistedState
+ * @typedef {{ total: number, pending: number, running: number, success: number, failed: number, done: boolean, resumable: boolean, durationMs: number }} StateSummary
+ */
+/**
+ * Reduce a persisted state object to its summary block + the list of
+ * failed nodes. The summary is the same regardless of whether you're
+ * looking at a single session or one element of a listing.
+ *
+ * A node with no `status` counts as 'pending'. A node whose status is not
+ * one of the four known values still counts towards `total` but towards
+ * no bucket, so the state is never reported as `done`.
+ *
+ * @param {PersistedState} state  May be null/undefined or lack `nodes`;
+ *        that is treated as a state with zero nodes.
+ * @returns {{ summary: StateSummary, failedNodes: Array<{ id: string, error?: string, attempts?: number }> }}
+ */
+export function summarizeState(state) {
+  const counts = { pending: 0, running: 0, success: 0, failed: 0 };
+  const failedNodes = [];
+  let totalDurationMs = 0;
+  const nodes = state?.nodes || {};
+  const ids = Object.keys(nodes);
+  for (const id of ids) {
+    const n = nodes[id];
+    const status = n?.status || 'pending';
+    // Own-property check: `status` comes from a file on disk, and a value
+    // such as "toString" would otherwise match an inherited member and add
+    // a bogus NaN key to the summary.
+    if (Object.prototype.hasOwnProperty.call(counts, status)) counts[status]++;
+    if (status === 'failed') failedNodes.push({ id, error: n.error, attempts: n.attempts });
+    // Finite numbers only, matching aggregateNodeStats.
+    if (Number.isFinite(n?.durationMs)) totalDurationMs += n.durationMs;
+  }
+  const total = ids.length;
+  const allDone = total > 0 && counts.success === total;
+  const hasFailure = counts.failed > 0;
+  return {
+    summary: {
+      total,
+      ...counts,
+      done: allDone,
+      // "Resumable" = not everything succeeded AND no terminal failure.
+      // Running/pending nodes from a prior interrupted run will be
+      // demoted by the engine on next load — they count as resumable work.
+      resumable: !allDone && !hasFailure,
+      durationMs: totalDurationMs,
+    },
+    failedNodes,
+  };
+}
+/**
+ * Aggregate per-node statistics across every persisted session in
+ * a state directory. For each node id seen across sessions, compute
+ * how often it appeared, how often it was in each status, and the
+ * min/max/avg/percentile/total durations.
+ *
+ * Useful for cross-run analysis: "which node tends to be slow or
+ * fail across all my runs of this workflow?" — a question
+ * single-session inspect can't answer.
+ *
+ * Files that fail to parse, or that lack `sessionId` / an object-valued
+ * `nodes`, are skipped. Node ids and session ids come from disk, so the
+ * accumulator is a Map (an id such as "__proto__" or "toString" must not
+ * resolve to an inherited member) and session ids are coerced to strings
+ * before filtering.
+ *
+ * Percentiles use the nearest-rank method; avg, percentiles and total are
+ * rounded to two decimals, min and max are reported as stored.
+ *
+ * @param {string} dir
+ * @param {{ filter?: string }} [opts]   Optional case-insensitive
+ *        sessionId substring filter — only matching sessions
+ *        contribute to the aggregate. Same semantic as v3.36's
+ *        list-mode `--filter`.
+ * @returns {{ sessionCount: number, nodeStats: Record<string, {
+ *   count: number,
+ *   successCount: number,
+ *   failedCount: number,
+ *   pendingCount: number,
+ *   runningCount: number,
+ *   minDurationMs: number,
+ *   maxDurationMs: number,
+ *   avgDurationMs: number,
+ *   p50DurationMs: number,
+ *   p95DurationMs: number,
+ *   p99DurationMs: number,
+ *   totalDurationMs: number,
+ * }> }}
+ * @throws {Error} With `code === 'ENOENT'` when `dir` does not exist.
+ */
+export function aggregateNodeStats(dir, opts = {}) {
+  if (!fs.existsSync(dir)) {
+    const e = new Error(`State directory ${dir} does not exist`);
+    /** @type {any} */ (e).code = 'ENOENT';
+    throw e;
+  }
+  const files = fs.readdirSync(dir).filter(f => f.endsWith('.json'));
+  const filterLower = opts.filter ? String(opts.filter).toLowerCase() : null;
+  let sessionCount = 0;
+  /** @type {Map<string, { count: number, successCount: number, failedCount: number, pendingCount: number, runningCount: number, durations: number[] }>} */
+  const accumulator = new Map();
+  for (const f of files) {
+    let state;
+    try {
+      state = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
+    } catch { continue; }
+    if (!state?.sessionId || !state?.nodes || typeof state.nodes !== 'object') continue;
+    // String() keeps a stray JSON file with a non-string sessionId from
+    // crashing the whole aggregate.
+    if (filterLower && !String(state.sessionId).toLowerCase().includes(filterLower)) continue;
+    sessionCount++;
+    for (const [id, ns] of Object.entries(state.nodes)) {
+      const status = ns?.status || 'pending';
+      let slot = accumulator.get(id);
+      if (!slot) {
+        slot = {
+          count: 0, successCount: 0, failedCount: 0, pendingCount: 0, runningCount: 0,
+          durations: [],
+        };
+        accumulator.set(id, slot);
+      }
+      slot.count++;
+      if (status === 'success')      slot.successCount++;
+      else if (status === 'failed')  slot.failedCount++;
+      else if (status === 'pending') slot.pendingCount++;
+      else if (status === 'running') slot.runningCount++;
+      if (Number.isFinite(ns?.durationMs)) slot.durations.push(ns.durationMs);
+    }
+  }
+  // Nearest-rank percentile: ceil(p * n) gives the 1-indexed
+  // position; subtract 1 for 0-indexed array access. Standard
+  // definition (cf. Wikipedia "Percentile / Nearest-rank method").
+  // Empty array → 0 by convention.
+  const percentile = (sorted, p) => {
+    if (sorted.length === 0) return 0;
+    const idx = Math.max(0, Math.ceil(p * sorted.length) - 1);
+    return sorted[Math.min(idx, sorted.length - 1)];
+  };
+  const r2 = (n) => Math.round(n * 100) / 100;
+  // Object.fromEntries defines own data properties, so even an id of
+  // "__proto__" lands in the result as an ordinary key.
+  const nodeStats = Object.fromEntries([...accumulator].map(([id, slot]) => {
+    const sorted = [...slot.durations].sort((a, b) => a - b);
+    const total = slot.durations.reduce((s, x) => s + x, 0);
+    return [id, {
+      count: slot.count,
+      successCount: slot.successCount,
+      failedCount: slot.failedCount,
+      pendingCount: slot.pendingCount,
+      runningCount: slot.runningCount,
+      minDurationMs: sorted.length ? sorted[0] : 0,
+      maxDurationMs: sorted.length ? sorted[sorted.length - 1] : 0,
+      avgDurationMs: sorted.length ? r2(total / sorted.length) : 0,
+      p50DurationMs: r2(percentile(sorted, 0.5)),
+      p95DurationMs: r2(percentile(sorted, 0.95)),
+      p99DurationMs: r2(percentile(sorted, 0.99)),
+      totalDurationMs: r2(total),
+    }];
+  }));
+  return { sessionCount, nodeStats };
+}
+/**
+ * Compute the critical path (longest weighted path) through a DAG.
+ *
+ * Given the workflow shape (ids + deps, supplied by the caller since the
+ * engine persists outputs and statuses but not deps) and the persisted
+ * per-node state, this walks the DAG in topological order and finds the
+ * chain of nodes whose summed `durationMs` is the largest.
+ *
+ * Algorithm — DP over a topological order (Kahn's algorithm):
+ *   for each node in topo order:
+ *     bestPredecessor = arg max over deps (bestFinish[dep])
+ *     bestFinish[node] = (bestFinish[bestPredecessor] || 0) + duration[node]
+ *     prev[node] = bestPredecessor
+ * then walk `prev` backwards from the node with the max bestFinish.
+ * Ties on weight are broken in favour of the longer chain, so a fresh
+ * state (all durations 0) yields the deepest dependency chain.
+ *
+ * Edge cases:
+ *   - A dependency listed more than once counts once. (Counting it per
+ *     occurrence would leave the node's indegree above zero forever and
+ *     drop it from the result as if it were part of a cycle.)
+ *   - Deps that name no node in `graphNodes` are ignored.
+ *   - Nodes on a cycle never reach indegree zero and are left out; the
+ *     result covers the acyclic portion only.
+ *
+ * @param {{ id: string, deps?: string[] }[]} graphNodes  Workflow shape (deps = id[])
+ * @param {Record<string, { durationMs?: number, status?: string }>} stateNodes  Persisted state (durationMs)
+ * @returns {{ path: string[], totalMs: number, perNodeMs: Record<string, number> }}
+ *          - path: ordered list of node ids on the critical path
+ *          - totalMs: sum of durationMs across the path
+ *          - perNodeMs: durationMs lookup for every node (0 if missing)
+ */
+export function criticalPath(graphNodes, stateNodes) {
+  const ids = graphNodes.map(n => n.id);
+  const uniqueDeps = (n) => [...new Set(n.deps || [])];
+  const idToDeps = new Map(graphNodes.map(n => [n.id, uniqueDeps(n)]));
+  // Indegree plus a dependents index, built in one pass so the topological
+  // sort below does not rescan every node for each dequeued id.
+  const indegree = new Map(ids.map(id => [id, 0]));
+  const dependents = new Map(ids.map(id => [id, []]));
+  for (const n of graphNodes) {
+    for (const d of uniqueDeps(n)) {
+      if (!indegree.has(d)) continue;
+      indegree.set(n.id, indegree.get(n.id) + 1);
+      dependents.get(d).push(n.id);
+    }
+  }
+  // Kahn's algorithm. `topo` doubles as the work queue; `head` marks the
+  // next unprocessed entry, so nothing is shifted off the front.
+  const topo = ids.filter(id => indegree.get(id) === 0);
+  for (let head = 0; head < topo.length; head++) {
+    for (const m of dependents.get(topo[head])) {
+      const next = indegree.get(m) - 1;
+      indegree.set(m, next);
+      if (next === 0) topo.push(m);
+    }
+  }
+  // If there's a cycle, topo will be shorter than ids.length. Rather
+  // than crash, we walk what we got. Caller can `validate` up front if
+  // they want strict.
+  const durationOf = (id) => {
+    const ns = stateNodes?.[id];
+    return (ns && Number.isFinite(ns.durationMs)) ? ns.durationMs : 0;
+  };
+  const perNodeMs = Object.fromEntries(ids.map(id => [id, durationOf(id)]));
+  const bestFinish = new Map();
+  const chainLen = new Map();   // path length (node count) ending at this id
+  const prev = new Map();
+  let bestEnd = null;
+  let bestEndFinish = -Infinity;
+  let bestEndChainLen = 0;
+  for (const id of topo) {
+    let bestPred = null;
+    let bestPredFinish = 0;
+    let bestPredChainLen = 0;
+    for (const d of idToDeps.get(id) || []) {
+      const f = bestFinish.get(d);
+      if (typeof f !== 'number') continue;   // unknown dep
+      const cl = chainLen.get(d) || 0;
+      // Tie-break: weight first, then chain length.
+      if (f > bestPredFinish || (f === bestPredFinish && cl > bestPredChainLen)) {
+        bestPredFinish = f;
+        bestPredChainLen = cl;
+        bestPred = d;
+      }
+    }
+    const finish = bestPredFinish + durationOf(id);
+    const length = bestPredChainLen + 1;
+    bestFinish.set(id, finish);
+    chainLen.set(id, length);
+    prev.set(id, bestPred);
+    if (finish > bestEndFinish || (finish === bestEndFinish && length > bestEndChainLen)) {
+      bestEndFinish = finish;
+      bestEndChainLen = length;
+      bestEnd = id;
+    }
+  }
+  // Recover the path by walking prev backwards, then flip it. (Named
+  // `chain` so it does not shadow the imported `path` module.)
+  const chain = [];
+  for (let cur = bestEnd; cur != null; cur = prev.get(cur)) chain.push(cur);
+  chain.reverse();
+  return {
+    path: chain,
+    totalMs: Math.max(0, bestEndFinish),
+    perNodeMs,
+  };
+}
+/**
+ * Read every state file in `dir` and return a sorted listing.
+ * Newest activity first (by `updatedAt`); secondary sort by sessionId
+ * for deterministic ordering on ties. Sessions without a finite
+ * `updatedAt` sort as if it were 0, i.e. after every timestamped session.
+ *
+ * Stray non-JSON files and corrupt state are silently skipped — a
+ * left-over `.tmp` from a crashed write doesn't break the listing.
+ * Throws if `dir` does not exist; the caller decides whether that's
+ * an error (CLI exit 2) or empty result (auto-create on first run).
+ *
+ * @param {string} dir
+ * @returns {Array<{ sessionId: string, summary: StateSummary, failedNodes: Array<{ id: string, error?: string, attempts?: number }>, startedAt?: number, updatedAt?: number }>}
+ * @throws {Error} With `code === 'ENOENT'` when `dir` does not exist.
+ */
+export function listSessions(dir) {
+  if (!fs.existsSync(dir)) {
+    const e = new Error(`State directory ${dir} does not exist`);
+    /** @type {any} */ (e).code = 'ENOENT';
+    throw e;
+  }
+  const files = fs.readdirSync(dir).filter(f => f.endsWith('.json'));
+  const sessions = [];
+  for (const f of files) {
+    try {
+      const raw = fs.readFileSync(path.join(dir, f), 'utf8');
+      const state = JSON.parse(raw);
+      if (!state?.sessionId) continue;
+      const { summary, failedNodes } = summarizeState(state);
+      sessions.push({
+        sessionId: state.sessionId,
+        summary,
+        failedNodes,
+        startedAt: state.startedAt,
+        updatedAt: state.updatedAt,
+      });
+    } catch {
+      // Skip non-state JSON / corrupt files — see saveState's atomic
+      // tmp+rename for the normal write path.
+    }
+  }
+  // `updatedAt` is optional in the state shape. Subtracting a missing value
+  // gives NaN, which makes the comparator inconsistent, so fall back to 0.
+  // Session ids are coerced because they come from arbitrary JSON on disk
+  // and `localeCompare` exists only on strings.
+  const recency = (s) => (Number.isFinite(s.updatedAt) ? s.updatedAt : 0);
+  sessions.sort((a, b) =>
+    (recency(b) - recency(a)) || String(a.sessionId).localeCompare(String(b.sessionId)));
+  return sessions;
+}