npm - bosun - Versions diffs - 0.35.2 → 0.35.3 - Mend

bosun 0.35.2 → 0.35.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/README.md +14 -1
package/agent-hooks.mjs +7 -1
package/agent-pool.mjs +16 -0
package/agent-prompts.mjs +190 -4
package/agent-sdk.mjs +6 -1
package/agent-work-analyzer.mjs +48 -9
package/autofix.mjs +32 -18
package/bosun.schema.json +1 -1
package/kanban-adapter.mjs +62 -12
package/monitor.mjs +25 -6
package/opencode-shell.mjs +881 -0
package/package.json +5 -2
package/primary-agent.mjs +43 -0
package/setup.mjs +33 -4
package/task-executor.mjs +43 -14
package/ui/app.js +10 -7
package/ui/components/chat-view.js +31 -9
package/ui/components/session-list.js +20 -4
package/ui/modules/router.js +2 -0
package/ui/tabs/agents.js +66 -8
package/ui-server.mjs +142 -5
package/workflow-engine.mjs +664 -10
package/workflow-nodes.mjs +250 -1
package/workflow-templates/github.mjs +389 -71
package/workflow-templates/planning.mjs +31 -11
package/workflow-templates.mjs +3 -0

package/workflow-engine.mjs CHANGED Viewed

@@ -67,6 +67,24 @@ const DEFAULT_RUN_STUCK_THRESHOLD_MS = readBoundedEnvInt(
   { min: 10000, max: 7_200_000 },
 );
+// ── Auto-Retry Defaults ─────────────────────────────────────────────────────
+const DEFAULT_AUTO_RETRY_MAX_ATTEMPTS = readBoundedEnvInt(
+  "WORKFLOW_AUTO_RETRY_MAX_ATTEMPTS",
+  3, // used by _resolveAutoRetryConfig when a workflow sets no maxAttempts override
+  { min: 0, max: 10 }, // NOTE(review): presumably clamping bounds; a resolved 0 makes _resolveAutoRetryConfig report enabled=false
+);
+const DEFAULT_AUTO_RETRY_COOLDOWN_MS = readBoundedEnvInt(
+  "WORKFLOW_AUTO_RETRY_COOLDOWN_MS",
+  20 * 60 * 1000, // 20 minutes
+  { min: 0, max: 3_600_000 }, // upper bound is 1 hour, expressed in milliseconds
+);
+const CHECKPOINT_DEBOUNCE_MS = readBoundedEnvInt(
+  "WORKFLOW_CHECKPOINT_DEBOUNCE_MS",
+  500, // milliseconds; passed as the setTimeout delay in _checkpointRun
+  { min: 50, max: 10000 }, // 50 ms to 10 s
+);
+const ACTIVE_RUNS_INDEX = "_active-runs.json"; // file name only; resolved against this.runsDir by the index helpers
 function resolveNodeTimeoutMs(node, resolvedConfig) {
   const candidates = [
     resolvedConfig?.timeout,
@@ -333,6 +351,8 @@ export class WorkflowEngine extends EventEmitter {
     this._activeRuns = new Map();
     this._triggerSubscriptions = new Map();
     this._loaded = false;
+    this._checkpointTimers = new Map(); // runId → debounce timer
+    this._resumingRuns = false;
   }
   // ── Lifecycle ───────────────────────────────────────────────────────────
@@ -359,6 +379,11 @@ export class WorkflowEngine extends EventEmitter {
     }
     this._loaded = true;
     this.emit("loaded", { count: this._workflows.size });
+    // Detect runs that were interrupted by a previous shutdown.
+    // These are runs persisted to disk with status=RUNNING that are
+    // NOT in our in-memory _activeRuns (because we just booted).
+    this._detectInterruptedRuns();
   }
   /** Ensure storage directories exist */
@@ -466,6 +491,10 @@ export class WorkflowEngine extends EventEmitter {
       startedAt: ctx.startedAt,
       status: WorkflowStatus.RUNNING,
     });
+    // ── Persist run immediately so it survives process restarts ──────
+    this._persistActiveRunState(runId, workflowId, def.name, ctx);
     this.emit("run:start", { runId, workflowId, name: def.name });
     try {
@@ -490,12 +519,261 @@ export class WorkflowEngine extends EventEmitter {
       this.emit("run:error", { runId, workflowId, error: err.message });
     }
-    // Persist run log
+    // Persist final run log and remove from active-runs index
     this._persistRun(runId, workflowId, ctx);
+    this._clearActiveRunState(runId);
     this._activeRuns.delete(runId);
+    // ── Auto-retry on failure ───────────────────────────────────────────
+    // If the workflow failed and auto-retry is enabled, kick off the
+    // escalating retry strategy asynchronously. The caller still receives the
+    // original (failed) context immediately so we never block the event loop.
+    const finalStatus = ctx.errors.length > 0 ? WorkflowStatus.FAILED : WorkflowStatus.COMPLETED;
+    if (finalStatus === WorkflowStatus.FAILED && !opts._isRetry) {
+      const retryConfig = this._resolveAutoRetryConfig(def);
+      if (retryConfig.enabled) {
+        // Fire-and-forget — errors are logged, never thrown.
+        this._autoRetryLoop(runId, workflowId, inputData, retryConfig, opts).catch((err) => {
+          console.error(`${TAG} Auto-retry loop error for run ${runId}:`, err.message);
+        });
+      }
+    }
     return ctx;
   }
+  // ── Run Retry ───────────────────────────────────────────────────────────
+  /**
+   * Retry a previously completed (failed) run.
+   *
+   * The original run is looked up with getRunDetail(). Its persisted
+   * `detail.data`, minus the internal `_workflowId` / `_workflowName` keys,
+   * is reused as the input data of the retry. A "run:retry" event is emitted
+   * before either mode starts.
+   *
+   * @param {string} runId - The original run ID to retry.
+   * @param {object} [retryOpts]
+   * @param {"from_failed"|"from_scratch"} [retryOpts.mode="from_failed"]
+   *   - `"from_failed"` — re-execute starting from the first failed node,
+   *     pre-populating the context with already-completed node outputs.
+   *     This path builds its own WorkflowContext and calls _executeDag
+   *     directly; it does not go through execute().
+   *   - `"from_scratch"` — re-execute the entire workflow from the beginning
+   *     with the same input data that was used originally (delegates to
+   *     execute() with `force: true`).
+   *   Any value other than `"from_scratch"` selects `"from_failed"`.
+   * @param {number} [retryOpts._attempt] - Attempt number reported in the
+   *   "run:retry" event; 1 is reported when it is missing or falsy.
+   * @returns {Promise<{retryRunId: string, mode: string, originalRunId: string, ctx: WorkflowContext}>}
+   *   In `"from_failed"` mode, errors raised while executing the DAG are
+   *   recorded on the returned `ctx` and emitted as "run:error" instead of
+   *   being thrown.
+   * @throws {Error} When the run cannot be found, its workflowId cannot be
+   *   determined, or the workflow definition no longer exists.
+   */
+  async retryRun(runId, retryOpts = {}) {
+    const mode = retryOpts.mode === "from_scratch" ? "from_scratch" : "from_failed";
+    const originalRun = this.getRunDetail(runId);
+    if (!originalRun) {
+      throw new Error(`${TAG} Run "${runId}" not found — cannot retry`);
+    }
+    const workflowId = originalRun.workflowId || originalRun.detail?.data?._workflowId;
+    if (!workflowId) {
+      throw new Error(`${TAG} Cannot determine workflowId from run "${runId}"`);
+    }
+    const def = this.get(workflowId);
+    if (!def) {
+      throw new Error(`${TAG} Workflow "${workflowId}" no longer exists — cannot retry`);
+    }
+    // Recover original input data (strip internal enrichment keys).
+    // The spread makes a shallow copy, so the deletes below do not touch
+    // the object returned by getRunDetail().
+    const originalData = { ...(originalRun.detail?.data || {}) };
+    delete originalData._workflowId;
+    delete originalData._workflowName;
+    this.emit("run:retry", {
+      originalRunId: runId,
+      workflowId,
+      mode,
+      attempt: retryOpts._attempt || 1,
+    });
+    if (mode === "from_scratch") {
+      const ctx = await this.execute(workflowId, originalData, {
+        ...retryOpts,
+        _isRetry: true, // execute() only starts its auto-retry loop when this flag is not set
+        _originalRunId: runId,
+        force: true,
+      });
+      return { retryRunId: ctx.id, mode, originalRunId: runId, ctx };
+    }
+    // ── "from_failed" — resume from the first failed node ────────────
+    const detail = originalRun.detail || {};
+    const nodeStatuses = detail.nodeStatuses || {};
+    const nodeOutputs = detail.nodeOutputs || {};
+    // Build a fresh context but pre-seed completed node outputs.
+    const ctx = new WorkflowContext({
+      ...def.variables,
+      ...originalData,
+      _workflowId: workflowId,
+      _workflowName: def.name,
+      _retryOf: runId,
+    });
+    ctx.variables = { ...def.variables };
+    // Pre-populate nodes that already succeeded.
+    for (const [nodeId, status] of Object.entries(nodeStatuses)) {
+      if (status === NodeStatus.COMPLETED) {
+        ctx.setNodeStatus(nodeId, NodeStatus.COMPLETED);
+        if (nodeOutputs[nodeId] !== undefined) {
+          ctx.setNodeOutput(nodeId, nodeOutputs[nodeId]);
+        }
+      }
+      // Nodes with any other status (failed, skipped, ...) are deliberately
+      // not seeded; no reset code is needed for the DAG to re-run them.
+    }
+    const retryRunId = ctx.id;
+    this._activeRuns.set(retryRunId, {
+      workflowId,
+      workflowName: def.name,
+      ctx,
+      startedAt: ctx.startedAt,
+      status: WorkflowStatus.RUNNING,
+    });
+    this._persistActiveRunState(retryRunId, workflowId, def.name, ctx);
+    this.emit("run:start", { runId: retryRunId, workflowId, name: def.name, retryOf: runId, mode });
+    try {
+      const adjacency = this._buildAdjacency(def);
+      const entryNodes = this._findEntryNodes(def);
+      if (entryNodes.length === 0) {
+        throw new Error("Workflow has no entry nodes (no triggers or unconnected nodes)");
+      }
+      // _executeDag naturally skips nodes that are already COMPLETED because
+      // they were pre-seeded above, so it resumes from the failed point.
+      await this._executeDag(def, entryNodes, adjacency, ctx, { ...retryOpts, _isRetry: true });
+      const status = ctx.errors.length > 0 ? WorkflowStatus.FAILED : WorkflowStatus.COMPLETED;
+      this._activeRuns.get(retryRunId).status = status;
+      this.emit("run:end", {
+        runId: retryRunId,
+        workflowId,
+        status,
+        duration: Date.now() - ctx.startedAt,
+        retryOf: runId,
+        mode,
+      });
+    } catch (err) {
+      ctx.error("_engine", err);
+      this._activeRuns.get(retryRunId).status = WorkflowStatus.FAILED;
+      this.emit("run:error", { runId: retryRunId, workflowId, error: err.message, retryOf: runId });
+    }
+    this._persistRun(retryRunId, workflowId, ctx);
+    this._clearActiveRunState(retryRunId);
+    this._activeRuns.delete(retryRunId);
+    return { retryRunId, mode, originalRunId: runId, ctx };
+  }
+  // ── Auto-retry escalating strategy ───────────────────────────────────
+  /**
+   * Resolve the auto-retry configuration for a workflow definition.
+   * Supports per-workflow overrides via `def.autoRetry`.
+   *
+   * Auto-retry is opt-in: it is only reported as enabled when
+   * `def.autoRetry.enabled` is truthy AND the resolved attempt count is > 0.
+   *
+   * @param {object} [def] - Workflow definition; only `def.autoRetry` is read.
+   * @returns {{enabled: boolean, maxAttempts: number, cooldownMs: number}}
+   *   `maxAttempts` and `cooldownMs` are non-negative integers.
+   */
+  _resolveAutoRetryConfig(def) {
+    const raw = def?.autoRetry || {};
+    // Only real numbers and non-blank numeric strings count as overrides.
+    // Number(null), Number("") and Number([]) all evaluate to 0, so coercing
+    // blindly would turn an unset/blank field into "0 attempts" and silently
+    // disable retries instead of falling back to the default.
+    const readOverride = (value, fallback) => {
+      if (typeof value !== "number" && typeof value !== "string") return fallback;
+      if (typeof value === "string" && value.trim() === "") return fallback;
+      const parsed = Number(value);
+      return Number.isFinite(parsed) ? Math.max(0, Math.trunc(parsed)) : fallback;
+    };
+    // Workflows must explicitly set autoRetry.enabled = true; this prevents
+    // unexpected background retries for workflows that don't want them.
+    const enabled = Boolean(raw.enabled);
+    const maxAttempts = readOverride(raw.maxAttempts, DEFAULT_AUTO_RETRY_MAX_ATTEMPTS);
+    const cooldownMs = readOverride(raw.cooldownMs, DEFAULT_AUTO_RETRY_COOLDOWN_MS);
+    return { enabled: enabled && maxAttempts > 0, maxAttempts, cooldownMs };
+  }
+  /**
+   * Escalating auto-retry loop.
+   *
+   * Strategy (configurable, defaults to 3 attempts):
+   *   Attempt 1 → from_failed (immediate)
+   *   Attempt 2 → from_scratch (immediate)
+   *   Attempt 3 and later → from_scratch (each one after the cooldown
+   *   period, default 20 min, when cooldownMs > 0)
+   *
+   * Every attempt retries the ORIGINAL run id, not the previous retry.
+   * If the workflow succeeds at any point the loop stops.
+   * An exception thrown by retryRun() is caught, reported as a
+   * "run:retry:failed" event, and counted as a failed attempt.
+   * Results are persisted as separate runs linked via `_retryOf`.
+   * NOTE(review): only the from_failed path of retryRun visibly sets
+   * `_retryOf`; from_scratch passes `_originalRunId` in opts — confirm how
+   * execute() records the link.
+   *
+   * Events emitted: "run:retry:cooldown", "run:retry:success",
+   * "run:retry:failed", "run:retry:exhausted".
+   *
+   * @param {string} originalRunId - Run whose failure triggered the loop.
+   * @param {string} workflowId - Only used in log/event payloads here.
+   * @param {object} inputData - Accepted but not read by this method.
+   * @param {{maxAttempts: number, cooldownMs: number}} retryConfig
+   * @param {object} baseOpts - Accepted but not read by this method.
+   * @returns {Promise<void>}
+   */
+  async _autoRetryLoop(originalRunId, workflowId, inputData, retryConfig, baseOpts) {
+    const { maxAttempts, cooldownMs } = retryConfig;
+    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+      const mode = attempt === 1 ? "from_failed" : "from_scratch"; // only the first attempt resumes in place
+      const needsCooldown = attempt >= 3 && cooldownMs > 0; // applies to every attempt from the third onward
+      if (needsCooldown) {
+        console.log(
+          `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} for run ${originalRunId} ` +
+          `— cooling down for ${Math.round(cooldownMs / 1000)}s before retry`,
+        );
+        this.emit("run:retry:cooldown", {
+          originalRunId,
+          workflowId,
+          attempt,
+          cooldownMs,
+        });
+        await new Promise((r) => setTimeout(r, cooldownMs));
+      }
+      console.log(
+        `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} for run ${originalRunId} (mode=${mode})`,
+      );
+      try {
+        const { ctx, retryRunId } = await this.retryRun(originalRunId, {
+          mode,
+          _isRetry: true,
+          _attempt: attempt,
+        });
+        if (!ctx.errors || ctx.errors.length === 0) {
+          console.log(
+            `${TAG} Auto-retry succeeded on attempt ${attempt}/${maxAttempts} ` +
+            `for run ${originalRunId} → new run ${retryRunId}`,
+          );
+          this.emit("run:retry:success", {
+            originalRunId,
+            retryRunId,
+            workflowId,
+            attempt,
+          });
+          return; // Success — stop retrying
+        }
+        console.warn(
+          `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} failed ` +
+          `for run ${originalRunId} → new run ${retryRunId}`,
+        );
+        this.emit("run:retry:failed", {
+          originalRunId,
+          retryRunId,
+          workflowId,
+          attempt,
+          errors: ctx.errors,
+        });
+      } catch (err) {
+        console.error(
+          `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} threw for run ${originalRunId}:`,
+          err.message,
+        );
+        this.emit("run:retry:failed", {
+          originalRunId,
+          workflowId,
+          attempt,
+          errors: [{ error: err.message }],
+        });
+      }
+    }
+    console.error(
+      `${TAG} All ${maxAttempts} auto-retry attempts exhausted for run ${originalRunId}`,
+    );
+    this.emit("run:retry:exhausted", { originalRunId, workflowId, maxAttempts });
+  }
   /**
    * Evaluate trigger conditions to see if a workflow should fire.
    * Called by the supervisor loop or event bus.
@@ -665,6 +943,16 @@ export class WorkflowEngine extends EventEmitter {
     const queue = [...entryNodes.map((n) => n.id)];
     const nodeMap = new Map((def.nodes || []).map((n) => [n.id, n]));
+    // ── Resume support (retry from_failed) ──────────────────────────────
+    // If nodes are already marked COMPLETED in the context (pre-seeded by
+    // retryRun), treat them as already executed so the DAG skips them and
+    // begins from the first un-completed node.
+    for (const [nodeId, status] of ctx.nodeStatuses) {
+      if (status === NodeStatus.COMPLETED) {
+        executed.add(nodeId);
+      }
+    }
     // Track in-degree for proper scheduling
     const inDegree = new Map();
     for (const node of def.nodes || []) {
@@ -674,8 +962,32 @@ export class WorkflowEngine extends EventEmitter {
       inDegree.set(edge.target, (inDegree.get(edge.target) || 0) + 1);
     }
-    // Ready set = nodes with all dependencies met
-    const ready = new Set(queue);
+    // ── Adjust in-degree for pre-completed nodes (retry resume) ─────────
+    // When resuming from a failed step, pre-completed source nodes have
+    // already satisfied their downstream edges. Decrement the in-degree for
+    // each target so successors become ready once all live deps are met.
+    for (const nodeId of executed) {
+      const edges = adjacency.get(nodeId) || [];
+      for (const edge of edges) {
+        const deg = (inDegree.get(edge.target) || 1) - 1;
+        inDegree.set(edge.target, Math.max(0, deg));
+      }
+    }
+    // Ready set = entry nodes (or nodes with no remaining unsatisfied deps)
+    const ready = new Set();
+    for (const nid of queue) {
+      if (!executed.has(nid)) {
+        ready.add(nid);
+      }
+    }
+    // Also add any non-entry nodes whose in-degree is now 0 due to pre-
+    // completed predecessors (this makes "from_failed" resume work).
+    for (const [nid, deg] of inDegree) {
+      if (deg <= 0 && !executed.has(nid) && !ready.has(nid)) {
+        ready.add(nid);
+      }
+    }
     while (ready.size > 0) {
       // Execute ready nodes in bounded parallel batches.
@@ -729,10 +1041,16 @@ export class WorkflowEngine extends EventEmitter {
               ctx.setNodeStatus(nodeId, NodeStatus.COMPLETED);
               executed.add(nodeId);
               this.emit("node:complete", { nodeId, type: node.type });
+              // Checkpoint progress to disk (debounced) so the run can
+              // be resumed from here if the process is interrupted.
+              this._checkpointRun(ctx);
               lastErr = null;
               return { nodeId, result };
             } catch (err) {
               lastErr = err;
+              if (err.retryable === false) break; // permanent error — skip remaining retry attempts
             }
           }
@@ -876,12 +1194,32 @@ export class WorkflowEngine extends EventEmitter {
     // Resolve config templates against context
     const resolvedConfig = this._resolveConfig(node.config || {}, ctx);
-    // Dry run — just validate
+    // Dry run — skip capability checks and handler execution.
+    // Services aren't needed for simulation; this keeps dry-run tests fast.
     if (opts.dryRun) {
       ctx.log(node.id, `[dry-run] Would execute ${node.type}`, "info");
       return { _dryRun: true, type: node.type, config: resolvedConfig };
     }
+    // ── Capability pre-flight check ──────────────────────────────────────
+    // Verify required services are present AFTER the dryRun early-return so
+    // dry-run tests work without needing real service dependencies wired up.
+    const requiredCapabilities = this._getNodeRequiredCapabilities(node.type);
+    const missingCapabilities = [];
+    for (const cap of requiredCapabilities) {
+      if (!this._hasCapability(cap)) {
+        missingCapabilities.push(cap);
+      }
+    }
+    if (missingCapabilities.length > 0) {
+      const detail = `Node "${node.label || node.id}" (${node.type}) requires capabilities: [${missingCapabilities.join(", ")}] which are not available. ` +
+        `Check that the required services (agent pool, kanban adapter, etc.) are configured and the agent has the necessary permissions.`;
+      ctx.log(node.id, detail, "error");
+      const capErr = new Error(detail);
+      capErr.retryable = false; // missing service is permanent — don't waste time retrying
+      throw capErr;
+    }
     // Execute with timeout — clear timer on completion to avoid resource leaks
     const timeout = resolveNodeTimeoutMs(node, resolvedConfig);
     let timer;
@@ -916,6 +1254,48 @@ export class WorkflowEngine extends EventEmitter {
     return resolved;
   }
+  // ── Capability helpers ──────────────────────────────────────────────────
+  // Map node-type prefixes / names to the engine.services keys they need.
+  // This lets _executeNode fail-fast with a clear message instead of letting
+  // the handler throw a cryptic "cannot read property X of undefined".
+  /**
+   * Look up which `services` keys a node type depends on.
+   *
+   * @param {string} nodeType - Node type identifier, e.g. "action.create_task".
+   * @returns {string[]} service keys the node type needs (may be empty);
+   *   a new array is returned on every call.
+   */
+  _getNodeRequiredCapabilities(nodeType) {
+    // Every node in the "agent." namespace talks to the agent pool.
+    if (nodeType.startsWith("agent.")) {
+      return ["agentPool"];
+    }
+    switch (nodeType) {
+      // Running, continuing or restarting an agent session
+      case "action.run_agent":
+      case "action.continue_session":
+      case "action.restart_agent":
+        return ["agentPool"];
+      // Task management and task-tag conditions read/write the kanban service
+      case "action.create_task":
+      case "action.update_task_status":
+      case "action.materialize_planner_tasks":
+      case "condition.task_has_tag":
+        return ["kanban"];
+      // Telegram notification
+      case "notify.telegram":
+        return ["telegram"];
+      // Everything else (file I/O, git, transforms, logs, etc.) needs no service
+      default:
+        return [];
+    }
+  }
+  /**
+   * Report whether a named capability (a key of `this.services`) is available.
+   *
+   * @param {string} cap - Service key to look up.
+   * @returns {boolean} true only when the service value is a non-null
+   *   object or a function.
+   */
+  _hasCapability(cap) {
+    const service = this.services?.[cap];
+    if (service === null || service === undefined) {
+      return false;
+    }
+    const kind = typeof service;
+    return kind === "object" || kind === "function";
+  }
   _evaluateCondition(condition, ctx, sourceNodeId) {
     // Simple expression evaluator — supports basic comparisons
     // Variables: $output (source node output), $data (context data), $status
@@ -1088,6 +1468,280 @@ export class WorkflowEngine extends EventEmitter {
     return normalized;
   }
+  // ── Active-runs persistence (crash recovery) ─────────────────────────
+  /**
+   * Load the active-runs index file (_active-runs.json) from the runs dir.
+   *
+   * @returns {Array} The parsed array of entries (written elsewhere as
+   *   { runId, workflowId, workflowName, startedAt }). An empty array is
+   *   returned when the file is missing, unreadable, not valid JSON, or
+   *   does not contain an array.
+   */
+  _readActiveRunsIndex() {
+    let parsed;
+    try {
+      const indexFile = resolve(this.runsDir, ACTIVE_RUNS_INDEX);
+      if (!existsSync(indexFile)) {
+        return [];
+      }
+      parsed = JSON.parse(readFileSync(indexFile, "utf8"));
+    } catch {
+      // Best-effort read: any failure is treated as "no active runs".
+      return [];
+    }
+    if (Array.isArray(parsed)) {
+      return parsed;
+    }
+    return [];
+  }
+  /**
+   * Write the active-runs index, replacing the whole file with `entries`.
+   *
+   * The file is overwritten in place with a single writeFileSync call;
+   * there is no temp-file + rename step, so the write is not atomic.
+   * _readActiveRunsIndex treats unparsable content as an empty index.
+   * Failures are logged and swallowed; this method does not throw.
+   *
+   * @param {Array} entries - Entries to serialize as pretty-printed JSON.
+   */
+  _writeActiveRunsIndex(entries) {
+    try {
+      this._ensureDirs();
+      const p = resolve(this.runsDir, ACTIVE_RUNS_INDEX);
+      writeFileSync(p, JSON.stringify(entries, null, 2), "utf8");
+    } catch (err) {
+      console.error(`${TAG} Failed to write active-runs index:`, err.message);
+    }
+  }
+  /**
+   * Record a freshly started run on disk before any node executes.
+   *
+   * Three writes happen, in this order:
+   *   1. the run is (re)registered in the active-runs index,
+   *   2. a detail file `<runId>.json` is written from the serialized context,
+   *   3. the run is upserted into the main index via _ensureRunInIndex.
+   * Any failure is logged and swallowed.
+   *
+   * @param {string} runId
+   * @param {string} workflowId
+   * @param {string} workflowName
+   * @param {WorkflowContext} ctx - Context to serialize; `ctx.startedAt` is
+   *   stored in the active-runs entry.
+   */
+  _persistActiveRunState(runId, workflowId, workflowName, ctx) {
+    try {
+      this._ensureDirs();
+      // Replace any stale entry for this run rather than duplicating it.
+      const otherRuns = this._readActiveRunsIndex().filter((entry) => entry.runId !== runId);
+      this._writeActiveRunsIndex([
+        ...otherRuns,
+        { runId, workflowId, workflowName, startedAt: ctx.startedAt },
+      ]);
+      // Initial snapshot of the context, so there is something to resume from.
+      const snapshot = this._serializeRunContext(ctx, true);
+      writeFileSync(
+        resolve(this.runsDir, `${runId}.json`),
+        JSON.stringify(snapshot, null, 2),
+        "utf8",
+      );
+      // Make the run visible in the main index (as RUNNING) before it completes.
+      this._ensureRunInIndex(runId, workflowId, workflowName, snapshot);
+    } catch (err) {
+      console.error(`${TAG} Failed to persist active run state:`, err.message);
+    }
+  }
+  /**
+   * Schedule a debounced write of the run context to `<runId>.json`.
+   *
+   * Each call cancels the run's pending timer (if any) and starts a new one
+   * of CHECKPOINT_DEBOUNCE_MS, so a burst of node completions produces a
+   * single write. Write failures are logged and swallowed.
+   *
+   * @param {WorkflowContext} ctx - Run context; `ctx.id` is the run ID.
+   */
+  _checkpointRun(ctx) {
+    const runId = ctx.id;
+    // Restart the debounce window for this run.
+    const pending = this._checkpointTimers.get(runId);
+    if (pending) clearTimeout(pending);
+    const writeCheckpoint = () => {
+      this._checkpointTimers.delete(runId);
+      try {
+        this._ensureDirs();
+        const snapshot = this._serializeRunContext(ctx, true);
+        const snapshotPath = resolve(this.runsDir, `${runId}.json`);
+        writeFileSync(snapshotPath, JSON.stringify(snapshot, null, 2), "utf8");
+      } catch (err) {
+        console.error(`${TAG} Checkpoint failed for run ${runId}:`, err.message);
+      }
+    };
+    const timer = setTimeout(writeCheckpoint, CHECKPOINT_DEBOUNCE_MS);
+    // A pending checkpoint must not keep the process alive.
+    if (timer.unref) timer.unref();
+    this._checkpointTimers.set(runId, timer);
+  }
+  /**
+   * Forget a finished run: cancel its pending checkpoint timer and drop it
+   * from the active-runs index, so it is not picked up as interrupted later.
+   * Failures are logged and swallowed.
+   *
+   * @param {string} runId
+   */
+  _clearActiveRunState(runId) {
+    try {
+      // Cancel the debounced checkpoint, if one is still pending.
+      const pending = this._checkpointTimers.get(runId);
+      if (pending) {
+        clearTimeout(pending);
+        this._checkpointTimers.delete(runId);
+      }
+      // Rewrite the active-runs index without this run.
+      this._writeActiveRunsIndex(
+        this._readActiveRunsIndex().filter((entry) => entry.runId !== runId),
+      );
+    } catch (err) {
+      console.error(`${TAG} Failed to clear active run state:`, err.message);
+    }
+  }
+  /**
+   * Upsert a RUNNING summary for a run into the main runs index (index.json).
+   *
+   * An existing entry with the same runId is replaced in place; otherwise
+   * the summary is appended. The index is then trimmed from the front to at
+   * most MAX_PERSISTED_RUNS entries and written back. Failures are logged
+   * and swallowed.
+   *
+   * @param {string} runId
+   * @param {string} workflowId
+   * @param {string} workflowName
+   * @param {object} detail - Serialized run context used to build the summary.
+   */
+  _ensureRunInIndex(runId, workflowId, workflowName, detail) {
+    try {
+      const indexFile = resolve(this.runsDir, "index.json");
+      const runs = this._readRunIndex();
+      const position = runs.findIndex((candidate) => candidate.runId === runId);
+      const summary = this._buildSummaryFromDetail({
+        runId,
+        workflowId,
+        workflowName,
+        status: WorkflowStatus.RUNNING,
+        detail,
+      });
+      if (position < 0) {
+        runs.push(summary);
+      } else {
+        runs[position] = summary;
+      }
+      // Drop the oldest entries once the cap is exceeded.
+      const overflow = runs.length - MAX_PERSISTED_RUNS;
+      if (overflow > 0) {
+        runs.splice(0, overflow);
+      }
+      writeFileSync(indexFile, JSON.stringify({ runs }, null, 2), "utf8");
+    } catch (err) {
+      console.error(`${TAG} Failed to ensure run in index:`, err.message);
+    }
+  }
+  /**
+   * Detect runs that were interrupted by a previous shutdown.
+   *
+   * Scans the _active-runs.json index for entries that are NOT in the
+   * in-memory _activeRuns map (which is empty on fresh boot). Each such run
+   * that is present in the main index is marked PAUSED + resumable there.
+   * The main index is read once and written at most once, regardless of how
+   * many runs were interrupted.
+   *
+   * Entries that ARE still tracked in _activeRuns are kept in the
+   * active-runs index, so a genuinely running run is never dropped from it.
+   *
+   * Emits "runs:interrupted" with the interrupted entries. Failures are
+   * logged and swallowed.
+   */
+  _detectInterruptedRuns() {
+    try {
+      const activeEntries = this._readActiveRunsIndex();
+      if (!activeEntries.length) return;
+      const stillActive = [];
+      const interrupted = [];
+      for (const entry of activeEntries) {
+        // Still tracked in memory means the run is live, not interrupted.
+        if (this._activeRuns.has(entry.runId)) {
+          stillActive.push(entry);
+        } else {
+          interrupted.push(entry);
+        }
+      }
+      if (interrupted.length > 0) {
+        const indexPath = resolve(this.runsDir, "index.json");
+        const runs = this._readRunIndex();
+        const interruptedAt = Date.now();
+        let indexChanged = false;
+        for (const entry of interrupted) {
+          const run = runs.find((r) => r.runId === entry.runId);
+          if (!run) continue; // not in the main index — nothing to mark
+          run.status = WorkflowStatus.PAUSED;
+          run.resumable = true;
+          run.interruptedAt = interruptedAt;
+          indexChanged = true;
+        }
+        if (indexChanged) {
+          writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
+        }
+      }
+      // Interrupted entries are handled; only live runs stay in the index.
+      this._writeActiveRunsIndex(stillActive);
+      if (interrupted.length > 0) {
+        console.log(
+          `${TAG} Detected ${interrupted.length} interrupted run(s): ${interrupted.map((e) => e.runId).join(", ")}`,
+        );
+        this.emit("runs:interrupted", { runs: interrupted });
+      }
+    } catch (err) {
+      console.error(`${TAG} Failed to detect interrupted runs:`, err.message);
+    }
+  }
+  /**
+   * Resume all interrupted (PAUSED + resumable) runs.
+   * Should be called AFTER services are wired up (e.g. after workflow
+   * engine is fully initialized with node executors).
+   *
+   * Runs are processed one at a time. A run with at least one COMPLETED
+   * node is resumed via retryRun("from_failed"); otherwise it is re-executed
+   * from scratch with its persisted input data. Afterwards the original run
+   * is marked not resumable, with `resumeResult` set to "resumed" on success
+   * or to the failure reason when the resume attempt threw.
+   *
+   * Re-entrant calls while a resume pass is in progress return immediately.
+   *
+   * @returns {Promise<void>}
+   */
+  async resumeInterruptedRuns() {
+    if (this._resumingRuns) return;
+    this._resumingRuns = true;
+    try {
+      const runs = this._readRunIndex().filter(
+        (r) => r.status === WorkflowStatus.PAUSED && r.resumable,
+      );
+      // The finally block below resets the re-entrancy flag.
+      if (!runs.length) return;
+      console.log(`${TAG} Resuming ${runs.length} interrupted run(s)...`);
+      for (const run of runs) {
+        try {
+          // Check if the workflow definition still exists
+          const def = this.get(run.workflowId);
+          if (!def) {
+            console.warn(`${TAG} Cannot resume run ${run.runId}: workflow "${run.workflowId}" no longer exists`);
+            this._markRunUnresumable(run.runId, "workflow_deleted");
+            continue;
+          }
+          // Load the persisted detail file to get the context state
+          const detailPath = resolve(this.runsDir, `${run.runId}.json`);
+          if (!existsSync(detailPath)) {
+            console.warn(`${TAG} Cannot resume run ${run.runId}: no detail file found`);
+            this._markRunUnresumable(run.runId, "no_detail_file");
+            continue;
+          }
+          const detail = JSON.parse(readFileSync(detailPath, "utf8"));
+          const nodeStatuses = detail.nodeStatuses || {};
+          const hasCompletedNodes = Object.values(nodeStatuses).some(
+            (s) => s === NodeStatus.COMPLETED,
+          );
+          // Stays null on success; otherwise holds the reason to record, so
+          // a failed resume is not overwritten with "resumed" afterwards.
+          let failureReason = null;
+          if (hasCompletedNodes) {
+            // Resume from where it left off using retryRun("from_failed")
+            console.log(`${TAG} Resuming run ${run.runId} from failed/interrupted node...`);
+            try {
+              await this.retryRun(run.runId, { mode: "from_failed" });
+            } catch (err) {
+              console.error(`${TAG} Failed to resume run ${run.runId}:`, err.message);
+              failureReason = `retry_error: ${err.message}`;
+            }
+          } else {
+            // No nodes completed — re-run from scratch
+            console.log(`${TAG} Re-executing run ${run.runId} from scratch...`);
+            const originalData = detail.inputData || detail.data || {};
+            // Clean up internal metadata from data before re-executing
+            const { _workflowId, _workflowName, _retryOf, ...cleanData } = originalData;
+            try {
+              await this.execute(run.workflowId, cleanData, { force: true });
+            } catch (err) {
+              console.error(`${TAG} Failed to re-execute run ${run.runId}:`, err.message);
+              failureReason = `execute_error: ${err.message}`;
+            }
+          }
+          // Mark the original interrupted run as no longer resumable
+          // (the retry/re-execute created a new run), keeping the failure
+          // reason when the attempt threw.
+          this._markRunUnresumable(run.runId, failureReason ?? "resumed");
+        } catch (err) {
+          console.error(`${TAG} Error resuming run ${run.runId}:`, err.message);
+          this._markRunUnresumable(run.runId, `error: ${err.message}`);
+        }
+      }
+    } finally {
+      this._resumingRuns = false;
+    }
+  }
+  /**
+   * Flag a run in the main index (index.json) as no longer resumable and
+   * record why. Nothing is written when the run is not in the index.
+   * Failures are logged and swallowed.
+   *
+   * @param {string} runId
+   * @param {string} reason - Stored on the entry as `resumeResult`.
+   */
+  _markRunUnresumable(runId, reason) {
+    try {
+      const indexFile = resolve(this.runsDir, "index.json");
+      const runs = this._readRunIndex();
+      const position = runs.findIndex((candidate) => candidate.runId === runId);
+      if (position < 0) {
+        return;
+      }
+      const entry = runs[position];
+      entry.resumable = false;
+      entry.resumeResult = reason;
+      writeFileSync(indexFile, JSON.stringify({ runs }, null, 2), "utf8");
+    } catch (err) {
+      console.error(`${TAG} Failed to mark run unresumable:`, err.message);
+    }
+  }
+  // ── Persist completed run ─────────────────────────────────────────────
   _persistRun(runId, workflowId, ctx) {
     try {
       this._ensureDirs();
@@ -1101,14 +1755,13 @@ export class WorkflowEngine extends EventEmitter {
         detail,
       });
-      // Append to index
+      // Deduplicate: remove any existing entry for this runId before appending
       const indexPath = resolve(this.runsDir, "index.json");
-      let index = { runs: this._readRunIndex() };
-      index.runs.push(summary);
+      let runs = this._readRunIndex().filter((r) => r.runId !== runId);
+      runs.push(summary);
       // Keep last N runs
-      if (index.runs.length > MAX_PERSISTED_RUNS) index.runs = index.runs.slice(-MAX_PERSISTED_RUNS);
-      writeFileSync(indexPath, JSON.stringify(index, null, 2), "utf8");
+      if (runs.length > MAX_PERSISTED_RUNS) runs = runs.slice(-MAX_PERSISTED_RUNS);
+      writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
       // Save full run detail
       const detailPath = resolve(this.runsDir, `${runId}.json`);
@@ -1147,3 +1800,4 @@ export function deleteWorkflow(id, opts) { return getWorkflowEngine(opts).delete
 export function listWorkflows(opts) { return getWorkflowEngine(opts).list(); } // delegates to list() on the engine obtained from getWorkflowEngine(opts)
 export function getWorkflow(id, opts) { return getWorkflowEngine(opts).get(id); } // delegates to get(id) on that engine
 export async function executeWorkflow(id, data, opts) { return getWorkflowEngine(opts).execute(id, data, opts); } // note: opts is passed to both getWorkflowEngine and execute
+/**
+ * Module-level convenience wrapper: retry a run on the engine returned by
+ * getWorkflowEngine(engineOpts).
+ *
+ * @param {string} runId - ID of the run to retry.
+ * @param {object} [retryOpts] - Passed through to the engine's retryRun().
+ * @param {object} [engineOpts] - Passed to getWorkflowEngine().
+ * @returns {Promise<object>} Whatever the engine's retryRun() resolves to.
+ */
+export async function retryWorkflowRun(runId, retryOpts, engineOpts) {
+  const engine = getWorkflowEngine(engineOpts);
+  return engine.retryRun(runId, retryOpts);
+}