bosun 0.35.2 → 0.35.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -67,6 +67,24 @@ const DEFAULT_RUN_STUCK_THRESHOLD_MS = readBoundedEnvInt(
67
67
  { min: 10000, max: 7_200_000 },
68
68
  );
69
69
 
// ── Auto-Retry Defaults ─────────────────────────────────────────────────────
// All values below are resolved once at module load through readBoundedEnvInt
// (defined outside this view — presumably it clamps the env value to the
// given [min, max] range and falls back to the supplied default).

// Default number of automatic retries for a failed run (used when a workflow
// does not override `autoRetry.maxAttempts`).
const DEFAULT_AUTO_RETRY_MAX_ATTEMPTS = readBoundedEnvInt("WORKFLOW_AUTO_RETRY_MAX_ATTEMPTS", 3, {
  min: 0,
  max: 10,
});

// Default pause before the late (cooldown) retry attempts: 20 minutes.
const DEFAULT_AUTO_RETRY_COOLDOWN_MS = readBoundedEnvInt(
  "WORKFLOW_AUTO_RETRY_COOLDOWN_MS",
  1_200_000,
  { min: 0, max: 3_600_000 },
);

// Debounce window applied to per-node checkpoint writes.
const CHECKPOINT_DEBOUNCE_MS = readBoundedEnvInt("WORKFLOW_CHECKPOINT_DEBOUNCE_MS", 500, {
  min: 50,
  max: 10_000,
});

// File name (inside the runs directory) of the index of in-flight runs.
const ACTIVE_RUNS_INDEX = "_active-runs.json";
87
+
70
88
  function resolveNodeTimeoutMs(node, resolvedConfig) {
71
89
  const candidates = [
72
90
  resolvedConfig?.timeout,
@@ -333,6 +351,8 @@ export class WorkflowEngine extends EventEmitter {
333
351
  this._activeRuns = new Map();
334
352
  this._triggerSubscriptions = new Map();
335
353
  this._loaded = false;
354
+ this._checkpointTimers = new Map(); // runId → debounce timer
355
+ this._resumingRuns = false;
336
356
  }
337
357
 
338
358
  // ── Lifecycle ───────────────────────────────────────────────────────────
@@ -359,6 +379,11 @@ export class WorkflowEngine extends EventEmitter {
359
379
  }
360
380
  this._loaded = true;
361
381
  this.emit("loaded", { count: this._workflows.size });
382
+
383
+ // Detect runs that were interrupted by a previous shutdown.
384
+ // These are runs persisted to disk with status=RUNNING that are
385
+ // NOT in our in-memory _activeRuns (because we just booted).
386
+ this._detectInterruptedRuns();
362
387
  }
363
388
 
364
389
  /** Ensure storage directories exist */
@@ -466,6 +491,10 @@ export class WorkflowEngine extends EventEmitter {
466
491
  startedAt: ctx.startedAt,
467
492
  status: WorkflowStatus.RUNNING,
468
493
  });
494
+
495
+ // ── Persist run immediately so it survives process restarts ──────
496
+ this._persistActiveRunState(runId, workflowId, def.name, ctx);
497
+
469
498
  this.emit("run:start", { runId, workflowId, name: def.name });
470
499
 
471
500
  try {
@@ -490,12 +519,261 @@ export class WorkflowEngine extends EventEmitter {
490
519
  this.emit("run:error", { runId, workflowId, error: err.message });
491
520
  }
492
521
 
493
- // Persist run log
522
+ // Persist final run log and remove from active-runs index
494
523
  this._persistRun(runId, workflowId, ctx);
524
+ this._clearActiveRunState(runId);
495
525
  this._activeRuns.delete(runId);
526
+
527
+ // ── Auto-retry on failure ───────────────────────────────────────────
528
+ // If the workflow failed and auto-retry is enabled, kick off the
529
+ // escalating retry strategy asynchronously. The caller still receives the
530
+ // original (failed) context immediately so we never block the event loop.
531
+ const finalStatus = ctx.errors.length > 0 ? WorkflowStatus.FAILED : WorkflowStatus.COMPLETED;
532
+ if (finalStatus === WorkflowStatus.FAILED && !opts._isRetry) {
533
+ const retryConfig = this._resolveAutoRetryConfig(def);
534
+ if (retryConfig.enabled) {
535
+ // Fire-and-forget — errors are logged, never thrown.
536
+ this._autoRetryLoop(runId, workflowId, inputData, retryConfig, opts).catch((err) => {
537
+ console.error(`${TAG} Auto-retry loop error for run ${runId}:`, err.message);
538
+ });
539
+ }
540
+ }
541
+
496
542
  return ctx;
497
543
  }
498
544
 
545
+ // ── Run Retry ───────────────────────────────────────────────────────────
546
+
547
+ /**
548
+ * Retry a previously completed (failed) run.
549
+ *
550
+ * @param {string} runId - The original run ID to retry.
551
+ * @param {object} [retryOpts]
552
+ * @param {"from_failed"|"from_scratch"} [retryOpts.mode="from_failed"]
553
+ * - `"from_failed"` — re-execute starting from the first failed node,
554
+ * pre-populating the context with already-completed node outputs.
555
+ * - `"from_scratch"` — re-execute the entire workflow from the beginning
556
+ * with the same input data that was used originally.
557
+ * @returns {Promise<{retryRunId: string, mode: string, ctx: WorkflowContext}>}
558
+ */
559
+ async retryRun(runId, retryOpts = {}) {
560
+ const mode = retryOpts.mode === "from_scratch" ? "from_scratch" : "from_failed";
561
+ const originalRun = this.getRunDetail(runId);
562
+ if (!originalRun) {
563
+ throw new Error(`${TAG} Run "${runId}" not found — cannot retry`);
564
+ }
565
+
566
+ const workflowId = originalRun.workflowId || originalRun.detail?.data?._workflowId;
567
+ if (!workflowId) {
568
+ throw new Error(`${TAG} Cannot determine workflowId from run "${runId}"`);
569
+ }
570
+
571
+ const def = this.get(workflowId);
572
+ if (!def) {
573
+ throw new Error(`${TAG} Workflow "${workflowId}" no longer exists — cannot retry`);
574
+ }
575
+
576
+ // Recover original input data (strip internal enrichment keys).
577
+ const originalData = { ...(originalRun.detail?.data || {}) };
578
+ delete originalData._workflowId;
579
+ delete originalData._workflowName;
580
+
581
+ this.emit("run:retry", {
582
+ originalRunId: runId,
583
+ workflowId,
584
+ mode,
585
+ attempt: retryOpts._attempt || 1,
586
+ });
587
+
588
+ if (mode === "from_scratch") {
589
+ const ctx = await this.execute(workflowId, originalData, {
590
+ ...retryOpts,
591
+ _isRetry: true,
592
+ _originalRunId: runId,
593
+ force: true,
594
+ });
595
+ return { retryRunId: ctx.id, mode, originalRunId: runId, ctx };
596
+ }
597
+
598
+ // ── "from_failed" — resume from the first failed node ────────────
599
+ const detail = originalRun.detail || {};
600
+ const nodeStatuses = detail.nodeStatuses || {};
601
+ const nodeOutputs = detail.nodeOutputs || {};
602
+
603
+ // Build a fresh context but pre-seed completed node outputs.
604
+ const ctx = new WorkflowContext({
605
+ ...def.variables,
606
+ ...originalData,
607
+ _workflowId: workflowId,
608
+ _workflowName: def.name,
609
+ _retryOf: runId,
610
+ });
611
+ ctx.variables = { ...def.variables };
612
+
613
+ // Pre-populate nodes that already succeeded.
614
+ for (const [nodeId, status] of Object.entries(nodeStatuses)) {
615
+ if (status === NodeStatus.COMPLETED) {
616
+ ctx.setNodeStatus(nodeId, NodeStatus.COMPLETED);
617
+ if (nodeOutputs[nodeId] !== undefined) {
618
+ ctx.setNodeOutput(nodeId, nodeOutputs[nodeId]);
619
+ }
620
+ }
621
+ // Reset failed / skipped nodes so the DAG will re-run them.
622
+ }
623
+
624
+ const retryRunId = ctx.id;
625
+ this._activeRuns.set(retryRunId, {
626
+ workflowId,
627
+ workflowName: def.name,
628
+ ctx,
629
+ startedAt: ctx.startedAt,
630
+ status: WorkflowStatus.RUNNING,
631
+ });
632
+ this._persistActiveRunState(retryRunId, workflowId, def.name, ctx);
633
+ this.emit("run:start", { runId: retryRunId, workflowId, name: def.name, retryOf: runId, mode });
634
+
635
+ try {
636
+ const adjacency = this._buildAdjacency(def);
637
+ const entryNodes = this._findEntryNodes(def);
638
+ if (entryNodes.length === 0) {
639
+ throw new Error("Workflow has no entry nodes (no triggers or unconnected nodes)");
640
+ }
641
+
642
+ // _executeDag naturally skips nodes that are already COMPLETED because
643
+ // they were pre-seeded above, so it resumes from the failed point.
644
+ await this._executeDag(def, entryNodes, adjacency, ctx, { ...retryOpts, _isRetry: true });
645
+
646
+ const status = ctx.errors.length > 0 ? WorkflowStatus.FAILED : WorkflowStatus.COMPLETED;
647
+ this._activeRuns.get(retryRunId).status = status;
648
+ this.emit("run:end", {
649
+ runId: retryRunId,
650
+ workflowId,
651
+ status,
652
+ duration: Date.now() - ctx.startedAt,
653
+ retryOf: runId,
654
+ mode,
655
+ });
656
+ } catch (err) {
657
+ ctx.error("_engine", err);
658
+ this._activeRuns.get(retryRunId).status = WorkflowStatus.FAILED;
659
+ this.emit("run:error", { runId: retryRunId, workflowId, error: err.message, retryOf: runId });
660
+ }
661
+
662
+ this._persistRun(retryRunId, workflowId, ctx);
663
+ this._clearActiveRunState(retryRunId);
664
+ this._activeRuns.delete(retryRunId);
665
+
666
+ return { retryRunId, mode, originalRunId: runId, ctx };
667
+ }
668
+
669
+ // ── Auto-retry escalating strategy ───────────────────────────────────
670
+
671
+ /**
672
+ * Resolve the auto-retry configuration for a workflow definition.
673
+ * Supports per-workflow overrides via `def.autoRetry`.
674
+ */
675
+ _resolveAutoRetryConfig(def) {
676
+ const raw = def?.autoRetry || {};
677
+ // Auto-retry is opt-in: workflows must explicitly set autoRetry.enabled = true.
678
+ // This prevents unexpected background retries for workflows that don't want them.
679
+ const enabled = Boolean(raw.enabled);
680
+ const maxAttempts = Number.isFinite(Number(raw.maxAttempts))
681
+ ? Math.max(0, Math.trunc(Number(raw.maxAttempts)))
682
+ : DEFAULT_AUTO_RETRY_MAX_ATTEMPTS;
683
+ const cooldownMs = Number.isFinite(Number(raw.cooldownMs))
684
+ ? Math.max(0, Math.trunc(Number(raw.cooldownMs)))
685
+ : DEFAULT_AUTO_RETRY_COOLDOWN_MS;
686
+ return { enabled: enabled && maxAttempts > 0, maxAttempts, cooldownMs };
687
+ }
688
+
689
+ /**
690
+ * Escalating auto-retry loop.
691
+ *
692
+ * Strategy (configurable, defaults to 3 attempts):
693
+ * Attempt 1 → from_failed (immediate)
694
+ * Attempt 2 → from_scratch (immediate)
695
+ * Attempt 3 → from_scratch (after cooldown period, default 20 min)
696
+ *
697
+ * If the workflow succeeds at any point the loop stops.
698
+ * Results are persisted as separate runs linked via `_retryOf`.
699
+ */
700
+ async _autoRetryLoop(originalRunId, workflowId, inputData, retryConfig, baseOpts) {
701
+ const { maxAttempts, cooldownMs } = retryConfig;
702
+
703
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
704
+ const mode = attempt === 1 ? "from_failed" : "from_scratch";
705
+ const needsCooldown = attempt >= 3 && cooldownMs > 0;
706
+
707
+ if (needsCooldown) {
708
+ console.log(
709
+ `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} for run ${originalRunId} ` +
710
+ `— cooling down for ${Math.round(cooldownMs / 1000)}s before retry`,
711
+ );
712
+ this.emit("run:retry:cooldown", {
713
+ originalRunId,
714
+ workflowId,
715
+ attempt,
716
+ cooldownMs,
717
+ });
718
+ await new Promise((r) => setTimeout(r, cooldownMs));
719
+ }
720
+
721
+ console.log(
722
+ `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} for run ${originalRunId} (mode=${mode})`,
723
+ );
724
+
725
+ try {
726
+ const { ctx, retryRunId } = await this.retryRun(originalRunId, {
727
+ mode,
728
+ _isRetry: true,
729
+ _attempt: attempt,
730
+ });
731
+
732
+ if (!ctx.errors || ctx.errors.length === 0) {
733
+ console.log(
734
+ `${TAG} Auto-retry succeeded on attempt ${attempt}/${maxAttempts} ` +
735
+ `for run ${originalRunId} → new run ${retryRunId}`,
736
+ );
737
+ this.emit("run:retry:success", {
738
+ originalRunId,
739
+ retryRunId,
740
+ workflowId,
741
+ attempt,
742
+ });
743
+ return; // Success — stop retrying
744
+ }
745
+
746
+ console.warn(
747
+ `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} failed ` +
748
+ `for run ${originalRunId} → new run ${retryRunId}`,
749
+ );
750
+ this.emit("run:retry:failed", {
751
+ originalRunId,
752
+ retryRunId,
753
+ workflowId,
754
+ attempt,
755
+ errors: ctx.errors,
756
+ });
757
+ } catch (err) {
758
+ console.error(
759
+ `${TAG} Auto-retry attempt ${attempt}/${maxAttempts} threw for run ${originalRunId}:`,
760
+ err.message,
761
+ );
762
+ this.emit("run:retry:failed", {
763
+ originalRunId,
764
+ workflowId,
765
+ attempt,
766
+ errors: [{ error: err.message }],
767
+ });
768
+ }
769
+ }
770
+
771
+ console.error(
772
+ `${TAG} All ${maxAttempts} auto-retry attempts exhausted for run ${originalRunId}`,
773
+ );
774
+ this.emit("run:retry:exhausted", { originalRunId, workflowId, maxAttempts });
775
+ }
776
+
499
777
  /**
500
778
  * Evaluate trigger conditions to see if a workflow should fire.
501
779
  * Called by the supervisor loop or event bus.
@@ -665,6 +943,16 @@ export class WorkflowEngine extends EventEmitter {
665
943
  const queue = [...entryNodes.map((n) => n.id)];
666
944
  const nodeMap = new Map((def.nodes || []).map((n) => [n.id, n]));
667
945
 
946
+ // ── Resume support (retry from_failed) ──────────────────────────────
947
+ // If nodes are already marked COMPLETED in the context (pre-seeded by
948
+ // retryRun), treat them as already executed so the DAG skips them and
949
+ // begins from the first un-completed node.
950
+ for (const [nodeId, status] of ctx.nodeStatuses) {
951
+ if (status === NodeStatus.COMPLETED) {
952
+ executed.add(nodeId);
953
+ }
954
+ }
955
+
668
956
  // Track in-degree for proper scheduling
669
957
  const inDegree = new Map();
670
958
  for (const node of def.nodes || []) {
@@ -674,8 +962,32 @@ export class WorkflowEngine extends EventEmitter {
674
962
  inDegree.set(edge.target, (inDegree.get(edge.target) || 0) + 1);
675
963
  }
676
964
 
677
- // Ready set = nodes with all dependencies met
678
- const ready = new Set(queue);
965
+ // ── Adjust in-degree for pre-completed nodes (retry resume) ─────────
966
+ // When resuming from a failed step, pre-completed source nodes have
967
+ // already satisfied their downstream edges. Decrement the in-degree for
968
+ // each target so successors become ready once all live deps are met.
969
+ for (const nodeId of executed) {
970
+ const edges = adjacency.get(nodeId) || [];
971
+ for (const edge of edges) {
972
+ const deg = (inDegree.get(edge.target) || 1) - 1;
973
+ inDegree.set(edge.target, Math.max(0, deg));
974
+ }
975
+ }
976
+
977
+ // Ready set = entry nodes (or nodes with no remaining unsatisfied deps)
978
+ const ready = new Set();
979
+ for (const nid of queue) {
980
+ if (!executed.has(nid)) {
981
+ ready.add(nid);
982
+ }
983
+ }
984
+ // Also add any non-entry nodes whose in-degree is now 0 due to pre-
985
+ // completed predecessors (this makes "from_failed" resume work).
986
+ for (const [nid, deg] of inDegree) {
987
+ if (deg <= 0 && !executed.has(nid) && !ready.has(nid)) {
988
+ ready.add(nid);
989
+ }
990
+ }
679
991
 
680
992
  while (ready.size > 0) {
681
993
  // Execute ready nodes in bounded parallel batches.
@@ -729,10 +1041,16 @@ export class WorkflowEngine extends EventEmitter {
729
1041
  ctx.setNodeStatus(nodeId, NodeStatus.COMPLETED);
730
1042
  executed.add(nodeId);
731
1043
  this.emit("node:complete", { nodeId, type: node.type });
1044
+
1045
+ // Checkpoint progress to disk (debounced) so the run can
1046
+ // be resumed from here if the process is interrupted.
1047
+ this._checkpointRun(ctx);
1048
+
732
1049
  lastErr = null;
733
1050
  return { nodeId, result };
734
1051
  } catch (err) {
735
1052
  lastErr = err;
1053
+ if (err.retryable === false) break; // permanent error — skip remaining retry attempts
736
1054
  }
737
1055
  }
738
1056
 
@@ -876,12 +1194,32 @@ export class WorkflowEngine extends EventEmitter {
876
1194
  // Resolve config templates against context
877
1195
  const resolvedConfig = this._resolveConfig(node.config || {}, ctx);
878
1196
 
879
- // Dry run — just validate
1197
+ // Dry run — skip capability checks and handler execution.
1198
+ // Services aren't needed for simulation; this keeps dry-run tests fast.
880
1199
  if (opts.dryRun) {
881
1200
  ctx.log(node.id, `[dry-run] Would execute ${node.type}`, "info");
882
1201
  return { _dryRun: true, type: node.type, config: resolvedConfig };
883
1202
  }
884
1203
 
1204
+ // ── Capability pre-flight check ──────────────────────────────────────
1205
+ // Verify required services are present AFTER the dryRun early-return so
1206
+ // dry-run tests work without needing real service dependencies wired up.
1207
+ const requiredCapabilities = this._getNodeRequiredCapabilities(node.type);
1208
+ const missingCapabilities = [];
1209
+ for (const cap of requiredCapabilities) {
1210
+ if (!this._hasCapability(cap)) {
1211
+ missingCapabilities.push(cap);
1212
+ }
1213
+ }
1214
+ if (missingCapabilities.length > 0) {
1215
+ const detail = `Node "${node.label || node.id}" (${node.type}) requires capabilities: [${missingCapabilities.join(", ")}] which are not available. ` +
1216
+ `Check that the required services (agent pool, kanban adapter, etc.) are configured and the agent has the necessary permissions.`;
1217
+ ctx.log(node.id, detail, "error");
1218
+ const capErr = new Error(detail);
1219
+ capErr.retryable = false; // missing service is permanent — don't waste time retrying
1220
+ throw capErr;
1221
+ }
1222
+
885
1223
  // Execute with timeout — clear timer on completion to avoid resource leaks
886
1224
  const timeout = resolveNodeTimeoutMs(node, resolvedConfig);
887
1225
  let timer;
@@ -916,6 +1254,48 @@ export class WorkflowEngine extends EventEmitter {
916
1254
  return resolved;
917
1255
  }
918
1256
 
1257
+ // ── Capability helpers ──────────────────────────────────────────────────
1258
+ // Map node-type prefixes / names to the engine.services keys they need.
1259
+ // This lets _executeNode fail-fast with a clear message instead of letting
1260
+ // the handler throw a cryptic "cannot read property X of undefined".
1261
+
1262
+ /** @returns {string[]} service keys the node type needs (may be empty) */
1263
+ _getNodeRequiredCapabilities(nodeType) {
1264
+ // Agent nodes need the agentPool service
1265
+ if (nodeType.startsWith("agent.") || nodeType === "action.run_agent") {
1266
+ return ["agentPool"];
1267
+ }
1268
+ // Session continuation / restart also need agentPool
1269
+ if (nodeType === "action.continue_session" || nodeType === "action.restart_agent") {
1270
+ return ["agentPool"];
1271
+ }
1272
+ // Task-management nodes need kanban
1273
+ if (
1274
+ nodeType === "action.create_task" ||
1275
+ nodeType === "action.update_task_status" ||
1276
+ nodeType === "action.materialize_planner_tasks"
1277
+ ) {
1278
+ return ["kanban"];
1279
+ }
1280
+ // Telegram notification
1281
+ if (nodeType === "notify.telegram") {
1282
+ return ["telegram"];
1283
+ }
1284
+ // condition.task_has_tag reads from kanban
1285
+ if (nodeType === "condition.task_has_tag") {
1286
+ return ["kanban"];
1287
+ }
1288
+ // No special service required (file I/O, git, transforms, logs, etc.)
1289
+ return [];
1290
+ }
1291
+
1292
+ /** Check whether a named capability (service key) is available */
1293
+ _hasCapability(cap) {
1294
+ const svc = this.services?.[cap];
1295
+ // A capability is "present" when its value is a non-null object or function.
1296
+ return svc != null && (typeof svc === "object" || typeof svc === "function");
1297
+ }
1298
+
919
1299
  _evaluateCondition(condition, ctx, sourceNodeId) {
920
1300
  // Simple expression evaluator — supports basic comparisons
921
1301
  // Variables: $output (source node output), $data (context data), $status
@@ -1088,6 +1468,280 @@ export class WorkflowEngine extends EventEmitter {
1088
1468
  return normalized;
1089
1469
  }
1090
1470
 
1471
+ // ── Active-runs persistence (crash recovery) ─────────────────────────
1472
+
1473
+ /**
1474
+ * Read the active-runs index (_active-runs.json).
1475
+ * Returns an array of { runId, workflowId, workflowName, startedAt }.
1476
+ */
1477
+ _readActiveRunsIndex() {
1478
+ try {
1479
+ const p = resolve(this.runsDir, ACTIVE_RUNS_INDEX);
1480
+ if (!existsSync(p)) return [];
1481
+ const raw = JSON.parse(readFileSync(p, "utf8"));
1482
+ return Array.isArray(raw) ? raw : [];
1483
+ } catch {
1484
+ return [];
1485
+ }
1486
+ }
1487
+
1488
+ /** Write the active-runs index atomically. */
1489
+ _writeActiveRunsIndex(entries) {
1490
+ try {
1491
+ this._ensureDirs();
1492
+ const p = resolve(this.runsDir, ACTIVE_RUNS_INDEX);
1493
+ writeFileSync(p, JSON.stringify(entries, null, 2), "utf8");
1494
+ } catch (err) {
1495
+ console.error(`${TAG} Failed to write active-runs index:`, err.message);
1496
+ }
1497
+ }
1498
+
1499
+ /**
1500
+ * Persist a run to the active-runs index AND write an initial detail file.
1501
+ * Called at the very start of execute() / retryRun() so the run is on disk
1502
+ * before any node executes.
1503
+ */
1504
+ _persistActiveRunState(runId, workflowId, workflowName, ctx) {
1505
+ try {
1506
+ this._ensureDirs();
1507
+
1508
+ // Add to active-runs index
1509
+ const entries = this._readActiveRunsIndex().filter((e) => e.runId !== runId);
1510
+ entries.push({ runId, workflowId, workflowName, startedAt: ctx.startedAt });
1511
+ this._writeActiveRunsIndex(entries);
1512
+
1513
+ // Write initial detail file so we can resume from it
1514
+ const detail = this._serializeRunContext(ctx, true);
1515
+ const detailPath = resolve(this.runsDir, `${runId}.json`);
1516
+ writeFileSync(detailPath, JSON.stringify(detail, null, 2), "utf8");
1517
+
1518
+ // Also ensure the run appears in the main index (with RUNNING status)
1519
+ // so that getRunDetail() can find it even before completion.
1520
+ this._ensureRunInIndex(runId, workflowId, workflowName, detail);
1521
+ } catch (err) {
1522
+ console.error(`${TAG} Failed to persist active run state:`, err.message);
1523
+ }
1524
+ }
1525
+
1526
+ /**
1527
+ * Debounced checkpoint — writes the current run context to disk after each
1528
+ * node completes. Debounced at CHECKPOINT_DEBOUNCE_MS to avoid disk
1529
+ * thrashing when many nodes finish in quick succession.
1530
+ */
1531
+ _checkpointRun(ctx) {
1532
+ const runId = ctx.id;
1533
+ // Clear any pending timer for this run
1534
+ const existing = this._checkpointTimers.get(runId);
1535
+ if (existing) clearTimeout(existing);
1536
+
1537
+ const timer = setTimeout(() => {
1538
+ this._checkpointTimers.delete(runId);
1539
+ try {
1540
+ this._ensureDirs();
1541
+ const detail = this._serializeRunContext(ctx, true);
1542
+ const detailPath = resolve(this.runsDir, `${runId}.json`);
1543
+ writeFileSync(detailPath, JSON.stringify(detail, null, 2), "utf8");
1544
+ } catch (err) {
1545
+ console.error(`${TAG} Checkpoint failed for run ${runId}:`, err.message);
1546
+ }
1547
+ }, CHECKPOINT_DEBOUNCE_MS);
1548
+
1549
+ // Don't let the timer prevent clean process exit
1550
+ if (timer.unref) timer.unref();
1551
+ this._checkpointTimers.set(runId, timer);
1552
+ }
1553
+
1554
+ /**
1555
+ * Remove a run from the active-runs index and clear its checkpoint timer.
1556
+ * Called after a run completes (success or failure) so it won't be
1557
+ * mistakenly resumed on next boot.
1558
+ */
1559
+ _clearActiveRunState(runId) {
1560
+ try {
1561
+ // Clear debounce timer
1562
+ const timer = this._checkpointTimers.get(runId);
1563
+ if (timer) {
1564
+ clearTimeout(timer);
1565
+ this._checkpointTimers.delete(runId);
1566
+ }
1567
+ // Remove from active-runs index
1568
+ const entries = this._readActiveRunsIndex().filter((e) => e.runId !== runId);
1569
+ this._writeActiveRunsIndex(entries);
1570
+ } catch (err) {
1571
+ console.error(`${TAG} Failed to clear active run state:`, err.message);
1572
+ }
1573
+ }
1574
+
1575
+ /**
1576
+ * Ensure a run entry exists in the main runs index (index.json).
1577
+ * Deduplicates by runId — if the run already exists, updates it in place.
1578
+ */
1579
+ _ensureRunInIndex(runId, workflowId, workflowName, detail) {
1580
+ try {
1581
+ const indexPath = resolve(this.runsDir, "index.json");
1582
+ const runs = this._readRunIndex();
1583
+ const existingIdx = runs.findIndex((r) => r.runId === runId);
1584
+
1585
+ const summary = this._buildSummaryFromDetail({
1586
+ runId,
1587
+ workflowId,
1588
+ workflowName,
1589
+ status: WorkflowStatus.RUNNING,
1590
+ detail,
1591
+ });
1592
+
1593
+ if (existingIdx >= 0) {
1594
+ runs[existingIdx] = summary;
1595
+ } else {
1596
+ runs.push(summary);
1597
+ }
1598
+ if (runs.length > MAX_PERSISTED_RUNS) runs.splice(0, runs.length - MAX_PERSISTED_RUNS);
1599
+ writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
1600
+ } catch (err) {
1601
+ console.error(`${TAG} Failed to ensure run in index:`, err.message);
1602
+ }
1603
+ }
1604
+
1605
+ /**
1606
+ * Detect runs that were interrupted by a previous shutdown.
1607
+ * Scans the _active-runs.json index for entries that are NOT in our
1608
+ * in-memory _activeRuns map (which is empty on fresh boot). Marks them
1609
+ * as PAUSED in the main index and clears the active-runs index.
1610
+ */
1611
+ _detectInterruptedRuns() {
1612
+ try {
1613
+ const activeEntries = this._readActiveRunsIndex();
1614
+ if (!activeEntries.length) return;
1615
+
1616
+ const interrupted = [];
1617
+ for (const entry of activeEntries) {
1618
+ // If it's somehow still in _activeRuns, skip it (not interrupted)
1619
+ if (this._activeRuns.has(entry.runId)) continue;
1620
+
1621
+ // Mark this run as PAUSED in the main index
1622
+ const indexPath = resolve(this.runsDir, "index.json");
1623
+ const runs = this._readRunIndex();
1624
+ const idx = runs.findIndex((r) => r.runId === entry.runId);
1625
+ if (idx >= 0) {
1626
+ runs[idx].status = WorkflowStatus.PAUSED;
1627
+ runs[idx].resumable = true;
1628
+ runs[idx].interruptedAt = Date.now();
1629
+ writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
1630
+ }
1631
+ interrupted.push(entry);
1632
+ }
1633
+
1634
+ // Clear the active-runs index — we've handled them
1635
+ this._writeActiveRunsIndex([]);
1636
+
1637
+ if (interrupted.length > 0) {
1638
+ console.log(
1639
+ `${TAG} Detected ${interrupted.length} interrupted run(s): ${interrupted.map((e) => e.runId).join(", ")}`,
1640
+ );
1641
+ this.emit("runs:interrupted", { runs: interrupted });
1642
+ }
1643
+ } catch (err) {
1644
+ console.error(`${TAG} Failed to detect interrupted runs:`, err.message);
1645
+ }
1646
+ }
1647
+
1648
+ /**
1649
+ * Resume all interrupted (PAUSED + resumable) runs.
1650
+ * Should be called AFTER services are wired up (e.g. after workflow
1651
+ * engine is fully initialized with node executors).
1652
+ */
1653
+ async resumeInterruptedRuns() {
1654
+ if (this._resumingRuns) return;
1655
+ this._resumingRuns = true;
1656
+
1657
+ try {
1658
+ const runs = this._readRunIndex().filter(
1659
+ (r) => r.status === WorkflowStatus.PAUSED && r.resumable,
1660
+ );
1661
+
1662
+ if (!runs.length) {
1663
+ this._resumingRuns = false;
1664
+ return;
1665
+ }
1666
+
1667
+ console.log(`${TAG} Resuming ${runs.length} interrupted run(s)...`);
1668
+
1669
+ for (const run of runs) {
1670
+ try {
1671
+ // Check if the workflow definition still exists
1672
+ const def = this.get(run.workflowId);
1673
+ if (!def) {
1674
+ console.warn(`${TAG} Cannot resume run ${run.runId}: workflow "${run.workflowId}" no longer exists`);
1675
+ this._markRunUnresumable(run.runId, "workflow_deleted");
1676
+ continue;
1677
+ }
1678
+
1679
+ // Load the persisted detail file to get the context state
1680
+ const detailPath = resolve(this.runsDir, `${run.runId}.json`);
1681
+ if (!existsSync(detailPath)) {
1682
+ console.warn(`${TAG} Cannot resume run ${run.runId}: no detail file found`);
1683
+ this._markRunUnresumable(run.runId, "no_detail_file");
1684
+ continue;
1685
+ }
1686
+
1687
+ const detail = JSON.parse(readFileSync(detailPath, "utf8"));
1688
+ const nodeStatuses = detail.nodeStatuses || {};
1689
+ const hasCompletedNodes = Object.values(nodeStatuses).some(
1690
+ (s) => s === NodeStatus.COMPLETED,
1691
+ );
1692
+
1693
+ if (hasCompletedNodes) {
1694
+ // Resume from where it left off using retryRun("from_failed")
1695
+ console.log(`${TAG} Resuming run ${run.runId} from failed/interrupted node...`);
1696
+ await this.retryRun(run.runId, { mode: "from_failed" }).catch((err) => {
1697
+ console.error(`${TAG} Failed to resume run ${run.runId}:`, err.message);
1698
+ this._markRunUnresumable(run.runId, `retry_error: ${err.message}`);
1699
+ });
1700
+ } else {
1701
+ // No nodes completed — re-run from scratch
1702
+ console.log(`${TAG} Re-executing run ${run.runId} from scratch...`);
1703
+ const originalData = detail.inputData || detail.data || {};
1704
+ // Clean up internal metadata from data before re-executing
1705
+ const { _workflowId, _workflowName, _retryOf, ...cleanData } = originalData;
1706
+ await this.execute(run.workflowId, cleanData, { force: true }).catch((err) => {
1707
+ console.error(`${TAG} Failed to re-execute run ${run.runId}:`, err.message);
1708
+ this._markRunUnresumable(run.runId, `execute_error: ${err.message}`);
1709
+ });
1710
+ }
1711
+
1712
+ // Mark the original interrupted run as no longer resumable
1713
+ // (the retry/re-execute created a new run)
1714
+ this._markRunUnresumable(run.runId, "resumed");
1715
+ } catch (err) {
1716
+ console.error(`${TAG} Error resuming run ${run.runId}:`, err.message);
1717
+ this._markRunUnresumable(run.runId, `error: ${err.message}`);
1718
+ }
1719
+ }
1720
+ } finally {
1721
+ this._resumingRuns = false;
1722
+ }
1723
+ }
1724
+
1725
+ /**
1726
+ * Mark a run as no longer resumable in the main index.
1727
+ */
1728
+ _markRunUnresumable(runId, reason) {
1729
+ try {
1730
+ const indexPath = resolve(this.runsDir, "index.json");
1731
+ const runs = this._readRunIndex();
1732
+ const idx = runs.findIndex((r) => r.runId === runId);
1733
+ if (idx >= 0) {
1734
+ runs[idx].resumable = false;
1735
+ runs[idx].resumeResult = reason;
1736
+ writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
1737
+ }
1738
+ } catch (err) {
1739
+ console.error(`${TAG} Failed to mark run unresumable:`, err.message);
1740
+ }
1741
+ }
1742
+
1743
+ // ── Persist completed run ─────────────────────────────────────────────
1744
+
1091
1745
  _persistRun(runId, workflowId, ctx) {
1092
1746
  try {
1093
1747
  this._ensureDirs();
@@ -1101,14 +1755,13 @@ export class WorkflowEngine extends EventEmitter {
1101
1755
  detail,
1102
1756
  });
1103
1757
 
1104
- // Append to index
1758
+ // Deduplicate: remove any existing entry for this runId before appending
1105
1759
  const indexPath = resolve(this.runsDir, "index.json");
1106
- let index = { runs: this._readRunIndex() };
1107
-
1108
- index.runs.push(summary);
1760
+ let runs = this._readRunIndex().filter((r) => r.runId !== runId);
1761
+ runs.push(summary);
1109
1762
  // Keep last N runs
1110
- if (index.runs.length > MAX_PERSISTED_RUNS) index.runs = index.runs.slice(-MAX_PERSISTED_RUNS);
1111
- writeFileSync(indexPath, JSON.stringify(index, null, 2), "utf8");
1763
+ if (runs.length > MAX_PERSISTED_RUNS) runs = runs.slice(-MAX_PERSISTED_RUNS);
1764
+ writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
1112
1765
 
1113
1766
  // Save full run detail
1114
1767
  const detailPath = resolve(this.runsDir, `${runId}.json`);
@@ -1147,3 +1800,4 @@ export function deleteWorkflow(id, opts) { return getWorkflowEngine(opts).delete
/**
 * Convenience wrapper: list the workflows of the engine obtained from
 * getWorkflowEngine(opts).
 */
export function listWorkflows(opts) {
  const engine = getWorkflowEngine(opts);
  return engine.list();
}
/**
 * Convenience wrapper: look up workflow `id` on the engine obtained from
 * getWorkflowEngine(opts).
 */
export function getWorkflow(id, opts) {
  const engine = getWorkflowEngine(opts);
  return engine.get(id);
}
/**
 * Convenience wrapper: execute workflow `id` with `data`. `opts` is used both
 * to obtain the engine and as the execution options.
 */
export async function executeWorkflow(id, data, opts) {
  const engine = getWorkflowEngine(opts);
  return engine.execute(id, data, opts);
}
/**
 * Convenience wrapper: retry run `runId` on the engine obtained from
 * getWorkflowEngine(engineOpts), forwarding `retryOpts` to retryRun().
 */
export async function retryWorkflowRun(runId, retryOpts, engineOpts) {
  const engine = getWorkflowEngine(engineOpts);
  return engine.retryRun(runId, retryOpts);
}