bosun 0.35.2 → 0.35.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/agent-hooks.mjs +7 -1
- package/agent-pool.mjs +16 -0
- package/agent-prompts.mjs +190 -4
- package/agent-sdk.mjs +6 -1
- package/agent-work-analyzer.mjs +48 -9
- package/autofix.mjs +32 -18
- package/bosun.schema.json +1 -1
- package/kanban-adapter.mjs +62 -12
- package/monitor.mjs +25 -6
- package/opencode-shell.mjs +881 -0
- package/package.json +5 -2
- package/primary-agent.mjs +43 -0
- package/setup.mjs +33 -4
- package/task-executor.mjs +43 -14
- package/ui/app.js +10 -7
- package/ui/components/chat-view.js +31 -9
- package/ui/components/session-list.js +20 -4
- package/ui/modules/router.js +2 -0
- package/ui/tabs/agents.js +66 -8
- package/ui-server.mjs +142 -5
- package/workflow-engine.mjs +664 -10
- package/workflow-nodes.mjs +250 -1
- package/workflow-templates/github.mjs +389 -71
- package/workflow-templates/planning.mjs +31 -11
- package/workflow-templates.mjs +3 -0
package/workflow-engine.mjs
CHANGED
|
@@ -67,6 +67,24 @@ const DEFAULT_RUN_STUCK_THRESHOLD_MS = readBoundedEnvInt(
|
|
|
67
67
|
{ min: 10000, max: 7_200_000 },
|
|
68
68
|
);
|
|
69
69
|
|
|
70
|
+
// ── Auto-Retry Defaults ─────────────────────────────────────────────────────
// Each value is resolved once at module load via readBoundedEnvInt, called as
// (env var name, fallback, { min, max }). Presumably the helper clamps the
// parsed env value into the given bounds — confirm against its definition.

// Fallback attempt budget for workflows that opt in to auto-retry without
// specifying their own `maxAttempts`.
const DEFAULT_AUTO_RETRY_MAX_ATTEMPTS = readBoundedEnvInt("WORKFLOW_AUTO_RETRY_MAX_ATTEMPTS", 3, {
  min: 0,
  max: 10,
});

// Fallback cool-down (ms) used before late retry attempts.
const DEFAULT_AUTO_RETRY_COOLDOWN_MS = readBoundedEnvInt(
  "WORKFLOW_AUTO_RETRY_COOLDOWN_MS",
  1_200_000, // 20 minutes
  { min: 0, max: 3_600_000 },
);

// Debounce window (ms) for writing run checkpoints to disk.
const CHECKPOINT_DEBOUNCE_MS = readBoundedEnvInt("WORKFLOW_CHECKPOINT_DEBOUNCE_MS", 500, {
  min: 50,
  max: 10_000,
});

// File name (inside the runs directory) of the index of in-flight runs.
const ACTIVE_RUNS_INDEX = "_active-runs.json";
|
|
87
|
+
|
|
70
88
|
function resolveNodeTimeoutMs(node, resolvedConfig) {
|
|
71
89
|
const candidates = [
|
|
72
90
|
resolvedConfig?.timeout,
|
|
@@ -333,6 +351,8 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
333
351
|
this._activeRuns = new Map();
|
|
334
352
|
this._triggerSubscriptions = new Map();
|
|
335
353
|
this._loaded = false;
|
|
354
|
+
this._checkpointTimers = new Map(); // runId → debounce timer
|
|
355
|
+
this._resumingRuns = false;
|
|
336
356
|
}
|
|
337
357
|
|
|
338
358
|
// ── Lifecycle ───────────────────────────────────────────────────────────
|
|
@@ -359,6 +379,11 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
359
379
|
}
|
|
360
380
|
this._loaded = true;
|
|
361
381
|
this.emit("loaded", { count: this._workflows.size });
|
|
382
|
+
|
|
383
|
+
// Detect runs that were interrupted by a previous shutdown.
|
|
384
|
+
// These are runs persisted to disk with status=RUNNING that are
|
|
385
|
+
// NOT in our in-memory _activeRuns (because we just booted).
|
|
386
|
+
this._detectInterruptedRuns();
|
|
362
387
|
}
|
|
363
388
|
|
|
364
389
|
/** Ensure storage directories exist */
|
|
@@ -466,6 +491,10 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
466
491
|
startedAt: ctx.startedAt,
|
|
467
492
|
status: WorkflowStatus.RUNNING,
|
|
468
493
|
});
|
|
494
|
+
|
|
495
|
+
// ── Persist run immediately so it survives process restarts ──────
|
|
496
|
+
this._persistActiveRunState(runId, workflowId, def.name, ctx);
|
|
497
|
+
|
|
469
498
|
this.emit("run:start", { runId, workflowId, name: def.name });
|
|
470
499
|
|
|
471
500
|
try {
|
|
@@ -490,12 +519,261 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
490
519
|
this.emit("run:error", { runId, workflowId, error: err.message });
|
|
491
520
|
}
|
|
492
521
|
|
|
493
|
-
// Persist run log
|
|
522
|
+
// Persist final run log and remove from active-runs index
|
|
494
523
|
this._persistRun(runId, workflowId, ctx);
|
|
524
|
+
this._clearActiveRunState(runId);
|
|
495
525
|
this._activeRuns.delete(runId);
|
|
526
|
+
|
|
527
|
+
// ── Auto-retry on failure ───────────────────────────────────────────
|
|
528
|
+
// If the workflow failed and auto-retry is enabled, kick off the
|
|
529
|
+
// escalating retry strategy asynchronously. The caller still receives the
|
|
530
|
+
// original (failed) context immediately so we never block the event loop.
|
|
531
|
+
const finalStatus = ctx.errors.length > 0 ? WorkflowStatus.FAILED : WorkflowStatus.COMPLETED;
|
|
532
|
+
if (finalStatus === WorkflowStatus.FAILED && !opts._isRetry) {
|
|
533
|
+
const retryConfig = this._resolveAutoRetryConfig(def);
|
|
534
|
+
if (retryConfig.enabled) {
|
|
535
|
+
// Fire-and-forget — errors are logged, never thrown.
|
|
536
|
+
this._autoRetryLoop(runId, workflowId, inputData, retryConfig, opts).catch((err) => {
|
|
537
|
+
console.error(`${TAG} Auto-retry loop error for run ${runId}:`, err.message);
|
|
538
|
+
});
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
|
|
496
542
|
return ctx;
|
|
497
543
|
}
|
|
498
544
|
|
|
545
|
+
// ── Run Retry ───────────────────────────────────────────────────────────
|
|
546
|
+
|
|
547
|
+
/**
|
|
548
|
+
* Retry a previously completed (failed) run.
|
|
549
|
+
*
|
|
550
|
+
* @param {string} runId - The original run ID to retry.
|
|
551
|
+
* @param {object} [retryOpts]
|
|
552
|
+
* @param {"from_failed"|"from_scratch"} [retryOpts.mode="from_failed"]
|
|
553
|
+
* - `"from_failed"` — re-execute starting from the first failed node,
|
|
554
|
+
* pre-populating the context with already-completed node outputs.
|
|
555
|
+
* - `"from_scratch"` — re-execute the entire workflow from the beginning
|
|
556
|
+
* with the same input data that was used originally.
|
|
557
|
+
* @returns {Promise<{retryRunId: string, mode: string, ctx: WorkflowContext}>}
|
|
558
|
+
*/
|
|
559
|
+
async retryRun(runId, retryOpts = {}) {
|
|
560
|
+
const mode = retryOpts.mode === "from_scratch" ? "from_scratch" : "from_failed";
|
|
561
|
+
const originalRun = this.getRunDetail(runId);
|
|
562
|
+
if (!originalRun) {
|
|
563
|
+
throw new Error(`${TAG} Run "${runId}" not found — cannot retry`);
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
const workflowId = originalRun.workflowId || originalRun.detail?.data?._workflowId;
|
|
567
|
+
if (!workflowId) {
|
|
568
|
+
throw new Error(`${TAG} Cannot determine workflowId from run "${runId}"`);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
const def = this.get(workflowId);
|
|
572
|
+
if (!def) {
|
|
573
|
+
throw new Error(`${TAG} Workflow "${workflowId}" no longer exists — cannot retry`);
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
// Recover original input data (strip internal enrichment keys).
|
|
577
|
+
const originalData = { ...(originalRun.detail?.data || {}) };
|
|
578
|
+
delete originalData._workflowId;
|
|
579
|
+
delete originalData._workflowName;
|
|
580
|
+
|
|
581
|
+
this.emit("run:retry", {
|
|
582
|
+
originalRunId: runId,
|
|
583
|
+
workflowId,
|
|
584
|
+
mode,
|
|
585
|
+
attempt: retryOpts._attempt || 1,
|
|
586
|
+
});
|
|
587
|
+
|
|
588
|
+
if (mode === "from_scratch") {
|
|
589
|
+
const ctx = await this.execute(workflowId, originalData, {
|
|
590
|
+
...retryOpts,
|
|
591
|
+
_isRetry: true,
|
|
592
|
+
_originalRunId: runId,
|
|
593
|
+
force: true,
|
|
594
|
+
});
|
|
595
|
+
return { retryRunId: ctx.id, mode, originalRunId: runId, ctx };
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
// ── "from_failed" — resume from the first failed node ────────────
|
|
599
|
+
const detail = originalRun.detail || {};
|
|
600
|
+
const nodeStatuses = detail.nodeStatuses || {};
|
|
601
|
+
const nodeOutputs = detail.nodeOutputs || {};
|
|
602
|
+
|
|
603
|
+
// Build a fresh context but pre-seed completed node outputs.
|
|
604
|
+
const ctx = new WorkflowContext({
|
|
605
|
+
...def.variables,
|
|
606
|
+
...originalData,
|
|
607
|
+
_workflowId: workflowId,
|
|
608
|
+
_workflowName: def.name,
|
|
609
|
+
_retryOf: runId,
|
|
610
|
+
});
|
|
611
|
+
ctx.variables = { ...def.variables };
|
|
612
|
+
|
|
613
|
+
// Pre-populate nodes that already succeeded.
|
|
614
|
+
for (const [nodeId, status] of Object.entries(nodeStatuses)) {
|
|
615
|
+
if (status === NodeStatus.COMPLETED) {
|
|
616
|
+
ctx.setNodeStatus(nodeId, NodeStatus.COMPLETED);
|
|
617
|
+
if (nodeOutputs[nodeId] !== undefined) {
|
|
618
|
+
ctx.setNodeOutput(nodeId, nodeOutputs[nodeId]);
|
|
619
|
+
}
|
|
620
|
+
}
|
|
621
|
+
// Reset failed / skipped nodes so the DAG will re-run them.
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
const retryRunId = ctx.id;
|
|
625
|
+
this._activeRuns.set(retryRunId, {
|
|
626
|
+
workflowId,
|
|
627
|
+
workflowName: def.name,
|
|
628
|
+
ctx,
|
|
629
|
+
startedAt: ctx.startedAt,
|
|
630
|
+
status: WorkflowStatus.RUNNING,
|
|
631
|
+
});
|
|
632
|
+
this._persistActiveRunState(retryRunId, workflowId, def.name, ctx);
|
|
633
|
+
this.emit("run:start", { runId: retryRunId, workflowId, name: def.name, retryOf: runId, mode });
|
|
634
|
+
|
|
635
|
+
try {
|
|
636
|
+
const adjacency = this._buildAdjacency(def);
|
|
637
|
+
const entryNodes = this._findEntryNodes(def);
|
|
638
|
+
if (entryNodes.length === 0) {
|
|
639
|
+
throw new Error("Workflow has no entry nodes (no triggers or unconnected nodes)");
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
// _executeDag naturally skips nodes that are already COMPLETED because
|
|
643
|
+
// they were pre-seeded above, so it resumes from the failed point.
|
|
644
|
+
await this._executeDag(def, entryNodes, adjacency, ctx, { ...retryOpts, _isRetry: true });
|
|
645
|
+
|
|
646
|
+
const status = ctx.errors.length > 0 ? WorkflowStatus.FAILED : WorkflowStatus.COMPLETED;
|
|
647
|
+
this._activeRuns.get(retryRunId).status = status;
|
|
648
|
+
this.emit("run:end", {
|
|
649
|
+
runId: retryRunId,
|
|
650
|
+
workflowId,
|
|
651
|
+
status,
|
|
652
|
+
duration: Date.now() - ctx.startedAt,
|
|
653
|
+
retryOf: runId,
|
|
654
|
+
mode,
|
|
655
|
+
});
|
|
656
|
+
} catch (err) {
|
|
657
|
+
ctx.error("_engine", err);
|
|
658
|
+
this._activeRuns.get(retryRunId).status = WorkflowStatus.FAILED;
|
|
659
|
+
this.emit("run:error", { runId: retryRunId, workflowId, error: err.message, retryOf: runId });
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
this._persistRun(retryRunId, workflowId, ctx);
|
|
663
|
+
this._clearActiveRunState(retryRunId);
|
|
664
|
+
this._activeRuns.delete(retryRunId);
|
|
665
|
+
|
|
666
|
+
return { retryRunId, mode, originalRunId: runId, ctx };
|
|
667
|
+
}
|
|
668
|
+
|
|
669
|
+
// ── Auto-retry escalating strategy ───────────────────────────────────
|
|
670
|
+
|
|
671
|
+
/**
|
|
672
|
+
* Resolve the auto-retry configuration for a workflow definition.
|
|
673
|
+
* Supports per-workflow overrides via `def.autoRetry`.
|
|
674
|
+
*/
|
|
675
|
+
_resolveAutoRetryConfig(def) {
|
|
676
|
+
const raw = def?.autoRetry || {};
|
|
677
|
+
// Auto-retry is opt-in: workflows must explicitly set autoRetry.enabled = true.
|
|
678
|
+
// This prevents unexpected background retries for workflows that don't want them.
|
|
679
|
+
const enabled = Boolean(raw.enabled);
|
|
680
|
+
const maxAttempts = Number.isFinite(Number(raw.maxAttempts))
|
|
681
|
+
? Math.max(0, Math.trunc(Number(raw.maxAttempts)))
|
|
682
|
+
: DEFAULT_AUTO_RETRY_MAX_ATTEMPTS;
|
|
683
|
+
const cooldownMs = Number.isFinite(Number(raw.cooldownMs))
|
|
684
|
+
? Math.max(0, Math.trunc(Number(raw.cooldownMs)))
|
|
685
|
+
: DEFAULT_AUTO_RETRY_COOLDOWN_MS;
|
|
686
|
+
return { enabled: enabled && maxAttempts > 0, maxAttempts, cooldownMs };
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
/**
|
|
690
|
+
* Escalating auto-retry loop.
|
|
691
|
+
*
|
|
692
|
+
* Strategy (configurable, defaults to 3 attempts):
|
|
693
|
+
* Attempt 1 → from_failed (immediate)
|
|
694
|
+
* Attempt 2 → from_scratch (immediate)
|
|
695
|
+
* Attempt 3 → from_scratch (after cooldown period, default 20 min)
|
|
696
|
+
*
|
|
697
|
+
* If the workflow succeeds at any point the loop stops.
|
|
698
|
+
* Results are persisted as separate runs linked via `_retryOf`.
|
|
699
|
+
*/
|
|
700
|
+
async _autoRetryLoop(originalRunId, workflowId, inputData, retryConfig, baseOpts) {
|
|
701
|
+
const { maxAttempts, cooldownMs } = retryConfig;
|
|
702
|
+
|
|
703
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
704
|
+
const mode = attempt === 1 ? "from_failed" : "from_scratch";
|
|
705
|
+
const needsCooldown = attempt >= 3 && cooldownMs > 0;
|
|
706
|
+
|
|
707
|
+
if (needsCooldown) {
|
|
708
|
+
console.log(
|
|
709
|
+
`${TAG} Auto-retry attempt ${attempt}/${maxAttempts} for run ${originalRunId} ` +
|
|
710
|
+
`— cooling down for ${Math.round(cooldownMs / 1000)}s before retry`,
|
|
711
|
+
);
|
|
712
|
+
this.emit("run:retry:cooldown", {
|
|
713
|
+
originalRunId,
|
|
714
|
+
workflowId,
|
|
715
|
+
attempt,
|
|
716
|
+
cooldownMs,
|
|
717
|
+
});
|
|
718
|
+
await new Promise((r) => setTimeout(r, cooldownMs));
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
console.log(
|
|
722
|
+
`${TAG} Auto-retry attempt ${attempt}/${maxAttempts} for run ${originalRunId} (mode=${mode})`,
|
|
723
|
+
);
|
|
724
|
+
|
|
725
|
+
try {
|
|
726
|
+
const { ctx, retryRunId } = await this.retryRun(originalRunId, {
|
|
727
|
+
mode,
|
|
728
|
+
_isRetry: true,
|
|
729
|
+
_attempt: attempt,
|
|
730
|
+
});
|
|
731
|
+
|
|
732
|
+
if (!ctx.errors || ctx.errors.length === 0) {
|
|
733
|
+
console.log(
|
|
734
|
+
`${TAG} Auto-retry succeeded on attempt ${attempt}/${maxAttempts} ` +
|
|
735
|
+
`for run ${originalRunId} → new run ${retryRunId}`,
|
|
736
|
+
);
|
|
737
|
+
this.emit("run:retry:success", {
|
|
738
|
+
originalRunId,
|
|
739
|
+
retryRunId,
|
|
740
|
+
workflowId,
|
|
741
|
+
attempt,
|
|
742
|
+
});
|
|
743
|
+
return; // Success — stop retrying
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
console.warn(
|
|
747
|
+
`${TAG} Auto-retry attempt ${attempt}/${maxAttempts} failed ` +
|
|
748
|
+
`for run ${originalRunId} → new run ${retryRunId}`,
|
|
749
|
+
);
|
|
750
|
+
this.emit("run:retry:failed", {
|
|
751
|
+
originalRunId,
|
|
752
|
+
retryRunId,
|
|
753
|
+
workflowId,
|
|
754
|
+
attempt,
|
|
755
|
+
errors: ctx.errors,
|
|
756
|
+
});
|
|
757
|
+
} catch (err) {
|
|
758
|
+
console.error(
|
|
759
|
+
`${TAG} Auto-retry attempt ${attempt}/${maxAttempts} threw for run ${originalRunId}:`,
|
|
760
|
+
err.message,
|
|
761
|
+
);
|
|
762
|
+
this.emit("run:retry:failed", {
|
|
763
|
+
originalRunId,
|
|
764
|
+
workflowId,
|
|
765
|
+
attempt,
|
|
766
|
+
errors: [{ error: err.message }],
|
|
767
|
+
});
|
|
768
|
+
}
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
console.error(
|
|
772
|
+
`${TAG} All ${maxAttempts} auto-retry attempts exhausted for run ${originalRunId}`,
|
|
773
|
+
);
|
|
774
|
+
this.emit("run:retry:exhausted", { originalRunId, workflowId, maxAttempts });
|
|
775
|
+
}
|
|
776
|
+
|
|
499
777
|
/**
|
|
500
778
|
* Evaluate trigger conditions to see if a workflow should fire.
|
|
501
779
|
* Called by the supervisor loop or event bus.
|
|
@@ -665,6 +943,16 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
665
943
|
const queue = [...entryNodes.map((n) => n.id)];
|
|
666
944
|
const nodeMap = new Map((def.nodes || []).map((n) => [n.id, n]));
|
|
667
945
|
|
|
946
|
+
// ── Resume support (retry from_failed) ──────────────────────────────
|
|
947
|
+
// If nodes are already marked COMPLETED in the context (pre-seeded by
|
|
948
|
+
// retryRun), treat them as already executed so the DAG skips them and
|
|
949
|
+
// begins from the first un-completed node.
|
|
950
|
+
for (const [nodeId, status] of ctx.nodeStatuses) {
|
|
951
|
+
if (status === NodeStatus.COMPLETED) {
|
|
952
|
+
executed.add(nodeId);
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
|
|
668
956
|
// Track in-degree for proper scheduling
|
|
669
957
|
const inDegree = new Map();
|
|
670
958
|
for (const node of def.nodes || []) {
|
|
@@ -674,8 +962,32 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
674
962
|
inDegree.set(edge.target, (inDegree.get(edge.target) || 0) + 1);
|
|
675
963
|
}
|
|
676
964
|
|
|
677
|
-
//
|
|
678
|
-
|
|
965
|
+
// ── Adjust in-degree for pre-completed nodes (retry resume) ─────────
|
|
966
|
+
// When resuming from a failed step, pre-completed source nodes have
|
|
967
|
+
// already satisfied their downstream edges. Decrement the in-degree for
|
|
968
|
+
// each target so successors become ready once all live deps are met.
|
|
969
|
+
for (const nodeId of executed) {
|
|
970
|
+
const edges = adjacency.get(nodeId) || [];
|
|
971
|
+
for (const edge of edges) {
|
|
972
|
+
const deg = (inDegree.get(edge.target) || 1) - 1;
|
|
973
|
+
inDegree.set(edge.target, Math.max(0, deg));
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
// Ready set = entry nodes (or nodes with no remaining unsatisfied deps)
|
|
978
|
+
const ready = new Set();
|
|
979
|
+
for (const nid of queue) {
|
|
980
|
+
if (!executed.has(nid)) {
|
|
981
|
+
ready.add(nid);
|
|
982
|
+
}
|
|
983
|
+
}
|
|
984
|
+
// Also add any non-entry nodes whose in-degree is now 0 due to pre-
|
|
985
|
+
// completed predecessors (this makes "from_failed" resume work).
|
|
986
|
+
for (const [nid, deg] of inDegree) {
|
|
987
|
+
if (deg <= 0 && !executed.has(nid) && !ready.has(nid)) {
|
|
988
|
+
ready.add(nid);
|
|
989
|
+
}
|
|
990
|
+
}
|
|
679
991
|
|
|
680
992
|
while (ready.size > 0) {
|
|
681
993
|
// Execute ready nodes in bounded parallel batches.
|
|
@@ -729,10 +1041,16 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
729
1041
|
ctx.setNodeStatus(nodeId, NodeStatus.COMPLETED);
|
|
730
1042
|
executed.add(nodeId);
|
|
731
1043
|
this.emit("node:complete", { nodeId, type: node.type });
|
|
1044
|
+
|
|
1045
|
+
// Checkpoint progress to disk (debounced) so the run can
|
|
1046
|
+
// be resumed from here if the process is interrupted.
|
|
1047
|
+
this._checkpointRun(ctx);
|
|
1048
|
+
|
|
732
1049
|
lastErr = null;
|
|
733
1050
|
return { nodeId, result };
|
|
734
1051
|
} catch (err) {
|
|
735
1052
|
lastErr = err;
|
|
1053
|
+
if (err.retryable === false) break; // permanent error — skip remaining retry attempts
|
|
736
1054
|
}
|
|
737
1055
|
}
|
|
738
1056
|
|
|
@@ -876,12 +1194,32 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
876
1194
|
// Resolve config templates against context
|
|
877
1195
|
const resolvedConfig = this._resolveConfig(node.config || {}, ctx);
|
|
878
1196
|
|
|
879
|
-
// Dry run —
|
|
1197
|
+
// Dry run — skip capability checks and handler execution.
|
|
1198
|
+
// Services aren't needed for simulation; this keeps dry-run tests fast.
|
|
880
1199
|
if (opts.dryRun) {
|
|
881
1200
|
ctx.log(node.id, `[dry-run] Would execute ${node.type}`, "info");
|
|
882
1201
|
return { _dryRun: true, type: node.type, config: resolvedConfig };
|
|
883
1202
|
}
|
|
884
1203
|
|
|
1204
|
+
// ── Capability pre-flight check ──────────────────────────────────────
|
|
1205
|
+
// Verify required services are present AFTER the dryRun early-return so
|
|
1206
|
+
// dry-run tests work without needing real service dependencies wired up.
|
|
1207
|
+
const requiredCapabilities = this._getNodeRequiredCapabilities(node.type);
|
|
1208
|
+
const missingCapabilities = [];
|
|
1209
|
+
for (const cap of requiredCapabilities) {
|
|
1210
|
+
if (!this._hasCapability(cap)) {
|
|
1211
|
+
missingCapabilities.push(cap);
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
1214
|
+
if (missingCapabilities.length > 0) {
|
|
1215
|
+
const detail = `Node "${node.label || node.id}" (${node.type}) requires capabilities: [${missingCapabilities.join(", ")}] which are not available. ` +
|
|
1216
|
+
`Check that the required services (agent pool, kanban adapter, etc.) are configured and the agent has the necessary permissions.`;
|
|
1217
|
+
ctx.log(node.id, detail, "error");
|
|
1218
|
+
const capErr = new Error(detail);
|
|
1219
|
+
capErr.retryable = false; // missing service is permanent — don't waste time retrying
|
|
1220
|
+
throw capErr;
|
|
1221
|
+
}
|
|
1222
|
+
|
|
885
1223
|
// Execute with timeout — clear timer on completion to avoid resource leaks
|
|
886
1224
|
const timeout = resolveNodeTimeoutMs(node, resolvedConfig);
|
|
887
1225
|
let timer;
|
|
@@ -916,6 +1254,48 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
916
1254
|
return resolved;
|
|
917
1255
|
}
|
|
918
1256
|
|
|
1257
|
+
// ── Capability helpers ──────────────────────────────────────────────────
|
|
1258
|
+
// Map node-type prefixes / names to the engine.services keys they need.
|
|
1259
|
+
// This lets _executeNode fail-fast with a clear message instead of letting
|
|
1260
|
+
// the handler throw a cryptic "cannot read property X of undefined".
|
|
1261
|
+
|
|
1262
|
+
/** @returns {string[]} service keys the node type needs (may be empty) */
|
|
1263
|
+
_getNodeRequiredCapabilities(nodeType) {
|
|
1264
|
+
// Agent nodes need the agentPool service
|
|
1265
|
+
if (nodeType.startsWith("agent.") || nodeType === "action.run_agent") {
|
|
1266
|
+
return ["agentPool"];
|
|
1267
|
+
}
|
|
1268
|
+
// Session continuation / restart also need agentPool
|
|
1269
|
+
if (nodeType === "action.continue_session" || nodeType === "action.restart_agent") {
|
|
1270
|
+
return ["agentPool"];
|
|
1271
|
+
}
|
|
1272
|
+
// Task-management nodes need kanban
|
|
1273
|
+
if (
|
|
1274
|
+
nodeType === "action.create_task" ||
|
|
1275
|
+
nodeType === "action.update_task_status" ||
|
|
1276
|
+
nodeType === "action.materialize_planner_tasks"
|
|
1277
|
+
) {
|
|
1278
|
+
return ["kanban"];
|
|
1279
|
+
}
|
|
1280
|
+
// Telegram notification
|
|
1281
|
+
if (nodeType === "notify.telegram") {
|
|
1282
|
+
return ["telegram"];
|
|
1283
|
+
}
|
|
1284
|
+
// condition.task_has_tag reads from kanban
|
|
1285
|
+
if (nodeType === "condition.task_has_tag") {
|
|
1286
|
+
return ["kanban"];
|
|
1287
|
+
}
|
|
1288
|
+
// No special service required (file I/O, git, transforms, logs, etc.)
|
|
1289
|
+
return [];
|
|
1290
|
+
}
|
|
1291
|
+
|
|
1292
|
+
/** Check whether a named capability (service key) is available */
|
|
1293
|
+
_hasCapability(cap) {
|
|
1294
|
+
const svc = this.services?.[cap];
|
|
1295
|
+
// A capability is "present" when its value is a non-null object or function.
|
|
1296
|
+
return svc != null && (typeof svc === "object" || typeof svc === "function");
|
|
1297
|
+
}
|
|
1298
|
+
|
|
919
1299
|
_evaluateCondition(condition, ctx, sourceNodeId) {
|
|
920
1300
|
// Simple expression evaluator — supports basic comparisons
|
|
921
1301
|
// Variables: $output (source node output), $data (context data), $status
|
|
@@ -1088,6 +1468,280 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
1088
1468
|
return normalized;
|
|
1089
1469
|
}
|
|
1090
1470
|
|
|
1471
|
+
// ── Active-runs persistence (crash recovery) ─────────────────────────
|
|
1472
|
+
|
|
1473
|
+
/**
|
|
1474
|
+
* Read the active-runs index (_active-runs.json).
|
|
1475
|
+
* Returns an array of { runId, workflowId, workflowName, startedAt }.
|
|
1476
|
+
*/
|
|
1477
|
+
_readActiveRunsIndex() {
|
|
1478
|
+
try {
|
|
1479
|
+
const p = resolve(this.runsDir, ACTIVE_RUNS_INDEX);
|
|
1480
|
+
if (!existsSync(p)) return [];
|
|
1481
|
+
const raw = JSON.parse(readFileSync(p, "utf8"));
|
|
1482
|
+
return Array.isArray(raw) ? raw : [];
|
|
1483
|
+
} catch {
|
|
1484
|
+
return [];
|
|
1485
|
+
}
|
|
1486
|
+
}
|
|
1487
|
+
|
|
1488
|
+
/** Write the active-runs index atomically. */
|
|
1489
|
+
_writeActiveRunsIndex(entries) {
|
|
1490
|
+
try {
|
|
1491
|
+
this._ensureDirs();
|
|
1492
|
+
const p = resolve(this.runsDir, ACTIVE_RUNS_INDEX);
|
|
1493
|
+
writeFileSync(p, JSON.stringify(entries, null, 2), "utf8");
|
|
1494
|
+
} catch (err) {
|
|
1495
|
+
console.error(`${TAG} Failed to write active-runs index:`, err.message);
|
|
1496
|
+
}
|
|
1497
|
+
}
|
|
1498
|
+
|
|
1499
|
+
/**
|
|
1500
|
+
* Persist a run to the active-runs index AND write an initial detail file.
|
|
1501
|
+
* Called at the very start of execute() / retryRun() so the run is on disk
|
|
1502
|
+
* before any node executes.
|
|
1503
|
+
*/
|
|
1504
|
+
_persistActiveRunState(runId, workflowId, workflowName, ctx) {
|
|
1505
|
+
try {
|
|
1506
|
+
this._ensureDirs();
|
|
1507
|
+
|
|
1508
|
+
// Add to active-runs index
|
|
1509
|
+
const entries = this._readActiveRunsIndex().filter((e) => e.runId !== runId);
|
|
1510
|
+
entries.push({ runId, workflowId, workflowName, startedAt: ctx.startedAt });
|
|
1511
|
+
this._writeActiveRunsIndex(entries);
|
|
1512
|
+
|
|
1513
|
+
// Write initial detail file so we can resume from it
|
|
1514
|
+
const detail = this._serializeRunContext(ctx, true);
|
|
1515
|
+
const detailPath = resolve(this.runsDir, `${runId}.json`);
|
|
1516
|
+
writeFileSync(detailPath, JSON.stringify(detail, null, 2), "utf8");
|
|
1517
|
+
|
|
1518
|
+
// Also ensure the run appears in the main index (with RUNNING status)
|
|
1519
|
+
// so that getRunDetail() can find it even before completion.
|
|
1520
|
+
this._ensureRunInIndex(runId, workflowId, workflowName, detail);
|
|
1521
|
+
} catch (err) {
|
|
1522
|
+
console.error(`${TAG} Failed to persist active run state:`, err.message);
|
|
1523
|
+
}
|
|
1524
|
+
}
|
|
1525
|
+
|
|
1526
|
+
/**
|
|
1527
|
+
* Debounced checkpoint — writes the current run context to disk after each
|
|
1528
|
+
* node completes. Debounced at CHECKPOINT_DEBOUNCE_MS to avoid disk
|
|
1529
|
+
* thrashing when many nodes finish in quick succession.
|
|
1530
|
+
*/
|
|
1531
|
+
_checkpointRun(ctx) {
|
|
1532
|
+
const runId = ctx.id;
|
|
1533
|
+
// Clear any pending timer for this run
|
|
1534
|
+
const existing = this._checkpointTimers.get(runId);
|
|
1535
|
+
if (existing) clearTimeout(existing);
|
|
1536
|
+
|
|
1537
|
+
const timer = setTimeout(() => {
|
|
1538
|
+
this._checkpointTimers.delete(runId);
|
|
1539
|
+
try {
|
|
1540
|
+
this._ensureDirs();
|
|
1541
|
+
const detail = this._serializeRunContext(ctx, true);
|
|
1542
|
+
const detailPath = resolve(this.runsDir, `${runId}.json`);
|
|
1543
|
+
writeFileSync(detailPath, JSON.stringify(detail, null, 2), "utf8");
|
|
1544
|
+
} catch (err) {
|
|
1545
|
+
console.error(`${TAG} Checkpoint failed for run ${runId}:`, err.message);
|
|
1546
|
+
}
|
|
1547
|
+
}, CHECKPOINT_DEBOUNCE_MS);
|
|
1548
|
+
|
|
1549
|
+
// Don't let the timer prevent clean process exit
|
|
1550
|
+
if (timer.unref) timer.unref();
|
|
1551
|
+
this._checkpointTimers.set(runId, timer);
|
|
1552
|
+
}
|
|
1553
|
+
|
|
1554
|
+
/**
|
|
1555
|
+
* Remove a run from the active-runs index and clear its checkpoint timer.
|
|
1556
|
+
* Called after a run completes (success or failure) so it won't be
|
|
1557
|
+
* mistakenly resumed on next boot.
|
|
1558
|
+
*/
|
|
1559
|
+
_clearActiveRunState(runId) {
|
|
1560
|
+
try {
|
|
1561
|
+
// Clear debounce timer
|
|
1562
|
+
const timer = this._checkpointTimers.get(runId);
|
|
1563
|
+
if (timer) {
|
|
1564
|
+
clearTimeout(timer);
|
|
1565
|
+
this._checkpointTimers.delete(runId);
|
|
1566
|
+
}
|
|
1567
|
+
// Remove from active-runs index
|
|
1568
|
+
const entries = this._readActiveRunsIndex().filter((e) => e.runId !== runId);
|
|
1569
|
+
this._writeActiveRunsIndex(entries);
|
|
1570
|
+
} catch (err) {
|
|
1571
|
+
console.error(`${TAG} Failed to clear active run state:`, err.message);
|
|
1572
|
+
}
|
|
1573
|
+
}
|
|
1574
|
+
|
|
1575
|
+
/**
|
|
1576
|
+
* Ensure a run entry exists in the main runs index (index.json).
|
|
1577
|
+
* Deduplicates by runId — if the run already exists, updates it in place.
|
|
1578
|
+
*/
|
|
1579
|
+
_ensureRunInIndex(runId, workflowId, workflowName, detail) {
|
|
1580
|
+
try {
|
|
1581
|
+
const indexPath = resolve(this.runsDir, "index.json");
|
|
1582
|
+
const runs = this._readRunIndex();
|
|
1583
|
+
const existingIdx = runs.findIndex((r) => r.runId === runId);
|
|
1584
|
+
|
|
1585
|
+
const summary = this._buildSummaryFromDetail({
|
|
1586
|
+
runId,
|
|
1587
|
+
workflowId,
|
|
1588
|
+
workflowName,
|
|
1589
|
+
status: WorkflowStatus.RUNNING,
|
|
1590
|
+
detail,
|
|
1591
|
+
});
|
|
1592
|
+
|
|
1593
|
+
if (existingIdx >= 0) {
|
|
1594
|
+
runs[existingIdx] = summary;
|
|
1595
|
+
} else {
|
|
1596
|
+
runs.push(summary);
|
|
1597
|
+
}
|
|
1598
|
+
if (runs.length > MAX_PERSISTED_RUNS) runs.splice(0, runs.length - MAX_PERSISTED_RUNS);
|
|
1599
|
+
writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
|
|
1600
|
+
} catch (err) {
|
|
1601
|
+
console.error(`${TAG} Failed to ensure run in index:`, err.message);
|
|
1602
|
+
}
|
|
1603
|
+
}
|
|
1604
|
+
|
|
1605
|
+
/**
|
|
1606
|
+
* Detect runs that were interrupted by a previous shutdown.
|
|
1607
|
+
* Scans the _active-runs.json index for entries that are NOT in our
|
|
1608
|
+
* in-memory _activeRuns map (which is empty on fresh boot). Marks them
|
|
1609
|
+
* as PAUSED in the main index and clears the active-runs index.
|
|
1610
|
+
*/
|
|
1611
|
+
_detectInterruptedRuns() {
|
|
1612
|
+
try {
|
|
1613
|
+
const activeEntries = this._readActiveRunsIndex();
|
|
1614
|
+
if (!activeEntries.length) return;
|
|
1615
|
+
|
|
1616
|
+
const interrupted = [];
|
|
1617
|
+
for (const entry of activeEntries) {
|
|
1618
|
+
// If it's somehow still in _activeRuns, skip it (not interrupted)
|
|
1619
|
+
if (this._activeRuns.has(entry.runId)) continue;
|
|
1620
|
+
|
|
1621
|
+
// Mark this run as PAUSED in the main index
|
|
1622
|
+
const indexPath = resolve(this.runsDir, "index.json");
|
|
1623
|
+
const runs = this._readRunIndex();
|
|
1624
|
+
const idx = runs.findIndex((r) => r.runId === entry.runId);
|
|
1625
|
+
if (idx >= 0) {
|
|
1626
|
+
runs[idx].status = WorkflowStatus.PAUSED;
|
|
1627
|
+
runs[idx].resumable = true;
|
|
1628
|
+
runs[idx].interruptedAt = Date.now();
|
|
1629
|
+
writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
|
|
1630
|
+
}
|
|
1631
|
+
interrupted.push(entry);
|
|
1632
|
+
}
|
|
1633
|
+
|
|
1634
|
+
// Clear the active-runs index — we've handled them
|
|
1635
|
+
this._writeActiveRunsIndex([]);
|
|
1636
|
+
|
|
1637
|
+
if (interrupted.length > 0) {
|
|
1638
|
+
console.log(
|
|
1639
|
+
`${TAG} Detected ${interrupted.length} interrupted run(s): ${interrupted.map((e) => e.runId).join(", ")}`,
|
|
1640
|
+
);
|
|
1641
|
+
this.emit("runs:interrupted", { runs: interrupted });
|
|
1642
|
+
}
|
|
1643
|
+
} catch (err) {
|
|
1644
|
+
console.error(`${TAG} Failed to detect interrupted runs:`, err.message);
|
|
1645
|
+
}
|
|
1646
|
+
}
|
|
1647
|
+
|
|
1648
|
+
/**
|
|
1649
|
+
* Resume all interrupted (PAUSED + resumable) runs.
|
|
1650
|
+
* Should be called AFTER services are wired up (e.g. after workflow
|
|
1651
|
+
* engine is fully initialized with node executors).
|
|
1652
|
+
*/
|
|
1653
|
+
async resumeInterruptedRuns() {
|
|
1654
|
+
if (this._resumingRuns) return;
|
|
1655
|
+
this._resumingRuns = true;
|
|
1656
|
+
|
|
1657
|
+
try {
|
|
1658
|
+
const runs = this._readRunIndex().filter(
|
|
1659
|
+
(r) => r.status === WorkflowStatus.PAUSED && r.resumable,
|
|
1660
|
+
);
|
|
1661
|
+
|
|
1662
|
+
if (!runs.length) {
|
|
1663
|
+
this._resumingRuns = false;
|
|
1664
|
+
return;
|
|
1665
|
+
}
|
|
1666
|
+
|
|
1667
|
+
console.log(`${TAG} Resuming ${runs.length} interrupted run(s)...`);
|
|
1668
|
+
|
|
1669
|
+
for (const run of runs) {
|
|
1670
|
+
try {
|
|
1671
|
+
// Check if the workflow definition still exists
|
|
1672
|
+
const def = this.get(run.workflowId);
|
|
1673
|
+
if (!def) {
|
|
1674
|
+
console.warn(`${TAG} Cannot resume run ${run.runId}: workflow "${run.workflowId}" no longer exists`);
|
|
1675
|
+
this._markRunUnresumable(run.runId, "workflow_deleted");
|
|
1676
|
+
continue;
|
|
1677
|
+
}
|
|
1678
|
+
|
|
1679
|
+
// Load the persisted detail file to get the context state
|
|
1680
|
+
const detailPath = resolve(this.runsDir, `${run.runId}.json`);
|
|
1681
|
+
if (!existsSync(detailPath)) {
|
|
1682
|
+
console.warn(`${TAG} Cannot resume run ${run.runId}: no detail file found`);
|
|
1683
|
+
this._markRunUnresumable(run.runId, "no_detail_file");
|
|
1684
|
+
continue;
|
|
1685
|
+
}
|
|
1686
|
+
|
|
1687
|
+
const detail = JSON.parse(readFileSync(detailPath, "utf8"));
|
|
1688
|
+
const nodeStatuses = detail.nodeStatuses || {};
|
|
1689
|
+
const hasCompletedNodes = Object.values(nodeStatuses).some(
|
|
1690
|
+
(s) => s === NodeStatus.COMPLETED,
|
|
1691
|
+
);
|
|
1692
|
+
|
|
1693
|
+
if (hasCompletedNodes) {
|
|
1694
|
+
// Resume from where it left off using retryRun("from_failed")
|
|
1695
|
+
console.log(`${TAG} Resuming run ${run.runId} from failed/interrupted node...`);
|
|
1696
|
+
await this.retryRun(run.runId, { mode: "from_failed" }).catch((err) => {
|
|
1697
|
+
console.error(`${TAG} Failed to resume run ${run.runId}:`, err.message);
|
|
1698
|
+
this._markRunUnresumable(run.runId, `retry_error: ${err.message}`);
|
|
1699
|
+
});
|
|
1700
|
+
} else {
|
|
1701
|
+
// No nodes completed — re-run from scratch
|
|
1702
|
+
console.log(`${TAG} Re-executing run ${run.runId} from scratch...`);
|
|
1703
|
+
const originalData = detail.inputData || detail.data || {};
|
|
1704
|
+
// Clean up internal metadata from data before re-executing
|
|
1705
|
+
const { _workflowId, _workflowName, _retryOf, ...cleanData } = originalData;
|
|
1706
|
+
await this.execute(run.workflowId, cleanData, { force: true }).catch((err) => {
|
|
1707
|
+
console.error(`${TAG} Failed to re-execute run ${run.runId}:`, err.message);
|
|
1708
|
+
this._markRunUnresumable(run.runId, `execute_error: ${err.message}`);
|
|
1709
|
+
});
|
|
1710
|
+
}
|
|
1711
|
+
|
|
1712
|
+
// Mark the original interrupted run as no longer resumable
|
|
1713
|
+
// (the retry/re-execute created a new run)
|
|
1714
|
+
this._markRunUnresumable(run.runId, "resumed");
|
|
1715
|
+
} catch (err) {
|
|
1716
|
+
console.error(`${TAG} Error resuming run ${run.runId}:`, err.message);
|
|
1717
|
+
this._markRunUnresumable(run.runId, `error: ${err.message}`);
|
|
1718
|
+
}
|
|
1719
|
+
}
|
|
1720
|
+
} finally {
|
|
1721
|
+
this._resumingRuns = false;
|
|
1722
|
+
}
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1725
|
+
/**
|
|
1726
|
+
* Mark a run as no longer resumable in the main index.
|
|
1727
|
+
*/
|
|
1728
|
+
_markRunUnresumable(runId, reason) {
|
|
1729
|
+
try {
|
|
1730
|
+
const indexPath = resolve(this.runsDir, "index.json");
|
|
1731
|
+
const runs = this._readRunIndex();
|
|
1732
|
+
const idx = runs.findIndex((r) => r.runId === runId);
|
|
1733
|
+
if (idx >= 0) {
|
|
1734
|
+
runs[idx].resumable = false;
|
|
1735
|
+
runs[idx].resumeResult = reason;
|
|
1736
|
+
writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
|
|
1737
|
+
}
|
|
1738
|
+
} catch (err) {
|
|
1739
|
+
console.error(`${TAG} Failed to mark run unresumable:`, err.message);
|
|
1740
|
+
}
|
|
1741
|
+
}
|
|
1742
|
+
|
|
1743
|
+
// ── Persist completed run ─────────────────────────────────────────────
|
|
1744
|
+
|
|
1091
1745
|
_persistRun(runId, workflowId, ctx) {
|
|
1092
1746
|
try {
|
|
1093
1747
|
this._ensureDirs();
|
|
@@ -1101,14 +1755,13 @@ export class WorkflowEngine extends EventEmitter {
|
|
|
1101
1755
|
detail,
|
|
1102
1756
|
});
|
|
1103
1757
|
|
|
1104
|
-
//
|
|
1758
|
+
// Deduplicate: remove any existing entry for this runId before appending
|
|
1105
1759
|
const indexPath = resolve(this.runsDir, "index.json");
|
|
1106
|
-
let
|
|
1107
|
-
|
|
1108
|
-
index.runs.push(summary);
|
|
1760
|
+
let runs = this._readRunIndex().filter((r) => r.runId !== runId);
|
|
1761
|
+
runs.push(summary);
|
|
1109
1762
|
// Keep last N runs
|
|
1110
|
-
if (
|
|
1111
|
-
writeFileSync(indexPath, JSON.stringify(
|
|
1763
|
+
if (runs.length > MAX_PERSISTED_RUNS) runs = runs.slice(-MAX_PERSISTED_RUNS);
|
|
1764
|
+
writeFileSync(indexPath, JSON.stringify({ runs }, null, 2), "utf8");
|
|
1112
1765
|
|
|
1113
1766
|
// Save full run detail
|
|
1114
1767
|
const detailPath = resolve(this.runsDir, `${runId}.json`);
|
|
@@ -1147,3 +1800,4 @@ export function deleteWorkflow(id, opts) { return getWorkflowEngine(opts).delete
|
|
|
1147
1800
|
/**
 * Convenience wrapper: call `list()` on the engine returned by
 * `getWorkflowEngine(opts)`.
 *
 * @param {object} [opts] - Passed through unchanged to `getWorkflowEngine`.
 * @returns {*} Whatever the engine's `list()` returns.
 */
export function listWorkflows(opts) {
  const engine = getWorkflowEngine(opts);
  return engine.list();
}
|
|
1148
1801
|
/**
 * Convenience wrapper: call `get(id)` on the engine returned by
 * `getWorkflowEngine(opts)`.
 *
 * @param {string} id - Workflow identifier forwarded to the engine's `get`.
 * @param {object} [opts] - Passed through unchanged to `getWorkflowEngine`.
 * @returns {*} Whatever the engine's `get(id)` returns.
 */
export function getWorkflow(id, opts) {
  const engine = getWorkflowEngine(opts);
  return engine.get(id);
}
|
|
1149
1802
|
/**
 * Convenience wrapper: call `execute(id, data, opts)` on the engine returned
 * by `getWorkflowEngine(opts)`. The same `opts` object is used both to obtain
 * the engine and as the third argument to `execute`.
 *
 * @param {string} id - Workflow identifier forwarded to the engine.
 * @param {*} data - Input data forwarded to the engine.
 * @param {object} [opts] - Passed to `getWorkflowEngine` and to `execute`.
 * @returns {Promise<*>} Resolves with whatever the engine's `execute` yields.
 */
export async function executeWorkflow(id, data, opts) {
  const engine = getWorkflowEngine(opts);
  return engine.execute(id, data, opts);
}
|
|
1803
|
+
/**
 * Convenience wrapper: call `retryRun(runId, retryOpts)` on the engine
 * returned by `getWorkflowEngine(engineOpts)`.
 *
 * @param {string} runId - Identifier of the run to retry.
 * @param {object} [retryOpts] - Forwarded unchanged to the engine's `retryRun`.
 * @param {object} [engineOpts] - Passed through unchanged to `getWorkflowEngine`.
 * @returns {Promise<*>} Resolves with whatever the engine's `retryRun` yields.
 */
export async function retryWorkflowRun(runId, retryOpts, engineOpts) {
  const engine = getWorkflowEngine(engineOpts);
  return engine.retryRun(runId, retryOpts);
}
|