@smithers-orchestrator/scheduler 0.22.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -3
- package/src/makeWorkflowSession.js +137 -47
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@smithers-orchestrator/scheduler",
|
|
3
|
-
"version": "0.22.0",
|
|
3
|
+
"version": "0.24.0",
|
|
4
4
|
"description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -176,8 +176,8 @@
|
|
|
176
176
|
],
|
|
177
177
|
"dependencies": {
|
|
178
178
|
"effect": "^3.21.1",
|
|
179
|
-
"@smithers-orchestrator/
|
|
180
|
-
"@smithers-orchestrator/
|
|
179
|
+
"@smithers-orchestrator/errors": "0.24.0",
|
|
180
|
+
"@smithers-orchestrator/graph": "0.24.0"
|
|
181
181
|
},
|
|
182
182
|
"devDependencies": {
|
|
183
183
|
"@types/bun": "latest",
|
|
package/src/makeWorkflowSession.js
CHANGED
|
@@ -4,6 +4,7 @@ import { toSmithersError } from "@smithers-orchestrator/errors/toSmithersError";
|
|
|
4
4
|
import { buildPlanTree } from "./buildPlanTree.js";
|
|
5
5
|
import { buildStateKey } from "./buildStateKey.js";
|
|
6
6
|
import { cloneTaskStateMap } from "./cloneTaskStateMap.js";
|
|
7
|
+
import { computeRetryDelayMs } from "./computeRetryDelayMs.js";
|
|
7
8
|
import { parseStateKey } from "./parseStateKey.js";
|
|
8
9
|
import { scheduleTasks } from "./scheduleTasks.js";
|
|
9
10
|
/** @typedef {import("./ApprovalResolution.ts").ApprovalResolution} ApprovalResolution */
|
|
@@ -149,27 +150,6 @@ function parseDurationMs(value) {
|
|
|
149
150
|
return amount;
|
|
150
151
|
}
|
|
151
152
|
}
|
|
152
|
-
/**
|
|
153
|
-
* @param {TaskDescriptor} descriptor
|
|
154
|
-
* @param {number} failureCount
|
|
155
|
-
* @returns {number}
|
|
156
|
-
*/
|
|
157
|
-
function retryDelayMs(descriptor, failureCount) {
|
|
158
|
-
const policy = descriptor.retryPolicy;
|
|
159
|
-
if (!policy)
|
|
160
|
-
return 0;
|
|
161
|
-
const initial = policy.initialDelayMs ?? 0;
|
|
162
|
-
if (policy.backoff === "exponential") {
|
|
163
|
-
const multiplier = policy.multiplier ?? 2;
|
|
164
|
-
const computed = initial * Math.pow(multiplier, Math.max(0, failureCount - 1));
|
|
165
|
-
return Math.min(policy.maxDelayMs ?? computed, computed);
|
|
166
|
-
}
|
|
167
|
-
if (policy.backoff === "linear") {
|
|
168
|
-
const computed = initial * Math.max(1, failureCount);
|
|
169
|
-
return Math.min(policy.maxDelayMs ?? computed, computed);
|
|
170
|
-
}
|
|
171
|
-
return initial;
|
|
172
|
-
}
|
|
173
153
|
/**
|
|
174
154
|
* @param {TaskDescriptor} descriptor
|
|
175
155
|
* @param {unknown} error
|
|
@@ -199,6 +179,67 @@ function isRetryableFailure(descriptor, error) {
|
|
|
199
179
|
}
|
|
200
180
|
return true;
|
|
201
181
|
}
|
|
182
|
+
/**
|
|
183
|
+
* @param {unknown} error
|
|
184
|
+
* @returns {boolean}
|
|
185
|
+
*/
|
|
186
|
+
function isTransientSessionFailure(error) {
|
|
187
|
+
const normalized = toSmithersError(error);
|
|
188
|
+
const code = error && typeof error === "object" && typeof error.code === "string"
|
|
189
|
+
? error.code
|
|
190
|
+
: normalized.code;
|
|
191
|
+
return code === "SESSION_ERROR" ||
|
|
192
|
+
code === "TASK_TIMEOUT" ||
|
|
193
|
+
code === "TASK_HEARTBEAT_TIMEOUT" ||
|
|
194
|
+
code === "TASK_ABORTED" ||
|
|
195
|
+
normalized.details?.failureRetryable === true;
|
|
196
|
+
}
|
|
197
|
+
/**
|
|
198
|
+
* Build a human-readable diagnostic for a dependency deadlock: pending tasks
|
|
199
|
+
* that can never run because their `dependsOn` edges point at tasks missing from
|
|
200
|
+
* the graph or themselves permanently blocked. The most common cause is a
|
|
201
|
+
* `deps`/`needs` mismatch — a `deps={{ key: ... }}` whose key is not the upstream
|
|
202
|
+
* task's id and was not remapped with `needs={{ key: '<id>' }}`, which the Task
|
|
203
|
+
* component (deriveDepNodeIds) turns into a dependency on a non-existent node id.
|
|
204
|
+
* @param {SessionState} state
|
|
205
|
+
* @returns {string}
|
|
206
|
+
*/
|
|
207
|
+
function describeDeadlock(state) {
|
|
208
|
+
const blocked = [];
|
|
209
|
+
let sawMissing = false;
|
|
210
|
+
for (const descriptor of state.descriptors.values()) {
|
|
211
|
+
const taskState = state.states.get(stateKeyFor(descriptor)) ?? "pending";
|
|
212
|
+
if (taskState !== "pending" && taskState !== "cancelled")
|
|
213
|
+
continue;
|
|
214
|
+
const unmet = [];
|
|
215
|
+
for (const depId of descriptor.dependsOn ?? []) {
|
|
216
|
+
const dep = state.descriptors.get(depId);
|
|
217
|
+
if (!dep) {
|
|
218
|
+
sawMissing = true;
|
|
219
|
+
unmet.push(`'${depId}' (no such task)`);
|
|
220
|
+
}
|
|
221
|
+
else {
|
|
222
|
+
const depState = state.states.get(stateKeyFor(dep)) ?? "pending";
|
|
223
|
+
unmet.push(`'${depId}' (${depState})`);
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
if (unmet.length > 0) {
|
|
227
|
+
blocked.push(` - '${descriptor.nodeId}' is blocked on ${unmet.join(", ")}`);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
const lines = [
|
|
231
|
+
"Workflow deadlocked: no task can run, and none is waiting on an approval, event, timer, or retry.",
|
|
232
|
+
];
|
|
233
|
+
if (blocked.length > 0) {
|
|
234
|
+
lines.push("Pending tasks and their unsatisfied dependencies:", ...blocked);
|
|
235
|
+
}
|
|
236
|
+
if (sawMissing) {
|
|
237
|
+
lines.push("", "A dependency marked '(no such task)' references a node id that is not a mounted task. " +
|
|
238
|
+
"If it came from deps={{ <key>: ... }}, the key is treated as the upstream task's id unless you remap it: " +
|
|
239
|
+
"add needs={{ <key>: '<upstream task id>' }} (or rename the upstream task to match the key).");
|
|
240
|
+
}
|
|
241
|
+
return lines.join("\n");
|
|
242
|
+
}
|
|
202
243
|
/**
|
|
203
244
|
* @param {unknown} error
|
|
204
245
|
* @param {string} label
|
|
@@ -224,6 +265,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
224
265
|
states: new Map(),
|
|
225
266
|
outputs: new Map(),
|
|
226
267
|
failures: new Map(),
|
|
268
|
+
failureDescriptors: new Map(),
|
|
227
269
|
retryCounts: new Map(),
|
|
228
270
|
retryWait: new Map(),
|
|
229
271
|
approvals: new Set(),
|
|
@@ -231,6 +273,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
231
273
|
schedule: null,
|
|
232
274
|
cancelled: false,
|
|
233
275
|
lastMountedSignature: null,
|
|
276
|
+
lastDeadlockSignature: null,
|
|
234
277
|
};
|
|
235
278
|
/**
|
|
236
279
|
* @param {Pick<TaskOutput, "nodeId" | "iteration">} output
|
|
@@ -288,6 +331,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
288
331
|
state.retryWait.delete(key);
|
|
289
332
|
state.approvals.delete(key);
|
|
290
333
|
state.retryCounts.delete(key);
|
|
334
|
+
state.failureDescriptors.delete(key);
|
|
291
335
|
}
|
|
292
336
|
}
|
|
293
337
|
for (const ralph of ralphs) {
|
|
@@ -317,6 +361,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
317
361
|
state.states.set(key, "finished");
|
|
318
362
|
state.outputs.set(key, output);
|
|
319
363
|
state.retryWait.delete(key);
|
|
364
|
+
state.failureDescriptors.delete(key);
|
|
320
365
|
}
|
|
321
366
|
/**
|
|
322
367
|
* @param {number} [iteration]
|
|
@@ -367,7 +412,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
367
412
|
const canRetry = retryable &&
|
|
368
413
|
(descriptor.retries === Infinity || failureCount <= descriptor.retries);
|
|
369
414
|
if (canRetry) {
|
|
370
|
-
const delay = retryDelayMs(descriptor, failureCount);
|
|
415
|
+
const delay = computeRetryDelayMs(descriptor.retryPolicy, failureCount);
|
|
371
416
|
state.states.set(key, "pending");
|
|
372
417
|
if (delay > 0) {
|
|
373
418
|
state.retryWait.set(key, nowMs() + delay);
|
|
@@ -379,6 +424,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
379
424
|
}
|
|
380
425
|
state.states.set(key, "failed");
|
|
381
426
|
state.failures.set(key, error);
|
|
427
|
+
state.failureDescriptors.set(key, descriptor);
|
|
382
428
|
return decide();
|
|
383
429
|
}
|
|
384
430
|
/**
|
|
@@ -387,11 +433,15 @@ export function makeWorkflowSession(options = {}) {
|
|
|
387
433
|
function unhandledFailureDecision(recoveryKeys = new Set()) {
|
|
388
434
|
for (const [key, taskState] of state.states) {
|
|
389
435
|
const parsed = parseStateKey(key);
|
|
390
|
-
const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration);
|
|
436
|
+
const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration) ??
|
|
437
|
+
state.failureDescriptors.get(key);
|
|
391
438
|
if (taskState === "failed" && !descriptor?.continueOnFail) {
|
|
392
439
|
if (recoveryKeys.has(key)) {
|
|
393
440
|
continue;
|
|
394
441
|
}
|
|
442
|
+
if (descriptor?.agent && isTransientSessionFailure(state.failures.get(key))) {
|
|
443
|
+
continue;
|
|
444
|
+
}
|
|
395
445
|
return {
|
|
396
446
|
_tag: "Failed",
|
|
397
447
|
error: new SmithersError("SESSION_ERROR", `Task failed: ${descriptor?.nodeId ?? key}`, { key }, state.failures.get(key)),
|
|
@@ -497,26 +547,17 @@ export function makeWorkflowSession(options = {}) {
|
|
|
497
547
|
if (existingWait) {
|
|
498
548
|
return { _tag: "Wait", reason: existingWait };
|
|
499
549
|
}
|
|
500
|
-
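if (schedule.pendingExists) {
|
|
501
|
-
if (schedule.nextRetryAtMs != null) {
|
|
502
|
-
return {
|
|
503
|
-
_tag: "Wait",
|
|
504
|
-
reason: {
|
|
505
|
-
_tag: "RetryBackoff",
|
|
506
|
-
waitMs: Math.max(0, schedule.nextRetryAtMs - nowMs()),
|
|
507
|
-
},
|
|
508
|
-
};
|
|
509
|
-
}
|
|
510
|
-
return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };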
|
|
511
|
-
}
|
|
512
|
-
if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
|
|
513
|
-
return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
|
|
514
|
-
}
|
|
515
|
-
failure = unhandledFailureDecision(recoveryKeys);
|
|
516
|
-
if (failure) {
|
|
517
|
-
return failure;
|
|
518
|
-
}
|
|
519
|
-
if (schedule.readyRalphs.length > 0) {
|
|
550
|
+
if (schedule.readyRalphs.length > 0 && !unhandledFailureDecision(recoveryKeys)) {
|
|
551
|
+
// A ralph is ready only when every task in its own subtree is
|
|
552
|
+
// terminal, so pending or in-flight work elsewhere in the graph must
|
|
553
|
+
// not starve its next iteration (#267). Run-level continue-as-new
|
|
554
|
+
// handoffs stay quiescence-only: tearing down the run while sibling
|
|
555
|
+
// tasks are mid-flight is not safe, so those ralphs are deferred.
|
|
556
|
+
// An unhandled task failure keeps its precedence over further loop
|
|
557
|
+
// iterations (decide() already returns it at the top; this guard
|
|
558
|
+
// makes the ordering explicit).
|
|
559
|
+
const hasInProgress = [...state.states.values()].some((taskState) => taskState === "in-progress");
|
|
560
|
+
let advanced = false;
|
|
520
561
|
for (const ralph of schedule.readyRalphs) {
|
|
521
562
|
const current = state.ralphState.get(ralph.id) ?? {
|
|
522
563
|
iteration: 0,
|
|
@@ -524,6 +565,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
524
565
|
};
|
|
525
566
|
if (ralph.until) {
|
|
526
567
|
state.ralphState.set(ralph.id, { ...current, done: true });
|
|
568
|
+
advanced = true;
|
|
527
569
|
continue;
|
|
528
570
|
}
|
|
529
571
|
const nextIteration = current.iteration + 1;
|
|
@@ -535,13 +577,18 @@ export function makeWorkflowSession(options = {}) {
|
|
|
535
577
|
};
|
|
536
578
|
}
|
|
537
579
|
state.ralphState.set(ralph.id, { iteration: current.iteration, done: true });
|
|
580
|
+
advanced = true;
|
|
538
581
|
continue;
|
|
539
582
|
}
|
|
540
|
-
|
|
541
|
-
if (ralph.continueAsNewEvery != null &&
|
|
583
|
+
const wantsContinueAsNew = ralph.continueAsNewEvery != null &&
|
|
542
584
|
ralph.continueAsNewEvery > 0 &&
|
|
543
585
|
nextIteration > 0 &&
|
|
544
|
-
nextIteration % ralph.continueAsNewEvery === 0) {
|
|
586
|
+
nextIteration % ralph.continueAsNewEvery === 0;
|
|
587
|
+
if (wantsContinueAsNew && (hasInProgress || schedule.pendingExists)) {
|
|
588
|
+
continue;
|
|
589
|
+
}
|
|
590
|
+
state.ralphState.set(ralph.id, { iteration: nextIteration, done: false });
|
|
591
|
+
if (wantsContinueAsNew) {
|
|
545
592
|
return {
|
|
546
593
|
_tag: "ContinueAsNew",
|
|
547
594
|
transition: {
|
|
@@ -551,8 +598,51 @@ export function makeWorkflowSession(options = {}) {
|
|
|
551
598
|
},
|
|
552
599
|
};
|
|
553
600
|
}
|
|
601
|
+
advanced = true;
|
|
602
|
+
}
|
|
603
|
+
if (advanced) {
|
|
604
|
+
return { _tag: "ReRender", context: renderContext(state) };
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
if (schedule.pendingExists) {
|
|
608
|
+
if (schedule.nextRetryAtMs != null) {
|
|
609
|
+
return {
|
|
610
|
+
_tag: "Wait",
|
|
611
|
+
reason: {
|
|
612
|
+
_tag: "RetryBackoff",
|
|
613
|
+
waitMs: Math.max(0, schedule.nextRetryAtMs - nowMs()),
|
|
614
|
+
},
|
|
615
|
+
};
|
|
616
|
+
}
|
|
617
|
+
// Nothing is runnable, in flight, or waiting on an approval, event, or
|
|
618
|
+
// timer, yet tasks remain pending. They are blocked on dependencies
|
|
619
|
+
// nothing will ever satisfy — most often a deps/needs key that maps to
|
|
620
|
+
// a node id no task produces, which becomes a dependsOn on a missing
|
|
621
|
+
// node. Returning Wait here suspends the run forever with no error.
|
|
622
|
+
// Give a reactive re-render one chance to mount a producer (the mounted
|
|
623
|
+
// signature changes), then fail loudly with a diagnostic.
|
|
624
|
+
const noInProgress = ![...state.states.values()].some((taskState) => taskState === "in-progress");
|
|
625
|
+
if (noInProgress) {
|
|
626
|
+
if (options.requireStableFinish && state.graph) {
|
|
627
|
+
const signature = mountedSignature(state.graph);
|
|
628
|
+
if (state.lastDeadlockSignature !== signature) {
|
|
629
|
+
state.lastDeadlockSignature = signature;
|
|
630
|
+
return { _tag: "ReRender", context: renderContext(state) };
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
return {
|
|
634
|
+
_tag: "Failed",
|
|
635
|
+
error: new SmithersError("DEPENDENCY_DEADLOCK", describeDeadlock(state)),
|
|
636
|
+
};
|
|
554
637
|
}
|
|
555
|
-
return { _tag: "ReRender", context: renderContext(state) };
|
|
638
|
+
return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
|
|
639
|
+
}
|
|
640
|
+
if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
|
|
641
|
+
return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
|
|
642
|
+
}
|
|
643
|
+
failure = unhandledFailureDecision(recoveryKeys);
|
|
644
|
+
if (failure) {
|
|
645
|
+
return failure;
|
|
556
646
|
}
|
|
557
647
|
if (options.requireStableFinish && state.graph) {
|
|
558
648
|
const signature = mountedSignature(state.graph);
|