@smithers-orchestrator/scheduler 0.23.0 → 0.24.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -3
- package/src/WorkflowSessionOptions.ts +28 -0
- package/src/index.d.ts +9 -0
- package/src/makeWorkflowSession.js +159 -47
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@smithers-orchestrator/scheduler",
|
|
3
|
-
"version": "0.23.0",
|
|
3
|
+
"version": "0.24.2",
|
|
4
4
|
"description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -176,8 +176,8 @@
|
|
|
176
176
|
],
|
|
177
177
|
"dependencies": {
|
|
178
178
|
"effect": "^3.21.1",
|
|
179
|
-
"@smithers-orchestrator/errors": "0.23.0",
|
|
180
|
-
"@smithers-orchestrator/graph": "0.23.0"
|
|
179
|
+
"@smithers-orchestrator/errors": "0.24.2",
|
|
180
|
+
"@smithers-orchestrator/graph": "0.24.2"
|
|
181
181
|
},
|
|
182
182
|
"devDependencies": {
|
|
183
183
|
"@types/bun": "latest",
|
|
package/src/WorkflowSessionOptions.ts
CHANGED
|
@@ -1,3 +1,13 @@
|
|
|
1
|
+
import type { TaskDescriptor } from "@smithers-orchestrator/graph/TaskDescriptor";
|
|
2
|
+
|
|
3
|
+
/** A breached Aspects budget for a task that is about to be dispatched. */
|
|
4
|
+
export type AspectBudgetBreach = {
|
|
5
|
+
readonly kind: "tokens" | "latency";
|
|
6
|
+
readonly limit: number;
|
|
7
|
+
readonly current: number;
|
|
8
|
+
readonly onExceeded: "fail" | "warn" | "skip-remaining";
|
|
9
|
+
};
|
|
10
|
+
|
|
1
11
|
export type WorkflowSessionOptions = {
|
|
2
12
|
readonly runId?: string;
|
|
3
13
|
readonly nowMs?: () => number;
|
|
@@ -7,4 +17,22 @@ export type WorkflowSessionOptions = {
|
|
|
7
17
|
readonly iteration: number;
|
|
8
18
|
readonly done: boolean;
|
|
9
19
|
}>;
|
|
20
|
+
/**
|
|
21
|
+
* Evaluate a runnable task's Aspects budgets against the run's accumulated
|
|
22
|
+
* usage. Return the first breach, or `null`/`undefined` when within budget.
|
|
23
|
+
* Only invoked for tasks that would otherwise execute.
|
|
24
|
+
*/
|
|
25
|
+
readonly evaluateAspectBudget?: (
|
|
26
|
+
descriptor: TaskDescriptor,
|
|
27
|
+
) => AspectBudgetBreach | null | undefined;
|
|
28
|
+
/** Called when a task is skipped because its budget was exceeded (`skip-remaining`). */
|
|
29
|
+
readonly onAspectBudgetSkip?: (
|
|
30
|
+
descriptor: TaskDescriptor,
|
|
31
|
+
breach: AspectBudgetBreach,
|
|
32
|
+
) => void;
|
|
33
|
+
/** Called when a task continues despite an exceeded budget (`warn`). */
|
|
34
|
+
readonly onAspectBudgetWarn?: (
|
|
35
|
+
descriptor: TaskDescriptor,
|
|
36
|
+
breach: AspectBudgetBreach,
|
|
37
|
+
) => void;
|
|
10
38
|
};
|
package/src/index.d.ts
CHANGED
|
@@ -199,6 +199,12 @@ type WorkflowSessionService$2 = {
|
|
|
199
199
|
readonly getCurrentGraph: () => Effect.Effect<WorkflowGraph | null>;
|
|
200
200
|
};
|
|
201
201
|
|
|
202
|
+
type AspectBudgetBreach$1 = {
|
|
203
|
+
readonly kind: "tokens" | "latency";
|
|
204
|
+
readonly limit: number;
|
|
205
|
+
readonly current: number;
|
|
206
|
+
readonly onExceeded: "fail" | "warn" | "skip-remaining";
|
|
207
|
+
};
|
|
202
208
|
type WorkflowSessionOptions$2 = {
|
|
203
209
|
readonly runId?: string;
|
|
204
210
|
readonly nowMs?: () => number;
|
|
@@ -208,6 +214,9 @@ type WorkflowSessionOptions$2 = {
|
|
|
208
214
|
readonly iteration: number;
|
|
209
215
|
readonly done: boolean;
|
|
210
216
|
}>;
|
|
217
|
+
readonly evaluateAspectBudget?: (descriptor: TaskDescriptor$3) => AspectBudgetBreach$1 | null | undefined;
|
|
218
|
+
readonly onAspectBudgetSkip?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
|
|
219
|
+
readonly onAspectBudgetWarn?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
|
|
211
220
|
};
|
|
212
221
|
|
|
213
222
|
type TaskRecord$1 = {
|
|
package/src/makeWorkflowSession.js
CHANGED
|
@@ -4,6 +4,7 @@ import { toSmithersError } from "@smithers-orchestrator/errors/toSmithersError";
|
|
|
4
4
|
import { buildPlanTree } from "./buildPlanTree.js";
|
|
5
5
|
import { buildStateKey } from "./buildStateKey.js";
|
|
6
6
|
import { cloneTaskStateMap } from "./cloneTaskStateMap.js";
|
|
7
|
+
import { computeRetryDelayMs } from "./computeRetryDelayMs.js";
|
|
7
8
|
import { parseStateKey } from "./parseStateKey.js";
|
|
8
9
|
import { scheduleTasks } from "./scheduleTasks.js";
|
|
9
10
|
/** @typedef {import("./ApprovalResolution.ts").ApprovalResolution} ApprovalResolution */
|
|
@@ -149,27 +150,6 @@ function parseDurationMs(value) {
|
|
|
149
150
|
return amount;
|
|
150
151
|
}
|
|
151
152
|
}
|
|
152
|
-
/**
|
|
153
|
-
* @param {TaskDescriptor} descriptor
|
|
154
|
-
* @param {number} failureCount
|
|
155
|
-
* @returns {number}
|
|
156
|
-
*/
|
|
157
|
-
function retryDelayMs(descriptor, failureCount) {
|
|
158
|
-
const policy = descriptor.retryPolicy;
|
|
159
|
-
if (!policy)
|
|
160
|
-
return 0;
|
|
161
|
-
const initial = policy.initialDelayMs ?? 0;
|
|
162
|
-
if (policy.backoff === "exponential") {
|
|
163
|
-
const multiplier = policy.multiplier ?? 2;
|
|
164
|
-
const computed = initial * Math.pow(multiplier, Math.max(0, failureCount - 1));
|
|
165
|
-
return Math.min(policy.maxDelayMs ?? computed, computed);
|
|
166
|
-
}
|
|
167
|
-
if (policy.backoff === "linear") {
|
|
168
|
-
const computed = initial * Math.max(1, failureCount);
|
|
169
|
-
return Math.min(policy.maxDelayMs ?? computed, computed);
|
|
170
|
-
}
|
|
171
|
-
return initial;
|
|
172
|
-
}
|
|
173
153
|
/**
|
|
174
154
|
* @param {TaskDescriptor} descriptor
|
|
175
155
|
* @param {unknown} error
|
|
@@ -199,6 +179,67 @@ function isRetryableFailure(descriptor, error) {
|
|
|
199
179
|
}
|
|
200
180
|
return true;
|
|
201
181
|
}
|
|
182
|
+
/**
|
|
183
|
+
* @param {unknown} error
|
|
184
|
+
* @returns {boolean}
|
|
185
|
+
*/
|
|
186
|
+
function isTransientSessionFailure(error) {
|
|
187
|
+
const normalized = toSmithersError(error);
|
|
188
|
+
const code = error && typeof error === "object" && typeof error.code === "string"
|
|
189
|
+
? error.code
|
|
190
|
+
: normalized.code;
|
|
191
|
+
return code === "SESSION_ERROR" ||
|
|
192
|
+
code === "TASK_TIMEOUT" ||
|
|
193
|
+
code === "TASK_HEARTBEAT_TIMEOUT" ||
|
|
194
|
+
code === "TASK_ABORTED" ||
|
|
195
|
+
normalized.details?.failureRetryable === true;
|
|
196
|
+
}
|
|
197
|
+
/**
|
|
198
|
+
* Build a human-readable diagnostic for a dependency deadlock: pending tasks
|
|
199
|
+
* that can never run because their `dependsOn` edges point at tasks missing from
|
|
200
|
+
* the graph or themselves permanently blocked. The most common cause is a
|
|
201
|
+
* `deps`/`needs` mismatch — a `deps={{ key: ... }}` whose key is not the upstream
|
|
202
|
+
* task's id and was not remapped with `needs={{ key: '<id>' }}`, which the Task
|
|
203
|
+
* component (deriveDepNodeIds) turns into a dependency on a non-existent node id.
|
|
204
|
+
* @param {SessionState} state
|
|
205
|
+
* @returns {string}
|
|
206
|
+
*/
|
|
207
|
+
function describeDeadlock(state) {
|
|
208
|
+
const blocked = [];
|
|
209
|
+
let sawMissing = false;
|
|
210
|
+
for (const descriptor of state.descriptors.values()) {
|
|
211
|
+
const taskState = state.states.get(stateKeyFor(descriptor)) ?? "pending";
|
|
212
|
+
if (taskState !== "pending" && taskState !== "cancelled")
|
|
213
|
+
continue;
|
|
214
|
+
const unmet = [];
|
|
215
|
+
for (const depId of descriptor.dependsOn ?? []) {
|
|
216
|
+
const dep = state.descriptors.get(depId);
|
|
217
|
+
if (!dep) {
|
|
218
|
+
sawMissing = true;
|
|
219
|
+
unmet.push(`'${depId}' (no such task)`);
|
|
220
|
+
}
|
|
221
|
+
else {
|
|
222
|
+
const depState = state.states.get(stateKeyFor(dep)) ?? "pending";
|
|
223
|
+
unmet.push(`'${depId}' (${depState})`);
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
if (unmet.length > 0) {
|
|
227
|
+
blocked.push(` - '${descriptor.nodeId}' is blocked on ${unmet.join(", ")}`);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
const lines = [
|
|
231
|
+
"Workflow deadlocked: no task can run, and none is waiting on an approval, event, timer, or retry.",
|
|
232
|
+
];
|
|
233
|
+
if (blocked.length > 0) {
|
|
234
|
+
lines.push("Pending tasks and their unsatisfied dependencies:", ...blocked);
|
|
235
|
+
}
|
|
236
|
+
if (sawMissing) {
|
|
237
|
+
lines.push("", "A dependency marked '(no such task)' references a node id that is not a mounted task. " +
|
|
238
|
+
"If it came from deps={{ <key>: ... }}, the key is treated as the upstream task's id unless you remap it: " +
|
|
239
|
+
"add needs={{ <key>: '<upstream task id>' }} (or rename the upstream task to match the key).");
|
|
240
|
+
}
|
|
241
|
+
return lines.join("\n");
|
|
242
|
+
}
|
|
202
243
|
/**
|
|
203
244
|
* @param {unknown} error
|
|
204
245
|
* @param {string} label
|
|
@@ -224,6 +265,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
224
265
|
states: new Map(),
|
|
225
266
|
outputs: new Map(),
|
|
226
267
|
failures: new Map(),
|
|
268
|
+
failureDescriptors: new Map(),
|
|
227
269
|
retryCounts: new Map(),
|
|
228
270
|
retryWait: new Map(),
|
|
229
271
|
approvals: new Set(),
|
|
@@ -231,6 +273,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
231
273
|
schedule: null,
|
|
232
274
|
cancelled: false,
|
|
233
275
|
lastMountedSignature: null,
|
|
276
|
+
lastDeadlockSignature: null,
|
|
234
277
|
};
|
|
235
278
|
/**
|
|
236
279
|
* @param {Pick<TaskOutput, "nodeId" | "iteration">} output
|
|
@@ -288,6 +331,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
288
331
|
state.retryWait.delete(key);
|
|
289
332
|
state.approvals.delete(key);
|
|
290
333
|
state.retryCounts.delete(key);
|
|
334
|
+
state.failureDescriptors.delete(key);
|
|
291
335
|
}
|
|
292
336
|
}
|
|
293
337
|
for (const ralph of ralphs) {
|
|
@@ -317,6 +361,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
317
361
|
state.states.set(key, "finished");
|
|
318
362
|
state.outputs.set(key, output);
|
|
319
363
|
state.retryWait.delete(key);
|
|
364
|
+
state.failureDescriptors.delete(key);
|
|
320
365
|
}
|
|
321
366
|
/**
|
|
322
367
|
* @param {number} [iteration]
|
|
@@ -367,7 +412,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
367
412
|
const canRetry = retryable &&
|
|
368
413
|
(descriptor.retries === Infinity || failureCount <= descriptor.retries);
|
|
369
414
|
if (canRetry) {
|
|
370
|
-
const delay = retryDelayMs(descriptor, failureCount);
|
|
415
|
+
const delay = computeRetryDelayMs(descriptor.retryPolicy, failureCount);
|
|
371
416
|
state.states.set(key, "pending");
|
|
372
417
|
if (delay > 0) {
|
|
373
418
|
state.retryWait.set(key, nowMs() + delay);
|
|
@@ -379,6 +424,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
379
424
|
}
|
|
380
425
|
state.states.set(key, "failed");
|
|
381
426
|
state.failures.set(key, error);
|
|
427
|
+
state.failureDescriptors.set(key, descriptor);
|
|
382
428
|
return decide();
|
|
383
429
|
}
|
|
384
430
|
/**
|
|
@@ -387,11 +433,15 @@ export function makeWorkflowSession(options = {}) {
|
|
|
387
433
|
function unhandledFailureDecision(recoveryKeys = new Set()) {
|
|
388
434
|
for (const [key, taskState] of state.states) {
|
|
389
435
|
const parsed = parseStateKey(key);
|
|
390
|
-
const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration);
|
|
436
|
+
const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration) ??
|
|
437
|
+
state.failureDescriptors.get(key);
|
|
391
438
|
if (taskState === "failed" && !descriptor?.continueOnFail) {
|
|
392
439
|
if (recoveryKeys.has(key)) {
|
|
393
440
|
continue;
|
|
394
441
|
}
|
|
442
|
+
if (descriptor?.agent && isTransientSessionFailure(state.failures.get(key))) {
|
|
443
|
+
continue;
|
|
444
|
+
}
|
|
395
445
|
return {
|
|
396
446
|
_tag: "Failed",
|
|
397
447
|
error: new SmithersError("SESSION_ERROR", `Task failed: ${descriptor?.nodeId ?? key}`, { key }, state.failures.get(key)),
|
|
@@ -480,6 +530,28 @@ export function makeWorkflowSession(options = {}) {
|
|
|
480
530
|
changed = true;
|
|
481
531
|
continue;
|
|
482
532
|
}
|
|
533
|
+
const budgetBreach = options.evaluateAspectBudget?.(task);
|
|
534
|
+
if (budgetBreach) {
|
|
535
|
+
if (budgetBreach.onExceeded === "skip-remaining") {
|
|
536
|
+
options.onAspectBudgetSkip?.(task, budgetBreach);
|
|
537
|
+
state.states.set(key, "skipped");
|
|
538
|
+
changed = true;
|
|
539
|
+
continue;
|
|
540
|
+
}
|
|
541
|
+
if (budgetBreach.onExceeded === "warn") {
|
|
542
|
+
options.onAspectBudgetWarn?.(task, budgetBreach);
|
|
543
|
+
}
|
|
544
|
+
else {
|
|
545
|
+
return {
|
|
546
|
+
_tag: "Failed",
|
|
547
|
+
error: new SmithersError("ASPECT_BUDGET_EXCEEDED", `Aspects ${budgetBreach.kind} budget exceeded for task "${task.nodeId}": ${budgetBreach.current} >= ${budgetBreach.limit}`, {
|
|
548
|
+
kind: budgetBreach.kind,
|
|
549
|
+
limit: budgetBreach.limit,
|
|
550
|
+
current: budgetBreach.current,
|
|
551
|
+
}),
|
|
552
|
+
};
|
|
553
|
+
}
|
|
554
|
+
}
|
|
483
555
|
state.states.set(key, "in-progress");
|
|
484
556
|
executable.push(task);
|
|
485
557
|
changed = true;
|
|
@@ -497,26 +569,17 @@ export function makeWorkflowSession(options = {}) {
|
|
|
497
569
|
if (existingWait) {
|
|
498
570
|
return { _tag: "Wait", reason: existingWait };
|
|
499
571
|
}
|
|
500
|
-
if (schedule.
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
}
|
|
512
|
-
if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
|
|
513
|
-
return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
|
|
514
|
-
}
|
|
515
|
-
failure = unhandledFailureDecision(recoveryKeys);
|
|
516
|
-
if (failure) {
|
|
517
|
-
return failure;
|
|
518
|
-
}
|
|
519
|
-
if (schedule.readyRalphs.length > 0) {
|
|
572
|
+
if (schedule.readyRalphs.length > 0 && !unhandledFailureDecision(recoveryKeys)) {
|
|
573
|
+
// A ralph is ready only when every task in its own subtree is
|
|
574
|
+
// terminal, so pending or in-flight work elsewhere in the graph must
|
|
575
|
+
// not starve its next iteration (#267). Run-level continue-as-new
|
|
576
|
+
// handoffs stay quiescence-only: tearing down the run while sibling
|
|
577
|
+
// tasks are mid-flight is not safe, so those ralphs are deferred.
|
|
578
|
+
// An unhandled task failure keeps its precedence over further loop
|
|
579
|
+
// iterations (decide() already returns it at the top; this guard
|
|
580
|
+
// makes the ordering explicit).
|
|
581
|
+
const hasInProgress = [...state.states.values()].some((taskState) => taskState === "in-progress");
|
|
582
|
+
let advanced = false;
|
|
520
583
|
for (const ralph of schedule.readyRalphs) {
|
|
521
584
|
const current = state.ralphState.get(ralph.id) ?? {
|
|
522
585
|
iteration: 0,
|
|
@@ -524,6 +587,7 @@ export function makeWorkflowSession(options = {}) {
|
|
|
524
587
|
};
|
|
525
588
|
if (ralph.until) {
|
|
526
589
|
state.ralphState.set(ralph.id, { ...current, done: true });
|
|
590
|
+
advanced = true;
|
|
527
591
|
continue;
|
|
528
592
|
}
|
|
529
593
|
const nextIteration = current.iteration + 1;
|
|
@@ -535,13 +599,18 @@ export function makeWorkflowSession(options = {}) {
|
|
|
535
599
|
};
|
|
536
600
|
}
|
|
537
601
|
state.ralphState.set(ralph.id, { iteration: current.iteration, done: true });
|
|
602
|
+
advanced = true;
|
|
538
603
|
continue;
|
|
539
604
|
}
|
|
540
|
-
|
|
541
|
-
if (ralph.continueAsNewEvery != null &&
|
|
605
|
+
const wantsContinueAsNew = ralph.continueAsNewEvery != null &&
|
|
542
606
|
ralph.continueAsNewEvery > 0 &&
|
|
543
607
|
nextIteration > 0 &&
|
|
544
|
-
nextIteration % ralph.continueAsNewEvery === 0) {
|
|
608
|
+
nextIteration % ralph.continueAsNewEvery === 0;
|
|
609
|
+
if (wantsContinueAsNew && (hasInProgress || schedule.pendingExists)) {
|
|
610
|
+
continue;
|
|
611
|
+
}
|
|
612
|
+
state.ralphState.set(ralph.id, { iteration: nextIteration, done: false });
|
|
613
|
+
if (wantsContinueAsNew) {
|
|
545
614
|
return {
|
|
546
615
|
_tag: "ContinueAsNew",
|
|
547
616
|
transition: {
|
|
@@ -551,8 +620,51 @@ export function makeWorkflowSession(options = {}) {
|
|
|
551
620
|
},
|
|
552
621
|
};
|
|
553
622
|
}
|
|
623
|
+
advanced = true;
|
|
624
|
+
}
|
|
625
|
+
if (advanced) {
|
|
626
|
+
return { _tag: "ReRender", context: renderContext(state) };
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
if (schedule.pendingExists) {
|
|
630
|
+
if (schedule.nextRetryAtMs != null) {
|
|
631
|
+
return {
|
|
632
|
+
_tag: "Wait",
|
|
633
|
+
reason: {
|
|
634
|
+
_tag: "RetryBackoff",
|
|
635
|
+
waitMs: Math.max(0, schedule.nextRetryAtMs - nowMs()),
|
|
636
|
+
},
|
|
637
|
+
};
|
|
638
|
+
}
|
|
639
|
+
// Nothing is runnable, in flight, or waiting on an approval, event, or
|
|
640
|
+
// timer, yet tasks remain pending. They are blocked on dependencies
|
|
641
|
+
// nothing will ever satisfy — most often a deps/needs key that maps to
|
|
642
|
+
// a node id no task produces, which becomes a dependsOn on a missing
|
|
643
|
+
// node. Returning Wait here suspends the run forever with no error.
|
|
644
|
+
// Give a reactive re-render one chance to mount a producer (the mounted
|
|
645
|
+
// signature changes), then fail loudly with a diagnostic.
|
|
646
|
+
const noInProgress = ![...state.states.values()].some((taskState) => taskState === "in-progress");
|
|
647
|
+
if (noInProgress) {
|
|
648
|
+
if (options.requireStableFinish && state.graph) {
|
|
649
|
+
const signature = mountedSignature(state.graph);
|
|
650
|
+
if (state.lastDeadlockSignature !== signature) {
|
|
651
|
+
state.lastDeadlockSignature = signature;
|
|
652
|
+
return { _tag: "ReRender", context: renderContext(state) };
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
return {
|
|
656
|
+
_tag: "Failed",
|
|
657
|
+
error: new SmithersError("DEPENDENCY_DEADLOCK", describeDeadlock(state)),
|
|
658
|
+
};
|
|
554
659
|
}
|
|
555
|
-
return { _tag: "
|
|
660
|
+
return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
|
|
661
|
+
}
|
|
662
|
+
if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
|
|
663
|
+
return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
|
|
664
|
+
}
|
|
665
|
+
failure = unhandledFailureDecision(recoveryKeys);
|
|
666
|
+
if (failure) {
|
|
667
|
+
return failure;
|
|
556
668
|
}
|
|
557
669
|
if (options.requireStableFinish && state.graph) {
|
|
558
670
|
const signature = mountedSignature(state.graph);
|