@smithers-orchestrator/scheduler 0.23.0 → 0.24.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@smithers-orchestrator/scheduler",
3
- "version": "0.23.0",
3
+ "version": "0.24.2",
4
4
  "description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -176,8 +176,8 @@
176
176
  ],
177
177
  "dependencies": {
178
178
  "effect": "^3.21.1",
179
- "@smithers-orchestrator/errors": "0.23.0",
180
- "@smithers-orchestrator/graph": "0.23.0"
179
+ "@smithers-orchestrator/errors": "0.24.2",
180
+ "@smithers-orchestrator/graph": "0.24.2"
181
181
  },
182
182
  "devDependencies": {
183
183
  "@types/bun": "latest",
@@ -1,3 +1,13 @@
1
+ import type { TaskDescriptor } from "@smithers-orchestrator/graph/TaskDescriptor";
2
+
3
+ /** A breached Aspects budget for a task that is about to be dispatched. */
4
+ export type AspectBudgetBreach = {
5
+ readonly kind: "tokens" | "latency";
6
+ readonly limit: number;
7
+ readonly current: number;
8
+ readonly onExceeded: "fail" | "warn" | "skip-remaining";
9
+ };
10
+
1
11
  export type WorkflowSessionOptions = {
2
12
  readonly runId?: string;
3
13
  readonly nowMs?: () => number;
@@ -7,4 +17,22 @@ export type WorkflowSessionOptions = {
7
17
  readonly iteration: number;
8
18
  readonly done: boolean;
9
19
  }>;
20
+ /**
21
+ * Evaluate a runnable task's Aspects budgets against the run's accumulated
22
+ * usage. Return the first breach, or `null`/`undefined` when within budget.
23
+ * Only invoked for tasks that would otherwise execute.
24
+ */
25
+ readonly evaluateAspectBudget?: (
26
+ descriptor: TaskDescriptor,
27
+ ) => AspectBudgetBreach | null | undefined;
28
+ /** Called when a task is skipped because its budget was exceeded (`skip-remaining`). */
29
+ readonly onAspectBudgetSkip?: (
30
+ descriptor: TaskDescriptor,
31
+ breach: AspectBudgetBreach,
32
+ ) => void;
33
+ /** Called when a task continues despite an exceeded budget (`warn`). */
34
+ readonly onAspectBudgetWarn?: (
35
+ descriptor: TaskDescriptor,
36
+ breach: AspectBudgetBreach,
37
+ ) => void;
10
38
  };
package/src/index.d.ts CHANGED
@@ -199,6 +199,12 @@ type WorkflowSessionService$2 = {
199
199
  readonly getCurrentGraph: () => Effect.Effect<WorkflowGraph | null>;
200
200
  };
201
201
 
202
+ type AspectBudgetBreach$1 = {
203
+ readonly kind: "tokens" | "latency";
204
+ readonly limit: number;
205
+ readonly current: number;
206
+ readonly onExceeded: "fail" | "warn" | "skip-remaining";
207
+ };
202
208
  type WorkflowSessionOptions$2 = {
203
209
  readonly runId?: string;
204
210
  readonly nowMs?: () => number;
@@ -208,6 +214,9 @@ type WorkflowSessionOptions$2 = {
208
214
  readonly iteration: number;
209
215
  readonly done: boolean;
210
216
  }>;
217
+ readonly evaluateAspectBudget?: (descriptor: TaskDescriptor$3) => AspectBudgetBreach$1 | null | undefined;
218
+ readonly onAspectBudgetSkip?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
219
+ readonly onAspectBudgetWarn?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
211
220
  };
212
221
 
213
222
  type TaskRecord$1 = {
@@ -4,6 +4,7 @@ import { toSmithersError } from "@smithers-orchestrator/errors/toSmithersError";
4
4
  import { buildPlanTree } from "./buildPlanTree.js";
5
5
  import { buildStateKey } from "./buildStateKey.js";
6
6
  import { cloneTaskStateMap } from "./cloneTaskStateMap.js";
7
+ import { computeRetryDelayMs } from "./computeRetryDelayMs.js";
7
8
  import { parseStateKey } from "./parseStateKey.js";
8
9
  import { scheduleTasks } from "./scheduleTasks.js";
9
10
  /** @typedef {import("./ApprovalResolution.ts").ApprovalResolution} ApprovalResolution */
@@ -149,27 +150,6 @@ function parseDurationMs(value) {
149
150
  return amount;
150
151
  }
151
152
  }
152
- /**
153
- * @param {TaskDescriptor} descriptor
154
- * @param {number} failureCount
155
- * @returns {number}
156
- */
157
- function retryDelayMs(descriptor, failureCount) {
158
- const policy = descriptor.retryPolicy;
159
- if (!policy)
160
- return 0;
161
- const initial = policy.initialDelayMs ?? 0;
162
- if (policy.backoff === "exponential") {
163
- const multiplier = policy.multiplier ?? 2;
164
- const computed = initial * Math.pow(multiplier, Math.max(0, failureCount - 1));
165
- return Math.min(policy.maxDelayMs ?? computed, computed);
166
- }
167
- if (policy.backoff === "linear") {
168
- const computed = initial * Math.max(1, failureCount);
169
- return Math.min(policy.maxDelayMs ?? computed, computed);
170
- }
171
- return initial;
172
- }
173
153
  /**
174
154
  * @param {TaskDescriptor} descriptor
175
155
  * @param {unknown} error
@@ -199,6 +179,67 @@ function isRetryableFailure(descriptor, error) {
199
179
  }
200
180
  return true;
201
181
  }
182
+ /**
183
+ * @param {unknown} error
184
+ * @returns {boolean}
185
+ */
186
+ function isTransientSessionFailure(error) {
187
+ const normalized = toSmithersError(error);
188
+ const code = error && typeof error === "object" && typeof error.code === "string"
189
+ ? error.code
190
+ : normalized.code;
191
+ return code === "SESSION_ERROR" ||
192
+ code === "TASK_TIMEOUT" ||
193
+ code === "TASK_HEARTBEAT_TIMEOUT" ||
194
+ code === "TASK_ABORTED" ||
195
+ normalized.details?.failureRetryable === true;
196
+ }
197
+ /**
198
+ * Build a human-readable diagnostic for a dependency deadlock: pending tasks
199
+ * that can never run because their `dependsOn` edges point at tasks missing from
200
+ * the graph or at tasks that are themselves permanently blocked. The most common cause is a
201
+ * `deps`/`needs` mismatch — a `deps={{ key: ... }}` whose key is not the upstream
202
+ * task's id and was not remapped with `needs={{ key: '<id>' }}`, which the Task
203
+ * component (deriveDepNodeIds) turns into a dependency on a non-existent node id.
204
+ * @param {SessionState} state
205
+ * @returns {string}
206
+ */
207
+ function describeDeadlock(state) {
208
+ const blocked = [];
209
+ let sawMissing = false;
210
+ for (const descriptor of state.descriptors.values()) {
211
+ const taskState = state.states.get(stateKeyFor(descriptor)) ?? "pending";
212
+ if (taskState !== "pending" && taskState !== "cancelled")
213
+ continue;
214
+ const unmet = [];
215
+ for (const depId of descriptor.dependsOn ?? []) {
216
+ const dep = state.descriptors.get(depId);
217
+ if (!dep) {
218
+ sawMissing = true;
219
+ unmet.push(`'${depId}' (no such task)`);
220
+ }
221
+ else {
222
+ const depState = state.states.get(stateKeyFor(dep)) ?? "pending";
223
+ unmet.push(`'${depId}' (${depState})`);
224
+ }
225
+ }
226
+ if (unmet.length > 0) {
227
+ blocked.push(` - '${descriptor.nodeId}' is blocked on ${unmet.join(", ")}`);
228
+ }
229
+ }
230
+ const lines = [
231
+ "Workflow deadlocked: no task can run, and none is waiting on an approval, event, timer, or retry.",
232
+ ];
233
+ if (blocked.length > 0) {
234
+ lines.push("Pending tasks and their unsatisfied dependencies:", ...blocked);
235
+ }
236
+ if (sawMissing) {
237
+ lines.push("", "A dependency marked '(no such task)' references a node id that is not a mounted task. " +
238
+ "If it came from deps={{ <key>: ... }}, the key is treated as the upstream task's id unless you remap it: " +
239
+ "add needs={{ <key>: '<upstream task id>' }} (or rename the upstream task to match the key).");
240
+ }
241
+ return lines.join("\n");
242
+ }
202
243
  /**
203
244
  * @param {unknown} error
204
245
  * @param {string} label
@@ -224,6 +265,7 @@ export function makeWorkflowSession(options = {}) {
224
265
  states: new Map(),
225
266
  outputs: new Map(),
226
267
  failures: new Map(),
268
+ failureDescriptors: new Map(),
227
269
  retryCounts: new Map(),
228
270
  retryWait: new Map(),
229
271
  approvals: new Set(),
@@ -231,6 +273,7 @@ export function makeWorkflowSession(options = {}) {
231
273
  schedule: null,
232
274
  cancelled: false,
233
275
  lastMountedSignature: null,
276
+ lastDeadlockSignature: null,
234
277
  };
235
278
  /**
236
279
  * @param {Pick<TaskOutput, "nodeId" | "iteration">} output
@@ -288,6 +331,7 @@ export function makeWorkflowSession(options = {}) {
288
331
  state.retryWait.delete(key);
289
332
  state.approvals.delete(key);
290
333
  state.retryCounts.delete(key);
334
+ state.failureDescriptors.delete(key);
291
335
  }
292
336
  }
293
337
  for (const ralph of ralphs) {
@@ -317,6 +361,7 @@ export function makeWorkflowSession(options = {}) {
317
361
  state.states.set(key, "finished");
318
362
  state.outputs.set(key, output);
319
363
  state.retryWait.delete(key);
364
+ state.failureDescriptors.delete(key);
320
365
  }
321
366
  /**
322
367
  * @param {number} [iteration]
@@ -367,7 +412,7 @@ export function makeWorkflowSession(options = {}) {
367
412
  const canRetry = retryable &&
368
413
  (descriptor.retries === Infinity || failureCount <= descriptor.retries);
369
414
  if (canRetry) {
370
- const delay = retryDelayMs(descriptor, failureCount);
415
+ const delay = computeRetryDelayMs(descriptor.retryPolicy, failureCount);
371
416
  state.states.set(key, "pending");
372
417
  if (delay > 0) {
373
418
  state.retryWait.set(key, nowMs() + delay);
@@ -379,6 +424,7 @@ export function makeWorkflowSession(options = {}) {
379
424
  }
380
425
  state.states.set(key, "failed");
381
426
  state.failures.set(key, error);
427
+ state.failureDescriptors.set(key, descriptor);
382
428
  return decide();
383
429
  }
384
430
  /**
@@ -387,11 +433,15 @@ export function makeWorkflowSession(options = {}) {
387
433
  function unhandledFailureDecision(recoveryKeys = new Set()) {
388
434
  for (const [key, taskState] of state.states) {
389
435
  const parsed = parseStateKey(key);
390
- const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration);
436
+ const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration) ??
437
+ state.failureDescriptors.get(key);
391
438
  if (taskState === "failed" && !descriptor?.continueOnFail) {
392
439
  if (recoveryKeys.has(key)) {
393
440
  continue;
394
441
  }
442
+ if (descriptor?.agent && isTransientSessionFailure(state.failures.get(key))) {
443
+ continue;
444
+ }
395
445
  return {
396
446
  _tag: "Failed",
397
447
  error: new SmithersError("SESSION_ERROR", `Task failed: ${descriptor?.nodeId ?? key}`, { key }, state.failures.get(key)),
@@ -480,6 +530,28 @@ export function makeWorkflowSession(options = {}) {
480
530
  changed = true;
481
531
  continue;
482
532
  }
533
+ const budgetBreach = options.evaluateAspectBudget?.(task);
534
+ if (budgetBreach) {
535
+ if (budgetBreach.onExceeded === "skip-remaining") {
536
+ options.onAspectBudgetSkip?.(task, budgetBreach);
537
+ state.states.set(key, "skipped");
538
+ changed = true;
539
+ continue;
540
+ }
541
+ if (budgetBreach.onExceeded === "warn") {
542
+ options.onAspectBudgetWarn?.(task, budgetBreach);
543
+ }
544
+ else {
545
+ return {
546
+ _tag: "Failed",
547
+ error: new SmithersError("ASPECT_BUDGET_EXCEEDED", `Aspects ${budgetBreach.kind} budget exceeded for task "${task.nodeId}": ${budgetBreach.current} >= ${budgetBreach.limit}`, {
548
+ kind: budgetBreach.kind,
549
+ limit: budgetBreach.limit,
550
+ current: budgetBreach.current,
551
+ }),
552
+ };
553
+ }
554
+ }
483
555
  state.states.set(key, "in-progress");
484
556
  executable.push(task);
485
557
  changed = true;
@@ -497,26 +569,17 @@ export function makeWorkflowSession(options = {}) {
497
569
  if (existingWait) {
498
570
  return { _tag: "Wait", reason: existingWait };
499
571
  }
500
- if (schedule.pendingExists) {
501
- if (schedule.nextRetryAtMs != null) {
502
- return {
503
- _tag: "Wait",
504
- reason: {
505
- _tag: "RetryBackoff",
506
- waitMs: Math.max(0, schedule.nextRetryAtMs - nowMs()),
507
- },
508
- };
509
- }
510
- return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
511
- }
512
- if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
513
- return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
514
- }
515
- failure = unhandledFailureDecision(recoveryKeys);
516
- if (failure) {
517
- return failure;
518
- }
519
- if (schedule.readyRalphs.length > 0) {
572
+ if (schedule.readyRalphs.length > 0 && !unhandledFailureDecision(recoveryKeys)) {
573
+ // A ralph is ready only when every task in its own subtree is
574
+ // terminal, so pending or in-flight work elsewhere in the graph must
575
+ // not starve its next iteration (#267). Run-level continue-as-new
576
+ // handoffs stay quiescence-only: tearing down the run while sibling
577
+ // tasks are mid-flight is not safe, so those ralphs are deferred.
578
+ // An unhandled task failure keeps its precedence over further loop
579
+ // iterations (decide() already returns it at the top; this guard
580
+ // makes the ordering explicit).
581
+ const hasInProgress = [...state.states.values()].some((taskState) => taskState === "in-progress");
582
+ let advanced = false;
520
583
  for (const ralph of schedule.readyRalphs) {
521
584
  const current = state.ralphState.get(ralph.id) ?? {
522
585
  iteration: 0,
@@ -524,6 +587,7 @@ export function makeWorkflowSession(options = {}) {
524
587
  };
525
588
  if (ralph.until) {
526
589
  state.ralphState.set(ralph.id, { ...current, done: true });
590
+ advanced = true;
527
591
  continue;
528
592
  }
529
593
  const nextIteration = current.iteration + 1;
@@ -535,13 +599,18 @@ export function makeWorkflowSession(options = {}) {
535
599
  };
536
600
  }
537
601
  state.ralphState.set(ralph.id, { iteration: current.iteration, done: true });
602
+ advanced = true;
538
603
  continue;
539
604
  }
540
- state.ralphState.set(ralph.id, { iteration: nextIteration, done: false });
541
- if (ralph.continueAsNewEvery != null &&
605
+ const wantsContinueAsNew = ralph.continueAsNewEvery != null &&
542
606
  ralph.continueAsNewEvery > 0 &&
543
607
  nextIteration > 0 &&
544
- nextIteration % ralph.continueAsNewEvery === 0) {
608
+ nextIteration % ralph.continueAsNewEvery === 0;
609
+ if (wantsContinueAsNew && (hasInProgress || schedule.pendingExists)) {
610
+ continue;
611
+ }
612
+ state.ralphState.set(ralph.id, { iteration: nextIteration, done: false });
613
+ if (wantsContinueAsNew) {
545
614
  return {
546
615
  _tag: "ContinueAsNew",
547
616
  transition: {
@@ -551,8 +620,51 @@ export function makeWorkflowSession(options = {}) {
551
620
  },
552
621
  };
553
622
  }
623
+ advanced = true;
624
+ }
625
+ if (advanced) {
626
+ return { _tag: "ReRender", context: renderContext(state) };
627
+ }
628
+ }
629
+ if (schedule.pendingExists) {
630
+ if (schedule.nextRetryAtMs != null) {
631
+ return {
632
+ _tag: "Wait",
633
+ reason: {
634
+ _tag: "RetryBackoff",
635
+ waitMs: Math.max(0, schedule.nextRetryAtMs - nowMs()),
636
+ },
637
+ };
638
+ }
639
+ // Nothing is runnable, in flight, or waiting on an approval, event, or
640
+ // timer, yet tasks remain pending. They are blocked on dependencies
641
+ // nothing will ever satisfy — most often a deps/needs key that maps to
642
+ // a node id no task produces, which becomes a dependsOn on a missing
643
+ // node. Returning Wait here suspends the run forever with no error.
644
+ // Give a reactive re-render one chance to mount a producer (the mounted
645
+ // signature changes), then fail loudly with a diagnostic.
646
+ const noInProgress = ![...state.states.values()].some((taskState) => taskState === "in-progress");
647
+ if (noInProgress) {
648
+ if (options.requireStableFinish && state.graph) {
649
+ const signature = mountedSignature(state.graph);
650
+ if (state.lastDeadlockSignature !== signature) {
651
+ state.lastDeadlockSignature = signature;
652
+ return { _tag: "ReRender", context: renderContext(state) };
653
+ }
654
+ }
655
+ return {
656
+ _tag: "Failed",
657
+ error: new SmithersError("DEPENDENCY_DEADLOCK", describeDeadlock(state)),
658
+ };
554
659
  }
555
- return { _tag: "ReRender", context: renderContext(state) };
660
+ return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
661
+ }
662
+ if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
663
+ return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
664
+ }
665
+ failure = unhandledFailureDecision(recoveryKeys);
666
+ if (failure) {
667
+ return failure;
556
668
  }
557
669
  if (options.requireStableFinish && state.graph) {
558
670
  const signature = mountedSignature(state.graph);