@smithers-orchestrator/scheduler 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@smithers-orchestrator/scheduler",
3
- "version": "0.23.0",
3
+ "version": "0.24.0",
4
4
  "description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -176,8 +176,8 @@
176
176
  ],
177
177
  "dependencies": {
178
178
  "effect": "^3.21.1",
179
- "@smithers-orchestrator/errors": "0.23.0",
180
- "@smithers-orchestrator/graph": "0.23.0"
179
+ "@smithers-orchestrator/errors": "0.24.0",
180
+ "@smithers-orchestrator/graph": "0.24.0"
181
181
  },
182
182
  "devDependencies": {
183
183
  "@types/bun": "latest",
@@ -4,6 +4,7 @@ import { toSmithersError } from "@smithers-orchestrator/errors/toSmithersError";
4
4
  import { buildPlanTree } from "./buildPlanTree.js";
5
5
  import { buildStateKey } from "./buildStateKey.js";
6
6
  import { cloneTaskStateMap } from "./cloneTaskStateMap.js";
7
+ import { computeRetryDelayMs } from "./computeRetryDelayMs.js";
7
8
  import { parseStateKey } from "./parseStateKey.js";
8
9
  import { scheduleTasks } from "./scheduleTasks.js";
9
10
  /** @typedef {import("./ApprovalResolution.ts").ApprovalResolution} ApprovalResolution */
@@ -149,27 +150,6 @@ function parseDurationMs(value) {
149
150
  return amount;
150
151
  }
151
152
  }
152
- /**
153
- * @param {TaskDescriptor} descriptor
154
- * @param {number} failureCount
155
- * @returns {number}
156
- */
157
- function retryDelayMs(descriptor, failureCount) {
158
- const policy = descriptor.retryPolicy;
159
- if (!policy)
160
- return 0;
161
- const initial = policy.initialDelayMs ?? 0;
162
- if (policy.backoff === "exponential") {
163
- const multiplier = policy.multiplier ?? 2;
164
- const computed = initial * Math.pow(multiplier, Math.max(0, failureCount - 1));
165
- return Math.min(policy.maxDelayMs ?? computed, computed);
166
- }
167
- if (policy.backoff === "linear") {
168
- const computed = initial * Math.max(1, failureCount);
169
- return Math.min(policy.maxDelayMs ?? computed, computed);
170
- }
171
- return initial;
172
- }
173
153
  /**
174
154
  * @param {TaskDescriptor} descriptor
175
155
  * @param {unknown} error
@@ -199,6 +179,67 @@ function isRetryableFailure(descriptor, error) {
199
179
  }
200
180
  return true;
201
181
  }
182
+ /**
183
+ * @param {unknown} error
184
+ * @returns {boolean}
185
+ */
186
+ function isTransientSessionFailure(error) {
187
+ const normalized = toSmithersError(error);
188
+ const code = error && typeof error === "object" && typeof error.code === "string"
189
+ ? error.code
190
+ : normalized.code;
191
+ return code === "SESSION_ERROR" ||
192
+ code === "TASK_TIMEOUT" ||
193
+ code === "TASK_HEARTBEAT_TIMEOUT" ||
194
+ code === "TASK_ABORTED" ||
195
+ normalized.details?.failureRetryable === true;
196
+ }
197
+ /**
198
+ * Build a human-readable diagnostic for a dependency deadlock: pending tasks
199
+ * that can never run because their `dependsOn` edges point at tasks missing from
200
+ * the graph or themselves permanently blocked. The most common cause is a
201
+ * `deps`/`needs` mismatch — a `deps={{ key: ... }}` whose key is not the upstream
202
+ * task's id and was not remapped with `needs={{ key: '<id>' }}`, which the Task
203
+ * component (deriveDepNodeIds) turns into a dependency on a non-existent node id.
204
+ * @param {SessionState} state
205
+ * @returns {string}
206
+ */
207
+ function describeDeadlock(state) {
208
+ const blocked = [];
209
+ let sawMissing = false;
210
+ for (const descriptor of state.descriptors.values()) {
211
+ const taskState = state.states.get(stateKeyFor(descriptor)) ?? "pending";
212
+ if (taskState !== "pending" && taskState !== "cancelled")
213
+ continue;
214
+ const unmet = [];
215
+ for (const depId of descriptor.dependsOn ?? []) {
216
+ const dep = state.descriptors.get(depId);
217
+ if (!dep) {
218
+ sawMissing = true;
219
+ unmet.push(`'${depId}' (no such task)`);
220
+ }
221
+ else {
222
+ const depState = state.states.get(stateKeyFor(dep)) ?? "pending";
223
+ unmet.push(`'${depId}' (${depState})`);
224
+ }
225
+ }
226
+ if (unmet.length > 0) {
227
+ blocked.push(` - '${descriptor.nodeId}' is blocked on ${unmet.join(", ")}`);
228
+ }
229
+ }
230
+ const lines = [
231
+ "Workflow deadlocked: no task can run, and none is waiting on an approval, event, timer, or retry.",
232
+ ];
233
+ if (blocked.length > 0) {
234
+ lines.push("Pending tasks and their unsatisfied dependencies:", ...blocked);
235
+ }
236
+ if (sawMissing) {
237
+ lines.push("", "A dependency marked '(no such task)' references a node id that is not a mounted task. " +
238
+ "If it came from deps={{ <key>: ... }}, the key is treated as the upstream task's id unless you remap it: " +
239
+ "add needs={{ <key>: '<upstream task id>' }} (or rename the upstream task to match the key).");
240
+ }
241
+ return lines.join("\n");
242
+ }
202
243
  /**
203
244
  * @param {unknown} error
204
245
  * @param {string} label
@@ -224,6 +265,7 @@ export function makeWorkflowSession(options = {}) {
224
265
  states: new Map(),
225
266
  outputs: new Map(),
226
267
  failures: new Map(),
268
+ failureDescriptors: new Map(),
227
269
  retryCounts: new Map(),
228
270
  retryWait: new Map(),
229
271
  approvals: new Set(),
@@ -231,6 +273,7 @@ export function makeWorkflowSession(options = {}) {
231
273
  schedule: null,
232
274
  cancelled: false,
233
275
  lastMountedSignature: null,
276
+ lastDeadlockSignature: null,
234
277
  };
235
278
  /**
236
279
  * @param {Pick<TaskOutput, "nodeId" | "iteration">} output
@@ -288,6 +331,7 @@ export function makeWorkflowSession(options = {}) {
288
331
  state.retryWait.delete(key);
289
332
  state.approvals.delete(key);
290
333
  state.retryCounts.delete(key);
334
+ state.failureDescriptors.delete(key);
291
335
  }
292
336
  }
293
337
  for (const ralph of ralphs) {
@@ -317,6 +361,7 @@ export function makeWorkflowSession(options = {}) {
317
361
  state.states.set(key, "finished");
318
362
  state.outputs.set(key, output);
319
363
  state.retryWait.delete(key);
364
+ state.failureDescriptors.delete(key);
320
365
  }
321
366
  /**
322
367
  * @param {number} [iteration]
@@ -367,7 +412,7 @@ export function makeWorkflowSession(options = {}) {
367
412
  const canRetry = retryable &&
368
413
  (descriptor.retries === Infinity || failureCount <= descriptor.retries);
369
414
  if (canRetry) {
370
- const delay = retryDelayMs(descriptor, failureCount);
415
+ const delay = computeRetryDelayMs(descriptor.retryPolicy, failureCount);
371
416
  state.states.set(key, "pending");
372
417
  if (delay > 0) {
373
418
  state.retryWait.set(key, nowMs() + delay);
@@ -379,6 +424,7 @@ export function makeWorkflowSession(options = {}) {
379
424
  }
380
425
  state.states.set(key, "failed");
381
426
  state.failures.set(key, error);
427
+ state.failureDescriptors.set(key, descriptor);
382
428
  return decide();
383
429
  }
384
430
  /**
@@ -387,11 +433,15 @@ export function makeWorkflowSession(options = {}) {
387
433
  function unhandledFailureDecision(recoveryKeys = new Set()) {
388
434
  for (const [key, taskState] of state.states) {
389
435
  const parsed = parseStateKey(key);
390
- const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration);
436
+ const descriptor = findDescriptor(state, parsed.nodeId, parsed.iteration) ??
437
+ state.failureDescriptors.get(key);
391
438
  if (taskState === "failed" && !descriptor?.continueOnFail) {
392
439
  if (recoveryKeys.has(key)) {
393
440
  continue;
394
441
  }
442
+ if (descriptor?.agent && isTransientSessionFailure(state.failures.get(key))) {
443
+ continue;
444
+ }
395
445
  return {
396
446
  _tag: "Failed",
397
447
  error: new SmithersError("SESSION_ERROR", `Task failed: ${descriptor?.nodeId ?? key}`, { key }, state.failures.get(key)),
@@ -497,26 +547,17 @@ export function makeWorkflowSession(options = {}) {
497
547
  if (existingWait) {
498
548
  return { _tag: "Wait", reason: existingWait };
499
549
  }
500
- if (schedule.pendingExists) {
501
- if (schedule.nextRetryAtMs != null) {
502
- return {
503
- _tag: "Wait",
504
- reason: {
505
- _tag: "RetryBackoff",
506
- waitMs: Math.max(0, schedule.nextRetryAtMs - nowMs()),
507
- },
508
- };
509
- }
510
- return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
511
- }
512
- if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
513
- return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
514
- }
515
- failure = unhandledFailureDecision(recoveryKeys);
516
- if (failure) {
517
- return failure;
518
- }
519
- if (schedule.readyRalphs.length > 0) {
550
+ if (schedule.readyRalphs.length > 0 && !unhandledFailureDecision(recoveryKeys)) {
551
+ // A ralph is ready only when every task in its own subtree is
552
+ // terminal, so pending or in-flight work elsewhere in the graph must
553
+ // not starve its next iteration (#267). Run-level continue-as-new
554
+ // handoffs stay quiescence-only: tearing down the run while sibling
555
+ // tasks are mid-flight is not safe, so those ralphs are deferred.
556
+ // An unhandled task failure keeps its precedence over further loop
557
+ // iterations (decide() already returns it at the top; this guard
558
+ // makes the ordering explicit).
559
+ const hasInProgress = [...state.states.values()].some((taskState) => taskState === "in-progress");
560
+ let advanced = false;
520
561
  for (const ralph of schedule.readyRalphs) {
521
562
  const current = state.ralphState.get(ralph.id) ?? {
522
563
  iteration: 0,
@@ -524,6 +565,7 @@ export function makeWorkflowSession(options = {}) {
524
565
  };
525
566
  if (ralph.until) {
526
567
  state.ralphState.set(ralph.id, { ...current, done: true });
568
+ advanced = true;
527
569
  continue;
528
570
  }
529
571
  const nextIteration = current.iteration + 1;
@@ -535,13 +577,18 @@ export function makeWorkflowSession(options = {}) {
535
577
  };
536
578
  }
537
579
  state.ralphState.set(ralph.id, { iteration: current.iteration, done: true });
580
+ advanced = true;
538
581
  continue;
539
582
  }
540
- state.ralphState.set(ralph.id, { iteration: nextIteration, done: false });
541
- if (ralph.continueAsNewEvery != null &&
583
+ const wantsContinueAsNew = ralph.continueAsNewEvery != null &&
542
584
  ralph.continueAsNewEvery > 0 &&
543
585
  nextIteration > 0 &&
544
- nextIteration % ralph.continueAsNewEvery === 0) {
586
+ nextIteration % ralph.continueAsNewEvery === 0;
587
+ if (wantsContinueAsNew && (hasInProgress || schedule.pendingExists)) {
588
+ continue;
589
+ }
590
+ state.ralphState.set(ralph.id, { iteration: nextIteration, done: false });
591
+ if (wantsContinueAsNew) {
545
592
  return {
546
593
  _tag: "ContinueAsNew",
547
594
  transition: {
@@ -551,8 +598,51 @@ export function makeWorkflowSession(options = {}) {
551
598
  },
552
599
  };
553
600
  }
601
+ advanced = true;
602
+ }
603
+ if (advanced) {
604
+ return { _tag: "ReRender", context: renderContext(state) };
605
+ }
606
+ }
607
+ if (schedule.pendingExists) {
608
+ if (schedule.nextRetryAtMs != null) {
609
+ return {
610
+ _tag: "Wait",
611
+ reason: {
612
+ _tag: "RetryBackoff",
613
+ waitMs: Math.max(0, schedule.nextRetryAtMs - nowMs()),
614
+ },
615
+ };
616
+ }
617
+ // Nothing is runnable, in flight, or waiting on an approval, event, or
618
+ // timer, yet tasks remain pending. They are blocked on dependencies
619
+ // nothing will ever satisfy — most often a deps/needs key that maps to
620
+ // a node id no task produces, which becomes a dependsOn on a missing
621
+ // node. Returning Wait here suspends the run forever with no error.
622
+ // Give a reactive re-render one chance to mount a producer (the mounted
623
+ // signature changes), then fail loudly with a diagnostic.
624
+ const noInProgress = ![...state.states.values()].some((taskState) => taskState === "in-progress");
625
+ if (noInProgress) {
626
+ if (options.requireStableFinish && state.graph) {
627
+ const signature = mountedSignature(state.graph);
628
+ if (state.lastDeadlockSignature !== signature) {
629
+ state.lastDeadlockSignature = signature;
630
+ return { _tag: "ReRender", context: renderContext(state) };
631
+ }
632
+ }
633
+ return {
634
+ _tag: "Failed",
635
+ error: new SmithersError("DEPENDENCY_DEADLOCK", describeDeadlock(state)),
636
+ };
554
637
  }
555
- return { _tag: "ReRender", context: renderContext(state) };
638
+ return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
639
+ }
640
+ if ([...state.states.values()].some((taskState) => taskState === "in-progress")) {
641
+ return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
642
+ }
643
+ failure = unhandledFailureDecision(recoveryKeys);
644
+ if (failure) {
645
+ return failure;
556
646
  }
557
647
  if (options.requireStableFinish && state.graph) {
558
648
  const signature = mountedSignature(state.graph);