@smithers-orchestrator/scheduler 0.25.0 → 0.25.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@smithers-orchestrator/scheduler",
3
- "version": "0.25.0",
3
+ "version": "0.25.2",
4
4
  "description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -176,8 +176,8 @@
176
176
  ],
177
177
  "dependencies": {
178
178
  "effect": "^3.21.1",
179
- "@smithers-orchestrator/errors": "0.25.0",
180
- "@smithers-orchestrator/graph": "0.25.0"
179
+ "@smithers-orchestrator/errors": "0.25.2",
180
+ "@smithers-orchestrator/graph": "0.25.2"
181
181
  },
182
182
  "devDependencies": {
183
183
  "@types/bun": "latest",
package/src/RunResult.ts CHANGED
@@ -13,4 +13,20 @@ export type RunResult = {
13
13
  readonly output?: unknown;
14
14
  readonly error?: unknown;
15
15
  readonly nextRunId?: string;
16
+ /**
17
+ * Number of tasks that ended in a `failed` state yet did not fail the run —
18
+ * "masked" child failures the run-level status cannot express. Present (and
19
+ * `> 0`) only on a `finished` result that tolerated at least one failure
20
+ * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
21
+ * agent task that failed transiently: rate limit, timeout, abort). A binary
22
+ * `finished` status would otherwise read as a clean success. See
23
+ * `docs/runtime/run-state.mdx`.
24
+ */
25
+ readonly failedChildren?: number;
26
+ /**
27
+ * Task state keys (`nodeId::iteration`) of the tasks counted by
28
+ * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
29
+ * across loop/Ralph iterations.
30
+ */
31
+ readonly failedChildKeys?: readonly string[];
16
32
  };
package/src/index.d.ts CHANGED
@@ -138,6 +138,22 @@ type RunResult$1 = {
138
138
  readonly output?: unknown;
139
139
  readonly error?: unknown;
140
140
  readonly nextRunId?: string;
141
+ /**
142
+ * Number of tasks that ended in a `failed` state yet did not fail the run —
143
+ * "masked" child failures the run-level status cannot express. Present (and
144
+ * `> 0`) only on a `finished` result that tolerated at least one failure
145
+ * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
146
+ * agent task that failed transiently: rate limit, timeout, abort). A binary
147
+ * `finished` status would otherwise read as a clean success. See
148
+ * `docs/runtime/run-state.mdx`.
149
+ */
150
+ readonly failedChildren?: number;
151
+ /**
152
+ * Task state keys (`nodeId::iteration`) of the tasks counted by
153
+ * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
154
+ * across loop/Ralph iterations.
155
+ */
156
+ readonly failedChildKeys?: readonly string[];
141
157
  };
142
158
 
143
159
  type WaitReason$1 = {
@@ -353,14 +353,37 @@ export function makeWorkflowSession(options = {}) {
353
353
  * @returns {EngineDecision}
354
354
  */
355
355
  function finishedResult(status = "finished") {
356
- return {
357
- _tag: "Finished",
358
- result: {
359
- runId: state.runId,
360
- status,
361
- output: [...state.outputs.values()].at(-1)?.output,
362
- },
356
+ /** @type {RunResult} */
357
+ const result = {
358
+ runId: state.runId,
359
+ status,
360
+ output: [...state.outputs.values()].at(-1)?.output,
363
361
  };
362
+ if (status === "finished") {
363
+ // At a `finished` terminal, any task still in `failed` state is a
364
+ // *tolerated* failure — an unhandled one would have produced a `Failed`
365
+ // decision via unhandledFailureDecision() and never reached here. Those
366
+ // are exactly the masked children (continueOnFail tasks, transient agent
367
+ // failures) the binary run status cannot express. Surface them so callers
368
+ // can detect a run that "succeeded" while children failed. See issue #295
369
+ // and docs/runtime/run-state.mdx.
370
+ //
371
+ // Keys are the canonical task state keys (`nodeId::iteration`), not bare
372
+ // node ids: a looped/Ralph workflow can fail the same nodeId across
373
+ // iterations, and the iteration is what disambiguates which child to
374
+ // inspect.
375
+ const failedChildKeys = [];
376
+ for (const [key, taskState] of state.states) {
377
+ if (taskState === "failed") {
378
+ failedChildKeys.push(key);
379
+ }
380
+ }
381
+ if (failedChildKeys.length > 0) {
382
+ result.failedChildren = failedChildKeys.length;
383
+ result.failedChildKeys = failedChildKeys;
384
+ }
385
+ }
386
+ return { _tag: "Finished", result };
364
387
  }
365
388
  /**
366
389
  * @returns {ScheduleResult}
@@ -574,14 +597,23 @@ export function makeWorkflowSession(options = {}) {
574
597
  };
575
598
  }
576
599
  /**
577
- * @param {number} [depth] recursion depth; guarded at 10 to catch decision cycles
600
+ * @param {number} [depth] recursion depth; a safety net for a true decision
601
+ * cycle (a non-monotonic transition bug)
578
602
  * @returns {EngineDecision}
579
603
  */
580
604
  function decide(depth = 0) {
581
- if (depth > 10) {
605
+ // Each recursion below only fires when `changed` is true, i.e. at least
606
+ // one task moved to a terminal/in-progress/waiting state — monotonic
607
+ // forward progress. A legitimate chain can therefore be as long as the
608
+ // number of tasks: e.g. a <Sequence> of N skipIf steps yields exactly one
609
+ // skip per pass (#bug: 11+ such steps tripped a hard constant-10 guard and
610
+ // failed a perfectly valid run). Bound by the task count + slack instead;
611
+ // a genuine cycle keeps recursing past the point where every task settled.
612
+ const maxDecideDepth = state.descriptors.size + 10;
613
+ if (depth > maxDecideDepth) {
582
614
  return {
583
615
  _tag: "Failed",
584
- error: new SmithersError("SCHEDULER_ERROR", "Exceeded scheduler decide() depth guard.", { depth }),
616
+ error: new SmithersError("SCHEDULER_ERROR", "Exceeded scheduler decide() depth guard.", { depth, maxDepth: maxDecideDepth }),
585
617
  };
586
618
  }
587
619
  if (state.cancelled) {
@@ -398,8 +398,20 @@ export function scheduleTasks(plan, states, descriptors, ralphState, retryWait,
398
398
  const status = inspect(child, {
399
399
  includeContinuedFailures: true,
400
400
  });
401
- if (!status.terminal)
401
+ if (!status.terminal) {
402
+ // A failure already present in this still-running action
403
+ // subtree (e.g. a failed task in a <Parallel> whose sibling
404
+ // is still in flight) must be recorded as recoverable now.
405
+ // Otherwise decide()'s unhandled-failure check fails the run
406
+ // before the action region settles and the saga's
407
+ // compensation can run — an order-dependent bug that only
408
+ // bites when the failing task settles before its sibling.
409
+ const before = failureRecoveryKeys.size;
410
+ collectFailureKeys(child, { includeContinuedFailures: true });
411
+ if (failureRecoveryKeys.size > before)
412
+ failureRecoveryActive = true;
402
413
  return walk(child);
414
+ }
403
415
  if (status.failed) {
404
416
  failed = true;
405
417
  break;
@@ -448,8 +460,19 @@ export function scheduleTasks(plan, states, descriptors, ralphState, retryWait,
448
460
  const status = inspect(child, {
449
461
  includeContinuedFailures: true,
450
462
  });
451
- if (!status.terminal)
463
+ if (!status.terminal) {
464
+ // A failure already present in this still-running try child
465
+ // (e.g. a failed task in a <Parallel> whose sibling is still
466
+ // in flight) must be recorded as recoverable now, or decide()
467
+ // fails the run before the try region settles — skipping
468
+ // catch AND finally. Deferring here lets the region finish so
469
+ // catch/finally run regardless of which task settles first.
470
+ const before = failureRecoveryKeys.size;
471
+ collectFailureKeys(child, { includeContinuedFailures: true });
472
+ if (failureRecoveryKeys.size > before)
473
+ failureRecoveryActive = true;
452
474
  return walk(child);
475
+ }
453
476
  if (status.failed) {
454
477
  tryFailed = true;
455
478
  break;