npm - @smithers-orchestrator/scheduler - Versions diffs - 0.25.0 → 0.25.2 - Mend

@smithers-orchestrator/scheduler 0.25.0 → 0.25.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +3 -3
package/src/RunResult.ts +16 -0
package/src/index.d.ts +16 -0
package/src/makeWorkflowSession.js +42 -10
package/src/scheduleTasks.js +25 -2

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@smithers-orchestrator/scheduler",
-  "version": "0.25.0",
+  "version": "0.25.2",
   "description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
   "type": "module",
   "sideEffects": false,
@@ -176,8 +176,8 @@
   ],
   "dependencies": {
     "effect": "^3.21.1",
-    "@smithers-orchestrator/errors": "0.25.0",
-    "@smithers-orchestrator/graph": "0.25.0"
+    "@smithers-orchestrator/errors": "0.25.2",
+    "@smithers-orchestrator/graph": "0.25.2"
   },
   "devDependencies": {
     "@types/bun": "latest",

package/src/RunResult.ts CHANGED Viewed

@@ -13,4 +13,20 @@ export type RunResult = {
   readonly output?: unknown;
   readonly error?: unknown;
   readonly nextRunId?: string;
+  /**
+   * Number of tasks that ended in a `failed` state yet did not fail the run —
+   * "masked" child failures the run-level status cannot express. Present (and
+   * `> 0`) only on a `finished` result that tolerated at least one failure
+   * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
+   * agent task that failed transiently: rate limit, timeout, abort). A binary
+   * `finished` status would otherwise read as a clean success. See
+   * `docs/runtime/run-state.mdx`.
+   */
+  readonly failedChildren?: number;
+  /**
+   * Task state keys (`nodeId::iteration`) of the tasks counted by
+   * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
+   * across loop/Ralph iterations.
+   */
+  readonly failedChildKeys?: readonly string[];
 };

package/src/index.d.ts CHANGED Viewed

@@ -138,6 +138,22 @@ type RunResult$1 = {
     readonly output?: unknown;
     readonly error?: unknown;
     readonly nextRunId?: string;
+    /**
+     * Number of tasks that ended in a `failed` state yet did not fail the run —
+     * "masked" child failures the run-level status cannot express. Present (and
+     * `> 0`) only on a `finished` result that tolerated at least one failure
+     * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
+     * agent task that failed transiently: rate limit, timeout, abort). A binary
+     * `finished` status would otherwise read as a clean success. See
+     * `docs/runtime/run-state.mdx`.
+     */
+    readonly failedChildren?: number;
+    /**
+     * Task state keys (`nodeId::iteration`) of the tasks counted by
+     * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
+     * across loop/Ralph iterations.
+     */
+    readonly failedChildKeys?: readonly string[];
 };
 type WaitReason$1 = {

package/src/makeWorkflowSession.js CHANGED Viewed

@@ -353,14 +353,37 @@ export function makeWorkflowSession(options = {}) {
    * @returns {EngineDecision}
    */
     function finishedResult(status = "finished") {
-        return {
-            _tag: "Finished",
-            result: {
-                runId: state.runId,
-                status,
-                output: [...state.outputs.values()].at(-1)?.output,
-            },
+        /** @type {RunResult} */
+        const result = {
+            runId: state.runId,
+            status,
+            output: [...state.outputs.values()].at(-1)?.output,
         };
+        if (status === "finished") {
+            // At a `finished` terminal, any task still in `failed` state is a
+            // *tolerated* failure — an unhandled one would have produced a `Failed`
+            // decision via unhandledFailureDecision() and never reached here. Those
+            // are exactly the masked children (continueOnFail tasks, transient agent
+            // failures) the binary run status cannot express. Surface them so callers
+            // can detect a run that "succeeded" while children failed. See issue #295
+            // and docs/runtime/run-state.mdx.
+            //
+            // Keys are the canonical task state keys (`nodeId::iteration`), not bare
+            // node ids: a looped/Ralph workflow can fail the same nodeId across
+            // iterations, and the iteration is what disambiguates which child to
+            // inspect.
+            const failedChildKeys = [];
+            for (const [key, taskState] of state.states) {
+                if (taskState === "failed") {
+                    failedChildKeys.push(key);
+                }
+            }
+            if (failedChildKeys.length > 0) {
+                result.failedChildren = failedChildKeys.length;
+                result.failedChildKeys = failedChildKeys;
+            }
+        }
+        return { _tag: "Finished", result };
     }
     /**
    * @returns {ScheduleResult}
@@ -574,14 +597,23 @@ export function makeWorkflowSession(options = {}) {
         };
     }
     /**
-   * @param {number} [depth] recursion depth; guarded at 10 to catch decision cycles
+   * @param {number} [depth] recursion depth; a safety net for a true decision
+   *   cycle (a non-monotonic transition bug)
    * @returns {EngineDecision}
    */
     function decide(depth = 0) {
-        if (depth > 10) {
+        // Each recursion below only fires when `changed` is true, i.e. at least
+        // one task moved to a terminal/in-progress/waiting state — monotonic
+        // forward progress. A legitimate chain can therefore be as long as the
+        // number of tasks: e.g. a <Sequence> of N skipIf steps yields exactly one
+        // skip per pass (#bug: 11+ such steps tripped a hard constant-10 guard and
+        // failed a perfectly valid run). Bound by the task count + slack instead;
+        // a genuine cycle keeps recursing past the point where every task settled.
+        const maxDecideDepth = state.descriptors.size + 10;
+        if (depth > maxDecideDepth) {
             return {
                 _tag: "Failed",
-                error: new SmithersError("SCHEDULER_ERROR", "Exceeded scheduler decide() depth guard.", { depth }),
+                error: new SmithersError("SCHEDULER_ERROR", "Exceeded scheduler decide() depth guard.", { depth, maxDepth: maxDecideDepth }),
             };
         }
         if (state.cancelled) {

package/src/scheduleTasks.js CHANGED Viewed

@@ -398,8 +398,20 @@ export function scheduleTasks(plan, states, descriptors, ralphState, retryWait,
                     const status = inspect(child, {
                         includeContinuedFailures: true,
                     });
-                    if (!status.terminal)
+                    if (!status.terminal) {
+                        // A failure already present in this still-running action
+                        // subtree (e.g. a failed task in a <Parallel> whose sibling
+                        // is still in flight) must be recorded as recoverable now.
+                        // Otherwise decide()'s unhandled-failure check fails the run
+                        // before the action region settles and the saga's
+                        // compensation can run — an order-dependent bug that only
+                        // bites when the failing task settles before its sibling.
+                        const before = failureRecoveryKeys.size;
+                        collectFailureKeys(child, { includeContinuedFailures: true });
+                        if (failureRecoveryKeys.size > before)
+                            failureRecoveryActive = true;
                         return walk(child);
+                    }
                     if (status.failed) {
                         failed = true;
                         break;
@@ -448,8 +460,19 @@ export function scheduleTasks(plan, states, descriptors, ralphState, retryWait,
                     const status = inspect(child, {
                         includeContinuedFailures: true,
                     });
-                    if (!status.terminal)
+                    if (!status.terminal) {
+                        // A failure already present in this still-running try child
+                        // (e.g. a failed task in a <Parallel> whose sibling is still
+                        // in flight) must be recorded as recoverable now, or decide()
+                        // fails the run before the try region settles — skipping
+                        // catch AND finally. Deferring here lets the region finish so
+                        // catch/finally run regardless of which task settles first.
+                        const before = failureRecoveryKeys.size;
+                        collectFailureKeys(child, { includeContinuedFailures: true });
+                        if (failureRecoveryKeys.size > before)
+                            failureRecoveryActive = true;
                         return walk(child);
+                    }
                     if (status.failed) {
                         tryFailed = true;
                         break;