npm - @smithers-orchestrator/scheduler - Versions diffs - 0.24.2 → 0.25.1 - Mend

@smithers-orchestrator/scheduler 0.24.2 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/package.json +3 -3
package/src/RenderContext.ts +16 -0
package/src/RunResult.ts +18 -1
package/src/TaskState.ts +1 -0
package/src/WaitReason.ts +6 -1
package/src/WorkflowSessionLive.js +12 -1
package/src/index.d.ts +85 -30
package/src/makeWorkflowSession.js +191 -72
package/src/scheduleTasks.js +1 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@smithers-orchestrator/scheduler",
-  "version": "0.24.2",
+  "version": "0.25.1",
   "description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
   "type": "module",
   "sideEffects": false,
@@ -176,8 +176,8 @@
   ],
   "dependencies": {
     "effect": "^3.21.1",
-    "@smithers-orchestrator/errors": "0.24.2",
-    "@smithers-orchestrator/graph": "0.24.2"
+    "@smithers-orchestrator/errors": "0.25.1",
+    "@smithers-orchestrator/graph": "0.25.1"
   },
   "devDependencies": {
     "@types/bun": "latest",

package/src/RenderContext.ts CHANGED

@@ -1,6 +1,21 @@
 import type { WorkflowGraph } from "@smithers-orchestrator/graph";
 import type { TaskOutput } from "./TaskOutput.ts";
+export type RenderTriggerReason =
+  | "task-finished"
+  | "timer-fired"
+  | "cache-resolved"
+  | "loop-advanced"
+  | "deadlock-check"
+  | "stability-check"
+  | (string & {});
+export type RenderTrigger = {
+  readonly reason: RenderTriggerReason;
+  readonly nodeId?: string;
+  readonly iteration?: number;
+};
 export type RenderContext = {
   readonly runId: string;
   readonly graph?: WorkflowGraph | null;
@@ -11,4 +26,5 @@ export type RenderContext = {
   readonly auth?: unknown;
   readonly taskStates?: unknown;
   readonly ralphIterations?: ReadonlyMap<string, number>;
+  readonly trigger?: RenderTrigger;
 };

package/src/RunResult.ts CHANGED

@@ -8,8 +8,25 @@ export type RunResult = {
     | "continued"
     | "waiting-approval"
     | "waiting-event"
-    | "waiting-timer";
+    | "waiting-timer"
+    | "waiting-quota";
   readonly output?: unknown;
   readonly error?: unknown;
   readonly nextRunId?: string;
+  /**
+   * Number of tasks that ended in a `failed` state yet did not fail the run —
+   * "masked" child failures the run-level status cannot express. Present (and
+   * `> 0`) only on a `finished` result that tolerated at least one failure
+   * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
+   * agent task that failed transiently: rate limit, timeout, abort). A binary
+   * `finished` status would otherwise read as a clean success. See
+   * `docs/runtime/run-state.mdx`.
+   */
+  readonly failedChildren?: number;
+  /**
+   * Task state keys (`nodeId::iteration`) of the tasks counted by
+   * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
+   * across loop/Ralph iterations.
+   */
+  readonly failedChildKeys?: readonly string[];
 };

package/src/TaskState.ts CHANGED

@@ -3,6 +3,7 @@ export type TaskState =
   | "waiting-approval"
   | "waiting-event"
   | "waiting-timer"
+  | "waiting-quota"
   | "in-progress"
   | "finished"
   | "failed"

package/src/WaitReason.ts CHANGED

@@ -5,4 +5,9 @@ export type WaitReason =
   | { readonly _tag: "RetryBackoff"; readonly waitMs: number }
   | { readonly _tag: "HotReload" }
   | { readonly _tag: "OrphanRecovery"; readonly count: number }
-  | { readonly _tag: "ExternalTrigger" };
+  | { readonly _tag: "ExternalTrigger" }
+  | {
+      readonly _tag: "Quota";
+      readonly quotaBlockedCount: number;
+      readonly resetAtMs?: number;
+    };

package/src/WorkflowSessionLive.js CHANGED

@@ -2,5 +2,16 @@ import { Layer } from "effect";
 import { WorkflowSession } from "./WorkflowSession.js";
 import { makeWorkflowSession } from "./makeWorkflowSession.js";
-/** @type {Layer.Layer<WorkflowSession, never, never>} */
+/**
+ * WARNING — do not consume this layer as-is. `Layer.sync` builds **one** shared
+ * `makeWorkflowSession()` instance for the whole layer scope, but a workflow
+ * session carries per-run state, so sharing it across runs is a correctness bug.
+ * The engine intentionally bypasses this Tag and constructs a fresh session per
+ * run via `makeWorkflowSession()` directly — which is why nothing yields
+ * `WorkflowSession` today. Before any consumer reads the Tag, rework this into a
+ * per-run/scoped provider (e.g. `Layer.scoped` or a factory service) so each run
+ * gets its own session.
+ *
+ * @type {Layer.Layer<WorkflowSession, never, never>}
+ */
 export const WorkflowSessionLive = Layer.sync(WorkflowSession, makeWorkflowSession);

package/src/index.d.ts CHANGED

@@ -1,9 +1,10 @@
 import * as effect from 'effect';
-import { Context, Layer, Effect, Schedule } from 'effect';
-import * as _smithers_graph from '@smithers-orchestrator/graph';
+import { Context, Layer, Effect, Schedule as Schedule$1 } from 'effect';
+import * as _smithers_orchestrator_graph from '@smithers-orchestrator/graph';
 import { TaskDescriptor as TaskDescriptor$3, WorkflowGraph } from '@smithers-orchestrator/graph';
+import { TaskDescriptor as TaskDescriptor$4 } from '@smithers-orchestrator/graph/TaskDescriptor';
-type TaskState$2 = "pending" | "waiting-approval" | "waiting-event" | "waiting-timer" | "in-progress" | "finished" | "failed" | "cancelled" | "skipped";
+type TaskState$2 = "pending" | "waiting-approval" | "waiting-event" | "waiting-timer" | "waiting-quota" | "in-progress" | "finished" | "failed" | "cancelled" | "skipped";
 type TaskStateMap$4 = Map<string, TaskState$2>;
@@ -112,6 +113,12 @@ type TaskOutput$1 = {
     readonly usage?: TokenUsage$1 | null;
 };
+type RenderTriggerReason = "task-finished" | "timer-fired" | "cache-resolved" | "loop-advanced" | "deadlock-check" | "stability-check" | (string & {});
+type RenderTrigger = {
+    readonly reason: RenderTriggerReason;
+    readonly nodeId?: string;
+    readonly iteration?: number;
+};
 type RenderContext$1 = {
     readonly runId: string;
     readonly graph?: WorkflowGraph | null;
@@ -122,14 +129,31 @@ type RenderContext$1 = {
     readonly auth?: unknown;
     readonly taskStates?: unknown;
     readonly ralphIterations?: ReadonlyMap<string, number>;
+    readonly trigger?: RenderTrigger;
 };
 type RunResult$1 = {
     readonly runId: string;
-    readonly status: "running" | "finished" | "failed" | "cancelled" | "continued" | "waiting-approval" | "waiting-event" | "waiting-timer";
+    readonly status: "running" | "finished" | "failed" | "cancelled" | "continued" | "waiting-approval" | "waiting-event" | "waiting-timer" | "waiting-quota";
     readonly output?: unknown;
     readonly error?: unknown;
     readonly nextRunId?: string;
+    /**
+     * Number of tasks that ended in a `failed` state yet did not fail the run —
+     * "masked" child failures the run-level status cannot express. Present (and
+     * `> 0`) only on a `finished` result that tolerated at least one failure
+     * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
+     * agent task that failed transiently: rate limit, timeout, abort). A binary
+     * `finished` status would otherwise read as a clean success. See
+     * `docs/runtime/run-state.mdx`.
+     */
+    readonly failedChildren?: number;
+    /**
+     * Task state keys (`nodeId::iteration`) of the tasks counted by
+     * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
+     * across loop/Ralph iterations.
+     */
+    readonly failedChildKeys?: readonly string[];
 };
 type WaitReason$1 = {
@@ -151,6 +175,10 @@ type WaitReason$1 = {
     readonly count: number;
 } | {
     readonly _tag: "ExternalTrigger";
+} | {
+    readonly _tag: "Quota";
+    readonly quotaBlockedCount: number;
+    readonly resetAtMs?: number;
 };
 type EngineDecision$1 = {
@@ -199,7 +227,8 @@ type WorkflowSessionService$2 = {
     readonly getCurrentGraph: () => Effect.Effect<WorkflowGraph | null>;
 };
-type AspectBudgetBreach$1 = {
+/** A breached Aspects budget for a task that is about to be dispatched. */
+type AspectBudgetBreach = {
     readonly kind: "tokens" | "latency";
     readonly limit: number;
     readonly current: number;
@@ -214,9 +243,16 @@ type WorkflowSessionOptions$2 = {
         readonly iteration: number;
         readonly done: boolean;
     }>;
-    readonly evaluateAspectBudget?: (descriptor: TaskDescriptor$3) => AspectBudgetBreach$1 | null | undefined;
-    readonly onAspectBudgetSkip?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
-    readonly onAspectBudgetWarn?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
+    /**
+     * Evaluate a runnable task's Aspects budgets against the run's accumulated
+     * usage. Return the first breach, or `null`/`undefined` when within budget.
+     * Only invoked for tasks that would otherwise execute.
+     */
+    readonly evaluateAspectBudget?: (descriptor: TaskDescriptor$4) => AspectBudgetBreach | null | undefined;
+    /** Called when a task is skipped because its budget was exceeded (`skip-remaining`). */
+    readonly onAspectBudgetSkip?: (descriptor: TaskDescriptor$4, breach: AspectBudgetBreach) => void;
+    /** Called when a task continues despite an exceeded budget (`warn`). */
+    readonly onAspectBudgetWarn?: (descriptor: TaskDescriptor$4, breach: AspectBudgetBreach) => void;
 };
 type TaskRecord$1 = {
@@ -227,10 +263,10 @@ type TaskRecord$1 = {
     readonly updatedAtMs: number;
 };
-type SmithersAlertSeverity = "info" | "warning" | "critical";
-type SmithersAlertLabels = Record<string, string>;
-type SmithersAlertReactionKind = "emit-only" | "pause" | "cancel" | "open-approval" | "deliver";
-type SmithersAlertReaction = {
+type SmithersAlertSeverity$1 = "info" | "warning" | "critical";
+type SmithersAlertLabels$1 = Record<string, string>;
+type SmithersAlertReactionKind$1 = "emit-only" | "pause" | "cancel" | "open-approval" | "deliver";
+type SmithersAlertReaction$1 = {
     kind: "emit-only";
 } | {
     kind: "pause";
@@ -242,24 +278,24 @@ type SmithersAlertReaction = {
     kind: "deliver";
     destination: string;
 };
-type SmithersAlertReactionRef = string | SmithersAlertReaction;
-type SmithersAlertPolicyDefaults = {
+type SmithersAlertReactionRef$1 = string | SmithersAlertReaction$1;
+type SmithersAlertPolicyDefaults$1 = {
     owner?: string;
-    severity?: SmithersAlertSeverity;
+    severity?: SmithersAlertSeverity$1;
     runbook?: string;
-    labels?: SmithersAlertLabels;
+    labels?: SmithersAlertLabels$1;
 };
-type SmithersAlertPolicyRule = SmithersAlertPolicyDefaults & {
+type SmithersAlertPolicyRule$1 = SmithersAlertPolicyDefaults$1 & {
     afterMs?: number;
-    reaction?: SmithersAlertReactionRef;
+    reaction?: SmithersAlertReactionRef$1;
 };
-type SmithersAlertPolicy = {
-    defaults?: SmithersAlertPolicyDefaults;
-    rules?: Record<string, SmithersAlertPolicyRule>;
-    reactions?: Record<string, SmithersAlertReaction>;
+type SmithersAlertPolicy$1 = {
+    defaults?: SmithersAlertPolicyDefaults$1;
+    rules?: Record<string, SmithersAlertPolicyRule$1>;
+    reactions?: Record<string, SmithersAlertReaction$1>;
 };
 type SmithersWorkflowOptions$1 = {
-    alertPolicy?: SmithersAlertPolicy;
+    alertPolicy?: SmithersAlertPolicy$1;
     cache?: boolean;
     workflowHash?: string;
 };
@@ -281,7 +317,7 @@ type RalphState$1 = {
 type RalphStateMap$4 = Map<string, RalphState$1>;
-type CachePolicy$1<Ctx = any> = {
+type CachePolicy$1<Ctx = unknown> = {
     by?: (ctx: Ctx) => unknown;
     version?: string;
     key?: string;
@@ -324,12 +360,12 @@ type TaskStateMap$3 = TaskStateMap$4;
  * @returns {boolean}
  */
 declare function isTerminalState(state: TaskState$1, descriptor?: Pick<TaskDescriptor$2, "continueOnFail">): boolean;
-type TaskDescriptor$2 = _smithers_graph.TaskDescriptor;
+type TaskDescriptor$2 = _smithers_orchestrator_graph.TaskDescriptor;
 type TaskState$1 = TaskState$2;
 declare class Scheduler extends Context.TagClassShape<"Scheduler", SchedulerService> {
 }
-type TaskDescriptor$1 = _smithers_graph.TaskDescriptor;
+type TaskDescriptor$1 = _smithers_orchestrator_graph.TaskDescriptor;
 type TaskStateMap$2 = TaskStateMap$4;
 type PlanNode$3 = PlanNode$4;
 type RalphStateMap$3 = RalphStateMap$4;
@@ -354,7 +390,7 @@ declare function buildPlanTree(xml: XmlNode | null, ralphState?: RalphStateMap$2
 type PlanNode$2 = PlanNode$4;
 type RalphMeta$1 = RalphMeta$2;
 type RalphStateMap$2 = RalphStateMap$4;
-type XmlNode = _smithers_graph.XmlNode;
+type XmlNode = _smithers_orchestrator_graph.XmlNode;
 /**
  * @param {PlanNode | null} plan
@@ -370,7 +406,7 @@ type PlanNode$1 = PlanNode$4;
 type RalphStateMap$1 = RalphStateMap$4;
 type RetryWaitMap$1 = RetryWaitMap$3;
 type ScheduleResult$1 = ScheduleResult$3;
-type TaskDescriptor = _smithers_graph.TaskDescriptor;
+type TaskDescriptor = _smithers_orchestrator_graph.TaskDescriptor;
 type TaskStateMap$1 = TaskStateMap$4;
 declare class WorkflowSession extends Context.TagClassShape<"WorkflowSession", WorkflowSessionService$2> {
@@ -384,7 +420,18 @@ declare function makeWorkflowSession(options?: WorkflowSessionOptions$1): Workfl
 type WorkflowSessionOptions$1 = WorkflowSessionOptions$2;
 type WorkflowSessionService$1 = WorkflowSessionService$2;
-/** @type {Layer.Layer<WorkflowSession, never, never>} */
+/**
+ * WARNING — do not consume this layer as-is. `Layer.sync` builds **one** shared
+ * `makeWorkflowSession()` instance for the whole layer scope, but a workflow
+ * session carries per-run state, so sharing it across runs is a correctness bug.
+ * The engine intentionally bypasses this Tag and constructs a fresh session per
+ * run via `makeWorkflowSession()` directly — which is why nothing yields
+ * `WorkflowSession` today. Before any consumer reads the Tag, rework this into a
+ * per-run/scoped provider (e.g. `Layer.scoped` or a factory service) so each run
+ * gets its own session.
+ *
+ * @type {Layer.Layer<WorkflowSession, never, never>}
+ */
 declare const WorkflowSessionLive: Layer.Layer<WorkflowSession, never, never>;
 /**
@@ -398,7 +445,7 @@ declare function nowMs(): number;
  * @param {RetryPolicy} policy
  * @returns {Schedule.Schedule<unknown>}
  */
-declare function retryPolicyToSchedule(policy: RetryPolicy$2): Schedule.Schedule<unknown>;
+declare function retryPolicyToSchedule(policy: RetryPolicy$2): Schedule$1.Schedule<unknown>;
 type RetryPolicy$2 = RetryPolicy$3;
 /**
@@ -434,6 +481,14 @@ type RetryWaitMap = RetryWaitMap$3;
 type RunResult = RunResult$1;
 type ScheduleResult = ScheduleResult$3;
 type ScheduleSnapshot = ScheduleSnapshot$1;
+type SmithersAlertLabels = SmithersAlertLabels$1;
+type SmithersAlertPolicy = SmithersAlertPolicy$1;
+type SmithersAlertPolicyDefaults = SmithersAlertPolicyDefaults$1;
+type SmithersAlertPolicyRule = SmithersAlertPolicyRule$1;
+type SmithersAlertReaction = SmithersAlertReaction$1;
+type SmithersAlertReactionKind = SmithersAlertReactionKind$1;
+type SmithersAlertReactionRef = SmithersAlertReactionRef$1;
+type SmithersAlertSeverity = SmithersAlertSeverity$1;
 type SmithersWorkflowOptions = SmithersWorkflowOptions$1;
 type TaskFailure = TaskFailure$1;
 type TaskOutput = TaskOutput$1;

package/src/makeWorkflowSession.js CHANGED

@@ -65,9 +65,10 @@ function mountedSignature(graph) {
 /**
  * @param {SessionState} state
  * @param {number} [iterationOverride]
+ * @param {RenderContext["trigger"]} [trigger]
  * @returns {RenderContext}
  */
-function renderContext(state, iterationOverride) {
+function renderContext(state, iterationOverride, trigger) {
     const ralphIterations = [...state.ralphState.values()].map((value) => value.iteration);
     return {
         runId: state.runId,
@@ -77,6 +78,7 @@ function renderContext(state, iterationOverride) {
         taskStates: cloneTaskStateMap(state.states),
         outputs: new Map(state.outputs),
         ralphIterations: new Map([...state.ralphState.entries()].map(([id, value]) => [id, value.iteration])),
+        ...(trigger ? { trigger } : {}),
     };
 }
 /**
@@ -85,23 +87,48 @@ function renderContext(state, iterationOverride) {
  * @returns {WaitReason | undefined}
  */
 function findWaitingReason(state, currentTimeMs) {
+    // Do a full pass to accumulate quota count and find the highest-priority
+    // non-quota wait reason. This prevents an early-return from shadowing
+    // quota-blocked tasks when mixed wait types coexist in the same run.
+    let primaryReason;
+    let quotaBlockedCount = 0;
+    let earliestQuotaResetAtMs;
     for (const descriptor of state.descriptors.values()) {
         const taskState = state.states.get(stateKeyFor(descriptor));
-        if (taskState === "waiting-approval") {
-            return { _tag: "Approval", nodeId: descriptor.nodeId };
+        if (taskState === "waiting-approval" && !primaryReason) {
+            primaryReason = { _tag: "Approval", nodeId: descriptor.nodeId };
         }
-        if (taskState === "waiting-event") {
+        else if (taskState === "waiting-event" && !primaryReason) {
             const eventName = typeof descriptor.meta?.__eventName === "string"
                 ? descriptor.meta.__eventName
                 : "";
-            return { _tag: "Event", eventName };
+            primaryReason = { _tag: "Event", eventName };
         }
-        if (taskState === "waiting-timer") {
-            return {
+        else if (taskState === "waiting-timer" && !primaryReason) {
+            primaryReason = {
                 _tag: "Timer",
                 resumeAtMs: timerResumeAtMs(descriptor, currentTimeMs),
             };
         }
+        else if (taskState === "waiting-quota") {
+            quotaBlockedCount += 1;
+            const resetAtMs = state.quotaResetTimes.get(stateKeyFor(descriptor));
+            if (resetAtMs != null) {
+                earliestQuotaResetAtMs = earliestQuotaResetAtMs == null
+                    ? resetAtMs
+                    : Math.min(earliestQuotaResetAtMs, resetAtMs);
+            }
+        }
+    }
+    if (primaryReason) {
+        return primaryReason;
+    }
+    if (quotaBlockedCount > 0) {
+        return {
+            _tag: "Quota",
+            quotaBlockedCount,
+            ...(earliestQuotaResetAtMs != null ? { resetAtMs: earliestQuotaResetAtMs } : {}),
+        };
     }
     return undefined;
 }
@@ -179,6 +206,39 @@ function isRetryableFailure(descriptor, error) {
     }
     return true;
 }
+/**
+ * @param {unknown} error
+ * @returns {boolean}
+ */
+function isQuotaFailure(error) {
+    const payloadCode = error && typeof error === "object" && typeof error.code === "string"
+        ? error.code
+        : undefined;
+    const payloadDetails = error && typeof error === "object" && error.details && typeof error.details === "object"
+        ? error.details
+        : undefined;
+    const normalized = toSmithersError(error);
+    const code = payloadCode ?? normalized.code;
+    if (code === "AGENT_QUOTA_EXCEEDED")
+        return true;
+    const details = payloadDetails ?? normalized.details;
+    return Boolean(details && typeof details === "object" && details.failureQuota === true);
+}
+/**
+ * @param {unknown} error
+ * @returns {number | undefined}
+ */
+function getQuotaResetAtMs(error) {
+    const payloadDetails = error && typeof error === "object" && error.details && typeof error.details === "object"
+        ? error.details
+        : undefined;
+    const normalized = toSmithersError(error);
+    const details = payloadDetails ?? normalized.details;
+    if (!details || typeof details !== "object")
+        return undefined;
+    const resetAtMs = details.quotaResetAtMs;
+    return typeof resetAtMs === "number" && Number.isFinite(resetAtMs) ? resetAtMs : undefined;
+}
 /**
  * @param {unknown} error
  * @returns {boolean}
@@ -220,7 +280,11 @@ function describeDeadlock(state) {
             }
             else {
                 const depState = state.states.get(stateKeyFor(dep)) ?? "pending";
-                unmet.push(`'${depId}' (${depState})`);
+                if (depState !== "finished" &&
+                    depState !== "skipped" &&
+                    !(depState === "failed" && dep.continueOnFail)) {
+                    unmet.push(`'${depId}' (${depState})`);
+                }
             }
         }
         if (unmet.length > 0) {
@@ -270,6 +334,8 @@ export function makeWorkflowSession(options = {}) {
         retryWait: new Map(),
         approvals: new Set(),
         ralphState: new Map(options.initialRalphState ?? []),
+        /** @type {Map<string, number>} Maps state key → quota reset timestamp (ms) */
+        quotaResetTimes: new Map(),
         schedule: null,
         cancelled: false,
         lastMountedSignature: null,
@@ -287,14 +353,37 @@ export function makeWorkflowSession(options = {}) {
    * @returns {EngineDecision}
    */
     function finishedResult(status = "finished") {
-        return {
-            _tag: "Finished",
-            result: {
-                runId: state.runId,
-                status,
-                output: [...state.outputs.values()].at(-1)?.output,
-            },
+        /** @type {RunResult} */
+        const result = {
+            runId: state.runId,
+            status,
+            output: [...state.outputs.values()].at(-1)?.output,
         };
+        if (status === "finished") {
+            // At a `finished` terminal, any task still in `failed` state is a
+            // *tolerated* failure — an unhandled one would have produced a `Failed`
+            // decision via unhandledFailureDecision() and never reached here. Those
+            // are exactly the masked children (continueOnFail tasks, transient agent
+            // failures) the binary run status cannot express. Surface them so callers
+            // can detect a run that "succeeded" while children failed. See issue #295
+            // and docs/runtime/run-state.mdx.
+            //
+            // Keys are the canonical task state keys (`nodeId::iteration`), not bare
+            // node ids: a looped/Ralph workflow can fail the same nodeId across
+            // iterations, and the iteration is what disambiguates which child to
+            // inspect.
+            const failedChildKeys = [];
+            for (const [key, taskState] of state.states) {
+                if (taskState === "failed") {
+                    failedChildKeys.push(key);
+                }
+            }
+            if (failedChildKeys.length > 0) {
+                result.failedChildren = failedChildKeys.length;
+                result.failedChildKeys = failedChildKeys;
+            }
+        }
+        return { _tag: "Finished", result };
     }
     /**
    * @returns {ScheduleResult}
@@ -332,6 +421,7 @@ export function makeWorkflowSession(options = {}) {
                 state.approvals.delete(key);
                 state.retryCounts.delete(key);
                 state.failureDescriptors.delete(key);
+                state.quotaResetTimes.delete(key);
             }
         }
         for (const ralph of ralphs) {
@@ -362,14 +452,16 @@ export function makeWorkflowSession(options = {}) {
         state.outputs.set(key, output);
         state.retryWait.delete(key);
         state.failureDescriptors.delete(key);
+        state.quotaResetTimes.delete(key);
     }
     /**
    * @param {number} [iteration]
+   * @param {RenderContext["trigger"]} [trigger]
    * @returns {EngineDecision}
    */
-    function decideAfterOutputChange(iteration) {
+    function decideAfterOutputChange(iteration, trigger) {
         if (options.requireRerenderOnOutputChange) {
-            return { _tag: "ReRender", context: renderContext(state, iteration) };
+            return { _tag: "ReRender", context: renderContext(state, iteration, trigger) };
         }
         return decide();
     }
@@ -400,12 +492,54 @@ export function makeWorkflowSession(options = {}) {
         }
     }
     /**
+   * @param {string} eventName
+   * @param {unknown} payload
+   * @param {string | null} correlationId
+   */
+    function applyEventReceived(eventName, payload, correlationId) {
+        for (const descriptor of state.descriptors.values()) {
+            const key = stateKeyFor(descriptor);
+            const taskState = state.states.get(key);
+            const expected = typeof descriptor.meta?.__eventName === "string"
+                ? descriptor.meta.__eventName
+                : undefined;
+            const expectedCorrelation = typeof descriptor.meta?.__correlationId === "string"
+                ? descriptor.meta.__correlationId
+                : undefined;
+            if (taskState === "waiting-event" &&
+                (!expected || expected === eventName) &&
+                (expectedCorrelation === undefined || expectedCorrelation === correlationId)) {
+                state.states.set(key, "finished");
+                state.outputs.set(key, {
+                    nodeId: descriptor.nodeId,
+                    iteration: descriptor.iteration,
+                    output: payload,
+                });
+            }
+        }
+    }
+    /**
    * @param {TaskDescriptor} descriptor
    * @param {unknown} error
    * @returns {EngineDecision}
    */
     function applyFailure(descriptor, error) {
         const key = stateKeyFor(descriptor);
+        // Quota/usage-limit errors do not consume the task's retry budget.
+        // Instead, put the task into "waiting-quota" so the run can pause
+        // durably and resume cleanly after the provider resets.
+        if (isQuotaFailure(error)) {
+            state.states.set(key, "waiting-quota");
+            state.failures.set(key, error);
+            const resetAtMs = getQuotaResetAtMs(error);
+            if (resetAtMs != null) {
+                state.quotaResetTimes.set(key, resetAtMs);
+            }
+            else {
+                state.quotaResetTimes.delete(key);
+            }
+            return decide();
+        }
         const failureCount = (state.retryCounts.get(key) ?? 0) + 1;
         state.retryCounts.set(key, failureCount);
         const retryable = isRetryableFailure(descriptor, error);
@@ -425,7 +559,11 @@ export function makeWorkflowSession(options = {}) {
         state.states.set(key, "failed");
         state.failures.set(key, error);
         state.failureDescriptors.set(key, descriptor);
-        return decide();
+        return decideAfterOutputChange(descriptor.iteration, {
+            reason: "task-finished",
+            nodeId: descriptor.nodeId,
+            iteration: descriptor.iteration,
+        });
     }
     /**
    * @returns {EngineDecision | null}
@@ -459,11 +597,15 @@ export function makeWorkflowSession(options = {}) {
         };
     }
     /**
+   * @param {number} [depth] recursion depth; guarded at 10 to catch decision cycles
    * @returns {EngineDecision}
    */
     function decide(depth = 0) {
         if (depth > 10) {
-            return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
+            return {
+                _tag: "Failed",
+                error: new SmithersError("SCHEDULER_ERROR", "Exceeded scheduler decide() depth guard.", { depth }),
+            };
         }
         if (state.cancelled) {
             return finishedResult("cancelled");
@@ -623,7 +765,7 @@ export function makeWorkflowSession(options = {}) {
                 advanced = true;
             }
             if (advanced) {
-                return { _tag: "ReRender", context: renderContext(state) };
+                return { _tag: "ReRender", context: renderContext(state, undefined, { reason: "loop-advanced" }) };
             }
         }
         if (schedule.pendingExists) {
@@ -649,7 +791,7 @@ export function makeWorkflowSession(options = {}) {
                     const signature = mountedSignature(state.graph);
                     if (state.lastDeadlockSignature !== signature) {
                         state.lastDeadlockSignature = signature;
-                        return { _tag: "ReRender", context: renderContext(state) };
+                        return { _tag: "ReRender", context: renderContext(state, undefined, { reason: "deadlock-check" }) };
                     }
                 }
                 return {
@@ -670,7 +812,7 @@ export function makeWorkflowSession(options = {}) {
             const signature = mountedSignature(state.graph);
             if (state.lastMountedSignature !== signature) {
                 state.lastMountedSignature = signature;
-                return { _tag: "ReRender", context: renderContext(state) };
+                return { _tag: "ReRender", context: renderContext(state, undefined, { reason: "stability-check" }) };
             }
         }
         return finishedResult();
@@ -686,17 +828,26 @@ export function makeWorkflowSession(options = {}) {
             }
         }),
         taskCompleted: (output) => Effect.sync(() => {
-            const descriptor = findDescriptor(state, output.nodeId, output.iteration);
-            if (!descriptor) {
-                return failedDecision(new SmithersError("NODE_NOT_FOUND", `Unknown task ${output.nodeId}`), "taskCompleted");
-            }
+            // A completion can legitimately arrive for a task that is no longer in the
+            // current graph: a conditionally-rendered task (e.g. `{done ? <Task pr/> : null}`)
+            // whose parent re-rendered it out while it was still running in the background.
+            // That result is stale, not fatal — record it (so it is available if the task
+            // re-mounts) and let the current graph drive the next decision. Failing here
+            // would discard every other in-flight task in the run.
             markTaskFinished(output);
-            return decideAfterOutputChange(output.iteration);
+            return decideAfterOutputChange(output.iteration, {
+                reason: "task-finished",
+                nodeId: output.nodeId,
+                iteration: output.iteration,
+            });
         }),
         taskFailed: (failure) => Effect.sync(() => {
             const descriptor = findDescriptor(state, failure.nodeId, failure.iteration);
             if (!descriptor) {
-                return failedDecision(new SmithersError("NODE_NOT_FOUND", `Unknown task ${failure.nodeId}`), "taskFailed");
+                // Stale failure for a task that already left the graph (see taskCompleted) —
+                // the task is gone, so its failure is moot. Re-decide on the current graph
+                // rather than failing the whole run.
+                return decide();
             }
             return applyFailure(descriptor, failure.error);
         }),
@@ -727,51 +878,11 @@ export function makeWorkflowSession(options = {}) {
             return decide();
         }),
         eventReceived: (eventName, payload, correlationId = null) => Effect.sync(() => {
-            for (const descriptor of state.descriptors.values()) {
-                const key = stateKeyFor(descriptor);
-                const taskState = state.states.get(key);
-                const expected = typeof descriptor.meta?.__eventName === "string"
-                    ? descriptor.meta.__eventName
-                    : undefined;
-                const expectedCorrelation = typeof descriptor.meta?.__correlationId === "string"
-                    ? descriptor.meta.__correlationId
-                    : undefined;
-                if (taskState === "waiting-event" &&
-                    (!expected || expected === eventName) &&
-                    (expectedCorrelation === undefined || expectedCorrelation === correlationId)) {
-                    state.states.set(key, "finished");
-                    state.outputs.set(key, {
-                        nodeId: descriptor.nodeId,
-                        iteration: descriptor.iteration,
-                        output: payload,
-                    });
-                }
-            }
+            applyEventReceived(eventName, payload, correlationId);
             return decide();
         }),
         signalReceived: (signalName, payload, correlationId = null) => Effect.sync(() => {
-            for (const descriptor of state.descriptors.values()) {
-                const key = stateKeyFor(descriptor);
-                const taskState = state.states.get(key);
-                const expected = typeof descriptor.meta?.__signalName === "string"
-                    ? descriptor.meta.__signalName
-                    : typeof descriptor.meta?.__eventName === "string"
-                        ? descriptor.meta.__eventName
-                        : undefined;
-                const expectedCorrelation = typeof descriptor.meta?.__correlationId === "string"
-                    ? descriptor.meta.__correlationId
-                    : undefined;
-                if (taskState === "waiting-event" &&
-                    (!expected || expected === signalName) &&
-                    (expectedCorrelation === undefined || expectedCorrelation === correlationId)) {
-                    state.states.set(key, "finished");
-                    state.outputs.set(key, {
-                        nodeId: descriptor.nodeId,
-                        iteration: descriptor.iteration,
-                        output: payload,
-                    });
-                }
-            }
+            applyEventReceived(signalName, payload, correlationId);
             return decide();
         }),
         timerFired: (nodeId, firedAtMs = nowMs()) => Effect.sync(() => {
@@ -788,7 +899,11 @@ export function makeWorkflowSession(options = {}) {
                 iteration: descriptor.iteration,
                 output: { firedAtMs },
             });
-            return decideAfterOutputChange(descriptor.iteration);
+            return decideAfterOutputChange(descriptor.iteration, {
+                reason: "timer-fired",
+                nodeId: descriptor.nodeId,
+                iteration: descriptor.iteration,
+            });
         }),
         hotReloaded: (graph) => Effect.sync(() => {
             try {
@@ -822,7 +937,11 @@ export function makeWorkflowSession(options = {}) {
                 usage: output.usage ?? null,
                 output: output.output,
             });
-            return decideAfterOutputChange(output.iteration);
+            return decideAfterOutputChange(output.iteration, {
+                reason: "cache-resolved",
+                nodeId: output.nodeId,
+                iteration: output.iteration,
+            });
         }),
         cacheMissed: (nodeId, iteration) => Effect.sync(() => {
             const descriptor = findDescriptor(state, nodeId, iteration);

package/src/scheduleTasks.js CHANGED

@@ -123,6 +123,7 @@ export function scheduleTasks(plan, states, descriptors, ralphState, retryWait,
     }
     /**
    * @param {PlanNode} node
+   * @param {{ includeContinuedFailures?: boolean }} [options]
    * @returns {{ readonly terminal: boolean; readonly failed: boolean }}
    */
     function inspect(node, options = {}) {