@smithers-orchestrator/scheduler 0.24.2 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@smithers-orchestrator/scheduler",
3
- "version": "0.24.2",
3
+ "version": "0.25.1",
4
4
  "description": "Pure decision engine: session, scheduler, and task state management for Smithers workflows",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -176,8 +176,8 @@
176
176
  ],
177
177
  "dependencies": {
178
178
  "effect": "^3.21.1",
179
- "@smithers-orchestrator/errors": "0.24.2",
180
- "@smithers-orchestrator/graph": "0.24.2"
179
+ "@smithers-orchestrator/errors": "0.25.1",
180
+ "@smithers-orchestrator/graph": "0.25.1"
181
181
  },
182
182
  "devDependencies": {
183
183
  "@types/bun": "latest",
@@ -1,6 +1,21 @@
1
1
  import type { WorkflowGraph } from "@smithers-orchestrator/graph";
2
2
  import type { TaskOutput } from "./TaskOutput.ts";
3
3
 
4
+ export type RenderTriggerReason =
5
+ | "task-finished"
6
+ | "timer-fired"
7
+ | "cache-resolved"
8
+ | "loop-advanced"
9
+ | "deadlock-check"
10
+ | "stability-check"
11
+ | (string & {});
12
+
13
+ export type RenderTrigger = {
14
+ readonly reason: RenderTriggerReason;
15
+ readonly nodeId?: string;
16
+ readonly iteration?: number;
17
+ };
18
+
4
19
  export type RenderContext = {
5
20
  readonly runId: string;
6
21
  readonly graph?: WorkflowGraph | null;
@@ -11,4 +26,5 @@ export type RenderContext = {
11
26
  readonly auth?: unknown;
12
27
  readonly taskStates?: unknown;
13
28
  readonly ralphIterations?: ReadonlyMap<string, number>;
29
+ readonly trigger?: RenderTrigger;
14
30
  };
package/src/RunResult.ts CHANGED
@@ -8,8 +8,25 @@ export type RunResult = {
8
8
  | "continued"
9
9
  | "waiting-approval"
10
10
  | "waiting-event"
11
- | "waiting-timer";
11
+ | "waiting-timer"
12
+ | "waiting-quota";
12
13
  readonly output?: unknown;
13
14
  readonly error?: unknown;
14
15
  readonly nextRunId?: string;
16
+ /**
17
+ * Number of tasks that ended in a `failed` state yet did not fail the run —
18
+ * "masked" child failures the run-level status cannot express. Present (and
19
+ * `> 0`) only on a `finished` result that tolerated at least one failure
20
+ * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
21
+ * agent task that failed transiently: rate limit, timeout, abort). A binary
22
+ * `finished` status would otherwise read as a clean success. See
23
+ * `docs/runtime/run-state.mdx`.
24
+ */
25
+ readonly failedChildren?: number;
26
+ /**
27
+ * Task state keys (`nodeId::iteration`) of the tasks counted by
28
+ * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
29
+ * across loop/Ralph iterations.
30
+ */
31
+ readonly failedChildKeys?: readonly string[];
15
32
  };
package/src/TaskState.ts CHANGED
@@ -3,6 +3,7 @@ export type TaskState =
3
3
  | "waiting-approval"
4
4
  | "waiting-event"
5
5
  | "waiting-timer"
6
+ | "waiting-quota"
6
7
  | "in-progress"
7
8
  | "finished"
8
9
  | "failed"
package/src/WaitReason.ts CHANGED
@@ -5,4 +5,9 @@ export type WaitReason =
5
5
  | { readonly _tag: "RetryBackoff"; readonly waitMs: number }
6
6
  | { readonly _tag: "HotReload" }
7
7
  | { readonly _tag: "OrphanRecovery"; readonly count: number }
8
- | { readonly _tag: "ExternalTrigger" };
8
+ | { readonly _tag: "ExternalTrigger" }
9
+ | {
10
+ readonly _tag: "Quota";
11
+ readonly quotaBlockedCount: number;
12
+ readonly resetAtMs?: number;
13
+ };
@@ -2,5 +2,16 @@ import { Layer } from "effect";
2
2
  import { WorkflowSession } from "./WorkflowSession.js";
3
3
  import { makeWorkflowSession } from "./makeWorkflowSession.js";
4
4
 
5
- /** @type {Layer.Layer<WorkflowSession, never, never>} */
5
+ /**
6
+ * WARNING — do not consume this layer as-is. `Layer.sync` builds **one** shared
7
+ * `makeWorkflowSession()` instance for the whole layer scope, but a workflow
8
+ * session carries per-run state, so sharing it across runs is a correctness bug.
9
+ * The engine intentionally bypasses this Tag and constructs a fresh session per
10
+ * run via `makeWorkflowSession()` directly — which is why nothing yields
11
+ * `WorkflowSession` today. Before any consumer reads the Tag, rework this into a
12
+ * per-run/scoped provider (e.g. `Layer.scoped` or a factory service) so each run
13
+ * gets its own session.
14
+ *
15
+ * @type {Layer.Layer<WorkflowSession, never, never>}
16
+ */
6
17
  export const WorkflowSessionLive = Layer.sync(WorkflowSession, makeWorkflowSession);
package/src/index.d.ts CHANGED
@@ -1,9 +1,10 @@
1
1
  import * as effect from 'effect';
2
- import { Context, Layer, Effect, Schedule } from 'effect';
3
- import * as _smithers_graph from '@smithers-orchestrator/graph';
2
+ import { Context, Layer, Effect, Schedule as Schedule$1 } from 'effect';
3
+ import * as _smithers_orchestrator_graph from '@smithers-orchestrator/graph';
4
4
  import { TaskDescriptor as TaskDescriptor$3, WorkflowGraph } from '@smithers-orchestrator/graph';
5
+ import { TaskDescriptor as TaskDescriptor$4 } from '@smithers-orchestrator/graph/TaskDescriptor';
5
6
 
6
- type TaskState$2 = "pending" | "waiting-approval" | "waiting-event" | "waiting-timer" | "in-progress" | "finished" | "failed" | "cancelled" | "skipped";
7
+ type TaskState$2 = "pending" | "waiting-approval" | "waiting-event" | "waiting-timer" | "waiting-quota" | "in-progress" | "finished" | "failed" | "cancelled" | "skipped";
7
8
 
8
9
  type TaskStateMap$4 = Map<string, TaskState$2>;
9
10
 
@@ -112,6 +113,12 @@ type TaskOutput$1 = {
112
113
  readonly usage?: TokenUsage$1 | null;
113
114
  };
114
115
 
116
+ type RenderTriggerReason = "task-finished" | "timer-fired" | "cache-resolved" | "loop-advanced" | "deadlock-check" | "stability-check" | (string & {});
117
+ type RenderTrigger = {
118
+ readonly reason: RenderTriggerReason;
119
+ readonly nodeId?: string;
120
+ readonly iteration?: number;
121
+ };
115
122
  type RenderContext$1 = {
116
123
  readonly runId: string;
117
124
  readonly graph?: WorkflowGraph | null;
@@ -122,14 +129,31 @@ type RenderContext$1 = {
122
129
  readonly auth?: unknown;
123
130
  readonly taskStates?: unknown;
124
131
  readonly ralphIterations?: ReadonlyMap<string, number>;
132
+ readonly trigger?: RenderTrigger;
125
133
  };
126
134
 
127
135
  type RunResult$1 = {
128
136
  readonly runId: string;
129
- readonly status: "running" | "finished" | "failed" | "cancelled" | "continued" | "waiting-approval" | "waiting-event" | "waiting-timer";
137
+ readonly status: "running" | "finished" | "failed" | "cancelled" | "continued" | "waiting-approval" | "waiting-event" | "waiting-timer" | "waiting-quota";
130
138
  readonly output?: unknown;
131
139
  readonly error?: unknown;
132
140
  readonly nextRunId?: string;
141
+ /**
142
+ * Number of tasks that ended in a `failed` state yet did not fail the run —
143
+ * "masked" child failures the run-level status cannot express. Present (and
144
+ * `> 0`) only on a `finished` result that tolerated at least one failure
145
+ * (a {@link https://smithers.sh/components/task `continueOnFail`} task, or an
146
+ * agent task that failed transiently: rate limit, timeout, abort). A binary
147
+ * `finished` status would otherwise read as a clean success. See
148
+ * `docs/runtime/run-state.mdx`.
149
+ */
150
+ readonly failedChildren?: number;
151
+ /**
152
+ * Task state keys (`nodeId::iteration`) of the tasks counted by
153
+ * {@link failedChildren}. The iteration disambiguates the same `nodeId` failing
154
+ * across loop/Ralph iterations.
155
+ */
156
+ readonly failedChildKeys?: readonly string[];
133
157
  };
134
158
 
135
159
  type WaitReason$1 = {
@@ -151,6 +175,10 @@ type WaitReason$1 = {
151
175
  readonly count: number;
152
176
  } | {
153
177
  readonly _tag: "ExternalTrigger";
178
+ } | {
179
+ readonly _tag: "Quota";
180
+ readonly quotaBlockedCount: number;
181
+ readonly resetAtMs?: number;
154
182
  };
155
183
 
156
184
  type EngineDecision$1 = {
@@ -199,7 +227,8 @@ type WorkflowSessionService$2 = {
199
227
  readonly getCurrentGraph: () => Effect.Effect<WorkflowGraph | null>;
200
228
  };
201
229
 
202
- type AspectBudgetBreach$1 = {
230
+ /** A breached Aspects budget for a task that is about to be dispatched. */
231
+ type AspectBudgetBreach = {
203
232
  readonly kind: "tokens" | "latency";
204
233
  readonly limit: number;
205
234
  readonly current: number;
@@ -214,9 +243,16 @@ type WorkflowSessionOptions$2 = {
214
243
  readonly iteration: number;
215
244
  readonly done: boolean;
216
245
  }>;
217
- readonly evaluateAspectBudget?: (descriptor: TaskDescriptor$3) => AspectBudgetBreach$1 | null | undefined;
218
- readonly onAspectBudgetSkip?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
219
- readonly onAspectBudgetWarn?: (descriptor: TaskDescriptor$3, breach: AspectBudgetBreach$1) => void;
246
+ /**
247
+ * Evaluate a runnable task's Aspects budgets against the run's accumulated
248
+ * usage. Return the first breach, or `null`/`undefined` when within budget.
249
+ * Only invoked for tasks that would otherwise execute.
250
+ */
251
+ readonly evaluateAspectBudget?: (descriptor: TaskDescriptor$4) => AspectBudgetBreach | null | undefined;
252
+ /** Called when a task is skipped because its budget was exceeded (`skip-remaining`). */
253
+ readonly onAspectBudgetSkip?: (descriptor: TaskDescriptor$4, breach: AspectBudgetBreach) => void;
254
+ /** Called when a task continues despite an exceeded budget (`warn`). */
255
+ readonly onAspectBudgetWarn?: (descriptor: TaskDescriptor$4, breach: AspectBudgetBreach) => void;
220
256
  };
221
257
 
222
258
  type TaskRecord$1 = {
@@ -227,10 +263,10 @@ type TaskRecord$1 = {
227
263
  readonly updatedAtMs: number;
228
264
  };
229
265
 
230
- type SmithersAlertSeverity = "info" | "warning" | "critical";
231
- type SmithersAlertLabels = Record<string, string>;
232
- type SmithersAlertReactionKind = "emit-only" | "pause" | "cancel" | "open-approval" | "deliver";
233
- type SmithersAlertReaction = {
266
+ type SmithersAlertSeverity$1 = "info" | "warning" | "critical";
267
+ type SmithersAlertLabels$1 = Record<string, string>;
268
+ type SmithersAlertReactionKind$1 = "emit-only" | "pause" | "cancel" | "open-approval" | "deliver";
269
+ type SmithersAlertReaction$1 = {
234
270
  kind: "emit-only";
235
271
  } | {
236
272
  kind: "pause";
@@ -242,24 +278,24 @@ type SmithersAlertReaction = {
242
278
  kind: "deliver";
243
279
  destination: string;
244
280
  };
245
- type SmithersAlertReactionRef = string | SmithersAlertReaction;
246
- type SmithersAlertPolicyDefaults = {
281
+ type SmithersAlertReactionRef$1 = string | SmithersAlertReaction$1;
282
+ type SmithersAlertPolicyDefaults$1 = {
247
283
  owner?: string;
248
- severity?: SmithersAlertSeverity;
284
+ severity?: SmithersAlertSeverity$1;
249
285
  runbook?: string;
250
- labels?: SmithersAlertLabels;
286
+ labels?: SmithersAlertLabels$1;
251
287
  };
252
- type SmithersAlertPolicyRule = SmithersAlertPolicyDefaults & {
288
+ type SmithersAlertPolicyRule$1 = SmithersAlertPolicyDefaults$1 & {
253
289
  afterMs?: number;
254
- reaction?: SmithersAlertReactionRef;
290
+ reaction?: SmithersAlertReactionRef$1;
255
291
  };
256
- type SmithersAlertPolicy = {
257
- defaults?: SmithersAlertPolicyDefaults;
258
- rules?: Record<string, SmithersAlertPolicyRule>;
259
- reactions?: Record<string, SmithersAlertReaction>;
292
+ type SmithersAlertPolicy$1 = {
293
+ defaults?: SmithersAlertPolicyDefaults$1;
294
+ rules?: Record<string, SmithersAlertPolicyRule$1>;
295
+ reactions?: Record<string, SmithersAlertReaction$1>;
260
296
  };
261
297
  type SmithersWorkflowOptions$1 = {
262
- alertPolicy?: SmithersAlertPolicy;
298
+ alertPolicy?: SmithersAlertPolicy$1;
263
299
  cache?: boolean;
264
300
  workflowHash?: string;
265
301
  };
@@ -281,7 +317,7 @@ type RalphState$1 = {
281
317
 
282
318
  type RalphStateMap$4 = Map<string, RalphState$1>;
283
319
 
284
- type CachePolicy$1<Ctx = any> = {
320
+ type CachePolicy$1<Ctx = unknown> = {
285
321
  by?: (ctx: Ctx) => unknown;
286
322
  version?: string;
287
323
  key?: string;
@@ -324,12 +360,12 @@ type TaskStateMap$3 = TaskStateMap$4;
324
360
  * @returns {boolean}
325
361
  */
326
362
  declare function isTerminalState(state: TaskState$1, descriptor?: Pick<TaskDescriptor$2, "continueOnFail">): boolean;
327
- type TaskDescriptor$2 = _smithers_graph.TaskDescriptor;
363
+ type TaskDescriptor$2 = _smithers_orchestrator_graph.TaskDescriptor;
328
364
  type TaskState$1 = TaskState$2;
329
365
 
330
366
  declare class Scheduler extends Context.TagClassShape<"Scheduler", SchedulerService> {
331
367
  }
332
- type TaskDescriptor$1 = _smithers_graph.TaskDescriptor;
368
+ type TaskDescriptor$1 = _smithers_orchestrator_graph.TaskDescriptor;
333
369
  type TaskStateMap$2 = TaskStateMap$4;
334
370
  type PlanNode$3 = PlanNode$4;
335
371
  type RalphStateMap$3 = RalphStateMap$4;
@@ -354,7 +390,7 @@ declare function buildPlanTree(xml: XmlNode | null, ralphState?: RalphStateMap$2
354
390
  type PlanNode$2 = PlanNode$4;
355
391
  type RalphMeta$1 = RalphMeta$2;
356
392
  type RalphStateMap$2 = RalphStateMap$4;
357
- type XmlNode = _smithers_graph.XmlNode;
393
+ type XmlNode = _smithers_orchestrator_graph.XmlNode;
358
394
 
359
395
  /**
360
396
  * @param {PlanNode | null} plan
@@ -370,7 +406,7 @@ type PlanNode$1 = PlanNode$4;
370
406
  type RalphStateMap$1 = RalphStateMap$4;
371
407
  type RetryWaitMap$1 = RetryWaitMap$3;
372
408
  type ScheduleResult$1 = ScheduleResult$3;
373
- type TaskDescriptor = _smithers_graph.TaskDescriptor;
409
+ type TaskDescriptor = _smithers_orchestrator_graph.TaskDescriptor;
374
410
  type TaskStateMap$1 = TaskStateMap$4;
375
411
 
376
412
  declare class WorkflowSession extends Context.TagClassShape<"WorkflowSession", WorkflowSessionService$2> {
@@ -384,7 +420,18 @@ declare function makeWorkflowSession(options?: WorkflowSessionOptions$1): Workfl
384
420
  type WorkflowSessionOptions$1 = WorkflowSessionOptions$2;
385
421
  type WorkflowSessionService$1 = WorkflowSessionService$2;
386
422
 
387
- /** @type {Layer.Layer<WorkflowSession, never, never>} */
423
+ /**
424
+ * WARNING — do not consume this layer as-is. `Layer.sync` builds **one** shared
425
+ * `makeWorkflowSession()` instance for the whole layer scope, but a workflow
426
+ * session carries per-run state, so sharing it across runs is a correctness bug.
427
+ * The engine intentionally bypasses this Tag and constructs a fresh session per
428
+ * run via `makeWorkflowSession()` directly — which is why nothing yields
429
+ * `WorkflowSession` today. Before any consumer reads the Tag, rework this into a
430
+ * per-run/scoped provider (e.g. `Layer.scoped` or a factory service) so each run
431
+ * gets its own session.
432
+ *
433
+ * @type {Layer.Layer<WorkflowSession, never, never>}
434
+ */
388
435
  declare const WorkflowSessionLive: Layer.Layer<WorkflowSession, never, never>;
389
436
 
390
437
  /**
@@ -398,7 +445,7 @@ declare function nowMs(): number;
398
445
  * @param {RetryPolicy} policy
399
446
  * @returns {Schedule.Schedule<unknown>}
400
447
  */
401
- declare function retryPolicyToSchedule(policy: RetryPolicy$2): Schedule.Schedule<unknown>;
448
+ declare function retryPolicyToSchedule(policy: RetryPolicy$2): Schedule$1.Schedule<unknown>;
402
449
  type RetryPolicy$2 = RetryPolicy$3;
403
450
 
404
451
  /**
@@ -434,6 +481,14 @@ type RetryWaitMap = RetryWaitMap$3;
434
481
  type RunResult = RunResult$1;
435
482
  type ScheduleResult = ScheduleResult$3;
436
483
  type ScheduleSnapshot = ScheduleSnapshot$1;
484
+ type SmithersAlertLabels = SmithersAlertLabels$1;
485
+ type SmithersAlertPolicy = SmithersAlertPolicy$1;
486
+ type SmithersAlertPolicyDefaults = SmithersAlertPolicyDefaults$1;
487
+ type SmithersAlertPolicyRule = SmithersAlertPolicyRule$1;
488
+ type SmithersAlertReaction = SmithersAlertReaction$1;
489
+ type SmithersAlertReactionKind = SmithersAlertReactionKind$1;
490
+ type SmithersAlertReactionRef = SmithersAlertReactionRef$1;
491
+ type SmithersAlertSeverity = SmithersAlertSeverity$1;
437
492
  type SmithersWorkflowOptions = SmithersWorkflowOptions$1;
438
493
  type TaskFailure = TaskFailure$1;
439
494
  type TaskOutput = TaskOutput$1;
@@ -65,9 +65,10 @@ function mountedSignature(graph) {
65
65
  /**
66
66
  * @param {SessionState} state
67
67
  * @param {number} [iterationOverride]
68
+ * @param {RenderContext["trigger"]} [trigger]
68
69
  * @returns {RenderContext}
69
70
  */
70
- function renderContext(state, iterationOverride) {
71
+ function renderContext(state, iterationOverride, trigger) {
71
72
  const ralphIterations = [...state.ralphState.values()].map((value) => value.iteration);
72
73
  return {
73
74
  runId: state.runId,
@@ -77,6 +78,7 @@ function renderContext(state, iterationOverride) {
77
78
  taskStates: cloneTaskStateMap(state.states),
78
79
  outputs: new Map(state.outputs),
79
80
  ralphIterations: new Map([...state.ralphState.entries()].map(([id, value]) => [id, value.iteration])),
81
+ ...(trigger ? { trigger } : {}),
80
82
  };
81
83
  }
82
84
  /**
@@ -85,23 +87,48 @@ function renderContext(state, iterationOverride) {
85
87
  * @returns {WaitReason | undefined}
86
88
  */
87
89
  function findWaitingReason(state, currentTimeMs) {
90
+ // Do a full pass to accumulate quota count and find the highest-priority
91
+ // non-quota wait reason. This prevents an early-return from shadowing
92
+ // quota-blocked tasks when mixed wait types coexist in the same run.
93
+ let primaryReason;
94
+ let quotaBlockedCount = 0;
95
+ let earliestQuotaResetAtMs;
88
96
  for (const descriptor of state.descriptors.values()) {
89
97
  const taskState = state.states.get(stateKeyFor(descriptor));
90
- if (taskState === "waiting-approval") {
91
- return { _tag: "Approval", nodeId: descriptor.nodeId };
98
+ if (taskState === "waiting-approval" && !primaryReason) {
99
+ primaryReason = { _tag: "Approval", nodeId: descriptor.nodeId };
92
100
  }
93
- if (taskState === "waiting-event") {
101
+ else if (taskState === "waiting-event" && !primaryReason) {
94
102
  const eventName = typeof descriptor.meta?.__eventName === "string"
95
103
  ? descriptor.meta.__eventName
96
104
  : "";
97
- return { _tag: "Event", eventName };
105
+ primaryReason = { _tag: "Event", eventName };
98
106
  }
99
- if (taskState === "waiting-timer") {
100
- return {
107
+ else if (taskState === "waiting-timer" && !primaryReason) {
108
+ primaryReason = {
101
109
  _tag: "Timer",
102
110
  resumeAtMs: timerResumeAtMs(descriptor, currentTimeMs),
103
111
  };
104
112
  }
113
+ else if (taskState === "waiting-quota") {
114
+ quotaBlockedCount += 1;
115
+ const resetAtMs = state.quotaResetTimes.get(stateKeyFor(descriptor));
116
+ if (resetAtMs != null) {
117
+ earliestQuotaResetAtMs = earliestQuotaResetAtMs == null
118
+ ? resetAtMs
119
+ : Math.min(earliestQuotaResetAtMs, resetAtMs);
120
+ }
121
+ }
122
+ }
123
+ if (primaryReason) {
124
+ return primaryReason;
125
+ }
126
+ if (quotaBlockedCount > 0) {
127
+ return {
128
+ _tag: "Quota",
129
+ quotaBlockedCount,
130
+ ...(earliestQuotaResetAtMs != null ? { resetAtMs: earliestQuotaResetAtMs } : {}),
131
+ };
105
132
  }
106
133
  return undefined;
107
134
  }
@@ -179,6 +206,39 @@ function isRetryableFailure(descriptor, error) {
179
206
  }
180
207
  return true;
181
208
  }
209
+ /**
210
+ * @param {unknown} error
211
+ * @returns {boolean}
212
+ */
213
+ function isQuotaFailure(error) {
214
+ const payloadCode = error && typeof error === "object" && typeof error.code === "string"
215
+ ? error.code
216
+ : undefined;
217
+ const payloadDetails = error && typeof error === "object" && error.details && typeof error.details === "object"
218
+ ? error.details
219
+ : undefined;
220
+ const normalized = toSmithersError(error);
221
+ const code = payloadCode ?? normalized.code;
222
+ if (code === "AGENT_QUOTA_EXCEEDED")
223
+ return true;
224
+ const details = payloadDetails ?? normalized.details;
225
+ return Boolean(details && typeof details === "object" && details.failureQuota === true);
226
+ }
227
+ /**
228
+ * @param {unknown} error
229
+ * @returns {number | undefined}
230
+ */
231
+ function getQuotaResetAtMs(error) {
232
+ const payloadDetails = error && typeof error === "object" && error.details && typeof error.details === "object"
233
+ ? error.details
234
+ : undefined;
235
+ const normalized = toSmithersError(error);
236
+ const details = payloadDetails ?? normalized.details;
237
+ if (!details || typeof details !== "object")
238
+ return undefined;
239
+ const resetAtMs = details.quotaResetAtMs;
240
+ return typeof resetAtMs === "number" && Number.isFinite(resetAtMs) ? resetAtMs : undefined;
241
+ }
182
242
  /**
183
243
  * @param {unknown} error
184
244
  * @returns {boolean}
@@ -220,7 +280,11 @@ function describeDeadlock(state) {
220
280
  }
221
281
  else {
222
282
  const depState = state.states.get(stateKeyFor(dep)) ?? "pending";
223
- unmet.push(`'${depId}' (${depState})`);
283
+ if (depState !== "finished" &&
284
+ depState !== "skipped" &&
285
+ !(depState === "failed" && dep.continueOnFail)) {
286
+ unmet.push(`'${depId}' (${depState})`);
287
+ }
224
288
  }
225
289
  }
226
290
  if (unmet.length > 0) {
@@ -270,6 +334,8 @@ export function makeWorkflowSession(options = {}) {
270
334
  retryWait: new Map(),
271
335
  approvals: new Set(),
272
336
  ralphState: new Map(options.initialRalphState ?? []),
337
+ /** @type {Map<string, number>} Maps state key → quota reset timestamp (ms) */
338
+ quotaResetTimes: new Map(),
273
339
  schedule: null,
274
340
  cancelled: false,
275
341
  lastMountedSignature: null,
@@ -287,14 +353,37 @@ export function makeWorkflowSession(options = {}) {
287
353
  * @returns {EngineDecision}
288
354
  */
289
355
  function finishedResult(status = "finished") {
290
- return {
291
- _tag: "Finished",
292
- result: {
293
- runId: state.runId,
294
- status,
295
- output: [...state.outputs.values()].at(-1)?.output,
296
- },
356
+ /** @type {RunResult} */
357
+ const result = {
358
+ runId: state.runId,
359
+ status,
360
+ output: [...state.outputs.values()].at(-1)?.output,
297
361
  };
362
+ if (status === "finished") {
363
+ // At a `finished` terminal, any task still in `failed` state is a
364
+ // *tolerated* failure — an unhandled one would have produced a `Failed`
365
+ // decision via unhandledFailureDecision() and never reached here. Those
366
+ // are exactly the masked children (continueOnFail tasks, transient agent
367
+ // failures) the binary run status cannot express. Surface them so callers
368
+ // can detect a run that "succeeded" while children failed. See issue #295
369
+ // and docs/runtime/run-state.mdx.
370
+ //
371
+ // Keys are the canonical task state keys (`nodeId::iteration`), not bare
372
+ // node ids: a looped/Ralph workflow can fail the same nodeId across
373
+ // iterations, and the iteration is what disambiguates which child to
374
+ // inspect.
375
+ const failedChildKeys = [];
376
+ for (const [key, taskState] of state.states) {
377
+ if (taskState === "failed") {
378
+ failedChildKeys.push(key);
379
+ }
380
+ }
381
+ if (failedChildKeys.length > 0) {
382
+ result.failedChildren = failedChildKeys.length;
383
+ result.failedChildKeys = failedChildKeys;
384
+ }
385
+ }
386
+ return { _tag: "Finished", result };
298
387
  }
299
388
  /**
300
389
  * @returns {ScheduleResult}
@@ -332,6 +421,7 @@ export function makeWorkflowSession(options = {}) {
332
421
  state.approvals.delete(key);
333
422
  state.retryCounts.delete(key);
334
423
  state.failureDescriptors.delete(key);
424
+ state.quotaResetTimes.delete(key);
335
425
  }
336
426
  }
337
427
  for (const ralph of ralphs) {
@@ -362,14 +452,16 @@ export function makeWorkflowSession(options = {}) {
362
452
  state.outputs.set(key, output);
363
453
  state.retryWait.delete(key);
364
454
  state.failureDescriptors.delete(key);
455
+ state.quotaResetTimes.delete(key);
365
456
  }
366
457
  /**
367
458
  * @param {number} [iteration]
459
+ * @param {RenderContext["trigger"]} [trigger]
368
460
  * @returns {EngineDecision}
369
461
  */
370
- function decideAfterOutputChange(iteration) {
462
+ function decideAfterOutputChange(iteration, trigger) {
371
463
  if (options.requireRerenderOnOutputChange) {
372
- return { _tag: "ReRender", context: renderContext(state, iteration) };
464
+ return { _tag: "ReRender", context: renderContext(state, iteration, trigger) };
373
465
  }
374
466
  return decide();
375
467
  }
@@ -400,12 +492,54 @@ export function makeWorkflowSession(options = {}) {
400
492
  }
401
493
  }
402
494
  /**
495
+ * @param {string} eventName
496
+ * @param {unknown} payload
497
+ * @param {string | null} correlationId
498
+ */
499
+ function applyEventReceived(eventName, payload, correlationId) {
500
+ for (const descriptor of state.descriptors.values()) {
501
+ const key = stateKeyFor(descriptor);
502
+ const taskState = state.states.get(key);
503
+ const expected = typeof descriptor.meta?.__eventName === "string"
504
+ ? descriptor.meta.__eventName
505
+ : undefined;
506
+ const expectedCorrelation = typeof descriptor.meta?.__correlationId === "string"
507
+ ? descriptor.meta.__correlationId
508
+ : undefined;
509
+ if (taskState === "waiting-event" &&
510
+ (!expected || expected === eventName) &&
511
+ (expectedCorrelation === undefined || expectedCorrelation === correlationId)) {
512
+ state.states.set(key, "finished");
513
+ state.outputs.set(key, {
514
+ nodeId: descriptor.nodeId,
515
+ iteration: descriptor.iteration,
516
+ output: payload,
517
+ });
518
+ }
519
+ }
520
+ }
521
+ /**
403
522
  * @param {TaskDescriptor} descriptor
404
523
  * @param {unknown} error
405
524
  * @returns {EngineDecision}
406
525
  */
407
526
  function applyFailure(descriptor, error) {
408
527
  const key = stateKeyFor(descriptor);
528
+ // Quota/usage-limit errors do not consume the task's retry budget.
529
+ // Instead, put the task into "waiting-quota" so the run can pause
530
+ // durably and resume cleanly after the provider resets.
531
+ if (isQuotaFailure(error)) {
532
+ state.states.set(key, "waiting-quota");
533
+ state.failures.set(key, error);
534
+ const resetAtMs = getQuotaResetAtMs(error);
535
+ if (resetAtMs != null) {
536
+ state.quotaResetTimes.set(key, resetAtMs);
537
+ }
538
+ else {
539
+ state.quotaResetTimes.delete(key);
540
+ }
541
+ return decide();
542
+ }
409
543
  const failureCount = (state.retryCounts.get(key) ?? 0) + 1;
410
544
  state.retryCounts.set(key, failureCount);
411
545
  const retryable = isRetryableFailure(descriptor, error);
@@ -425,7 +559,11 @@ export function makeWorkflowSession(options = {}) {
425
559
  state.states.set(key, "failed");
426
560
  state.failures.set(key, error);
427
561
  state.failureDescriptors.set(key, descriptor);
428
- return decide();
562
+ return decideAfterOutputChange(descriptor.iteration, {
563
+ reason: "task-finished",
564
+ nodeId: descriptor.nodeId,
565
+ iteration: descriptor.iteration,
566
+ });
429
567
  }
430
568
  /**
431
569
  * @returns {EngineDecision | null}
@@ -459,11 +597,15 @@ export function makeWorkflowSession(options = {}) {
459
597
  };
460
598
  }
461
599
  /**
600
+ * @param {number} [depth] recursion depth; guarded at 10 to catch decision cycles
462
601
  * @returns {EngineDecision}
463
602
  */
464
603
  function decide(depth = 0) {
465
604
  if (depth > 10) {
466
- return { _tag: "Wait", reason: { _tag: "ExternalTrigger" } };
605
+ return {
606
+ _tag: "Failed",
607
+ error: new SmithersError("SCHEDULER_ERROR", "Exceeded scheduler decide() depth guard.", { depth }),
608
+ };
467
609
  }
468
610
  if (state.cancelled) {
469
611
  return finishedResult("cancelled");
@@ -623,7 +765,7 @@ export function makeWorkflowSession(options = {}) {
623
765
  advanced = true;
624
766
  }
625
767
  if (advanced) {
626
- return { _tag: "ReRender", context: renderContext(state) };
768
+ return { _tag: "ReRender", context: renderContext(state, undefined, { reason: "loop-advanced" }) };
627
769
  }
628
770
  }
629
771
  if (schedule.pendingExists) {
@@ -649,7 +791,7 @@ export function makeWorkflowSession(options = {}) {
649
791
  const signature = mountedSignature(state.graph);
650
792
  if (state.lastDeadlockSignature !== signature) {
651
793
  state.lastDeadlockSignature = signature;
652
- return { _tag: "ReRender", context: renderContext(state) };
794
+ return { _tag: "ReRender", context: renderContext(state, undefined, { reason: "deadlock-check" }) };
653
795
  }
654
796
  }
655
797
  return {
@@ -670,7 +812,7 @@ export function makeWorkflowSession(options = {}) {
670
812
  const signature = mountedSignature(state.graph);
671
813
  if (state.lastMountedSignature !== signature) {
672
814
  state.lastMountedSignature = signature;
673
- return { _tag: "ReRender", context: renderContext(state) };
815
+ return { _tag: "ReRender", context: renderContext(state, undefined, { reason: "stability-check" }) };
674
816
  }
675
817
  }
676
818
  return finishedResult();
@@ -686,17 +828,26 @@ export function makeWorkflowSession(options = {}) {
686
828
  }
687
829
  }),
688
830
  taskCompleted: (output) => Effect.sync(() => {
689
- const descriptor = findDescriptor(state, output.nodeId, output.iteration);
690
- if (!descriptor) {
691
- return failedDecision(new SmithersError("NODE_NOT_FOUND", `Unknown task ${output.nodeId}`), "taskCompleted");
692
- }
831
+ // A completion can legitimately arrive for a task that is no longer in the
832
+ // current graph: a conditionally-rendered task (e.g. `{done ? <Task pr/> : null}`)
833
+ // whose parent re-rendered it out while it was still running in the background.
834
+ // That result is stale, not fatal — record it (so it is available if the task
835
+ // re-mounts) and let the current graph drive the next decision. Failing here
836
+ // would discard every other in-flight task in the run.
693
837
  markTaskFinished(output);
694
- return decideAfterOutputChange(output.iteration);
838
+ return decideAfterOutputChange(output.iteration, {
839
+ reason: "task-finished",
840
+ nodeId: output.nodeId,
841
+ iteration: output.iteration,
842
+ });
695
843
  }),
696
844
  taskFailed: (failure) => Effect.sync(() => {
697
845
  const descriptor = findDescriptor(state, failure.nodeId, failure.iteration);
698
846
  if (!descriptor) {
699
- return failedDecision(new SmithersError("NODE_NOT_FOUND", `Unknown task ${failure.nodeId}`), "taskFailed");
847
+ // Stale failure for a task that already left the graph (see taskCompleted)
848
+ // the task is gone, so its failure is moot. Re-decide on the current graph
849
+ // rather than failing the whole run.
850
+ return decide();
700
851
  }
701
852
  return applyFailure(descriptor, failure.error);
702
853
  }),
@@ -727,51 +878,11 @@ export function makeWorkflowSession(options = {}) {
727
878
  return decide();
728
879
  }),
729
880
  eventReceived: (eventName, payload, correlationId = null) => Effect.sync(() => {
730
- for (const descriptor of state.descriptors.values()) {
731
- const key = stateKeyFor(descriptor);
732
- const taskState = state.states.get(key);
733
- const expected = typeof descriptor.meta?.__eventName === "string"
734
- ? descriptor.meta.__eventName
735
- : undefined;
736
- const expectedCorrelation = typeof descriptor.meta?.__correlationId === "string"
737
- ? descriptor.meta.__correlationId
738
- : undefined;
739
- if (taskState === "waiting-event" &&
740
- (!expected || expected === eventName) &&
741
- (expectedCorrelation === undefined || expectedCorrelation === correlationId)) {
742
- state.states.set(key, "finished");
743
- state.outputs.set(key, {
744
- nodeId: descriptor.nodeId,
745
- iteration: descriptor.iteration,
746
- output: payload,
747
- });
748
- }
749
- }
881
+ applyEventReceived(eventName, payload, correlationId);
750
882
  return decide();
751
883
  }),
752
884
  signalReceived: (signalName, payload, correlationId = null) => Effect.sync(() => {
753
- for (const descriptor of state.descriptors.values()) {
754
- const key = stateKeyFor(descriptor);
755
- const taskState = state.states.get(key);
756
- const expected = typeof descriptor.meta?.__signalName === "string"
757
- ? descriptor.meta.__signalName
758
- : typeof descriptor.meta?.__eventName === "string"
759
- ? descriptor.meta.__eventName
760
- : undefined;
761
- const expectedCorrelation = typeof descriptor.meta?.__correlationId === "string"
762
- ? descriptor.meta.__correlationId
763
- : undefined;
764
- if (taskState === "waiting-event" &&
765
- (!expected || expected === signalName) &&
766
- (expectedCorrelation === undefined || expectedCorrelation === correlationId)) {
767
- state.states.set(key, "finished");
768
- state.outputs.set(key, {
769
- nodeId: descriptor.nodeId,
770
- iteration: descriptor.iteration,
771
- output: payload,
772
- });
773
- }
774
- }
885
+ applyEventReceived(signalName, payload, correlationId);
775
886
  return decide();
776
887
  }),
777
888
  timerFired: (nodeId, firedAtMs = nowMs()) => Effect.sync(() => {
@@ -788,7 +899,11 @@ export function makeWorkflowSession(options = {}) {
788
899
  iteration: descriptor.iteration,
789
900
  output: { firedAtMs },
790
901
  });
791
- return decideAfterOutputChange(descriptor.iteration);
902
+ return decideAfterOutputChange(descriptor.iteration, {
903
+ reason: "timer-fired",
904
+ nodeId: descriptor.nodeId,
905
+ iteration: descriptor.iteration,
906
+ });
792
907
  }),
793
908
  hotReloaded: (graph) => Effect.sync(() => {
794
909
  try {
@@ -822,7 +937,11 @@ export function makeWorkflowSession(options = {}) {
822
937
  usage: output.usage ?? null,
823
938
  output: output.output,
824
939
  });
825
- return decideAfterOutputChange(output.iteration);
940
+ return decideAfterOutputChange(output.iteration, {
941
+ reason: "cache-resolved",
942
+ nodeId: output.nodeId,
943
+ iteration: output.iteration,
944
+ });
826
945
  }),
827
946
  cacheMissed: (nodeId, iteration) => Effect.sync(() => {
828
947
  const descriptor = findDescriptor(state, nodeId, iteration);
@@ -123,6 +123,7 @@ export function scheduleTasks(plan, states, descriptors, ralphState, retryWait,
123
123
  }
124
124
  /**
125
125
  * @param {PlanNode} node
126
+ * @param {{ includeContinuedFailures?: boolean }} [options]
126
127
  * @returns {{ readonly terminal: boolean; readonly failed: boolean }}
127
128
  */
128
129
  function inspect(node, options = {}) {