@oni.bot/core 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. package/CHANGELOG.md +146 -126
  2. package/SECURITY.md +2 -2
  3. package/dist/agents/context.d.ts +0 -2
  4. package/dist/agents/context.d.ts.map +1 -1
  5. package/dist/agents/context.js +1 -3
  6. package/dist/agents/context.js.map +1 -1
  7. package/dist/agents/define-agent.d.ts.map +1 -1
  8. package/dist/agents/define-agent.js +99 -41
  9. package/dist/agents/define-agent.js.map +1 -1
  10. package/dist/agents/functional-agent.d.ts.map +1 -1
  11. package/dist/agents/functional-agent.js +0 -7
  12. package/dist/agents/functional-agent.js.map +1 -1
  13. package/dist/agents/types.d.ts +0 -2
  14. package/dist/agents/types.d.ts.map +1 -1
  15. package/dist/checkpoint.d.ts.map +1 -1
  16. package/dist/checkpoint.js +7 -2
  17. package/dist/checkpoint.js.map +1 -1
  18. package/dist/checkpointers/postgres.d.ts.map +1 -1
  19. package/dist/checkpointers/postgres.js +47 -31
  20. package/dist/checkpointers/postgres.js.map +1 -1
  21. package/dist/checkpointers/sqlite.js +4 -4
  22. package/dist/checkpointers/sqlite.js.map +1 -1
  23. package/dist/circuit-breaker.d.ts +1 -0
  24. package/dist/circuit-breaker.d.ts.map +1 -1
  25. package/dist/circuit-breaker.js +13 -0
  26. package/dist/circuit-breaker.js.map +1 -1
  27. package/dist/cli/dev.d.ts.map +1 -1
  28. package/dist/cli/dev.js +0 -1
  29. package/dist/cli/dev.js.map +1 -1
  30. package/dist/cli/router.js +1 -1
  31. package/dist/cli/run.d.ts.map +1 -1
  32. package/dist/cli/run.js +0 -1
  33. package/dist/cli/run.js.map +1 -1
  34. package/dist/config/loader.d.ts +1 -1
  35. package/dist/config/loader.d.ts.map +1 -1
  36. package/dist/config/loader.js +35 -5
  37. package/dist/config/loader.js.map +1 -1
  38. package/dist/context.d.ts +20 -0
  39. package/dist/context.d.ts.map +1 -1
  40. package/dist/context.js.map +1 -1
  41. package/dist/coordination/pubsub.d.ts +1 -0
  42. package/dist/coordination/pubsub.d.ts.map +1 -1
  43. package/dist/coordination/pubsub.js +31 -16
  44. package/dist/coordination/pubsub.js.map +1 -1
  45. package/dist/coordination/request-reply.d.ts +7 -0
  46. package/dist/coordination/request-reply.d.ts.map +1 -1
  47. package/dist/coordination/request-reply.js +65 -11
  48. package/dist/coordination/request-reply.js.map +1 -1
  49. package/dist/events/bus.d.ts +1 -0
  50. package/dist/events/bus.d.ts.map +1 -1
  51. package/dist/events/bus.js +16 -10
  52. package/dist/events/bus.js.map +1 -1
  53. package/dist/functional.d.ts.map +1 -1
  54. package/dist/functional.js +3 -0
  55. package/dist/functional.js.map +1 -1
  56. package/dist/graph.d.ts.map +1 -1
  57. package/dist/graph.js +21 -3
  58. package/dist/graph.js.map +1 -1
  59. package/dist/guardrails/audit.d.ts +4 -1
  60. package/dist/guardrails/audit.d.ts.map +1 -1
  61. package/dist/guardrails/audit.js +18 -1
  62. package/dist/guardrails/audit.js.map +1 -1
  63. package/dist/harness/agent-loop.d.ts.map +1 -1
  64. package/dist/harness/agent-loop.js +471 -352
  65. package/dist/harness/agent-loop.js.map +1 -1
  66. package/dist/harness/context-compactor.d.ts +1 -0
  67. package/dist/harness/context-compactor.d.ts.map +1 -1
  68. package/dist/harness/context-compactor.js +43 -1
  69. package/dist/harness/context-compactor.js.map +1 -1
  70. package/dist/harness/harness.d.ts +6 -0
  71. package/dist/harness/harness.d.ts.map +1 -1
  72. package/dist/harness/harness.js +32 -5
  73. package/dist/harness/harness.js.map +1 -1
  74. package/dist/harness/hooks-engine.d.ts.map +1 -1
  75. package/dist/harness/hooks-engine.js +12 -10
  76. package/dist/harness/hooks-engine.js.map +1 -1
  77. package/dist/harness/index.d.ts +3 -1
  78. package/dist/harness/index.d.ts.map +1 -1
  79. package/dist/harness/index.js +2 -0
  80. package/dist/harness/index.js.map +1 -1
  81. package/dist/harness/memory-loader.d.ts +150 -0
  82. package/dist/harness/memory-loader.d.ts.map +1 -0
  83. package/dist/harness/memory-loader.js +714 -0
  84. package/dist/harness/memory-loader.js.map +1 -0
  85. package/dist/harness/safety-gate.d.ts.map +1 -1
  86. package/dist/harness/safety-gate.js +47 -26
  87. package/dist/harness/safety-gate.js.map +1 -1
  88. package/dist/harness/skill-loader.d.ts +7 -0
  89. package/dist/harness/skill-loader.d.ts.map +1 -1
  90. package/dist/harness/skill-loader.js +24 -8
  91. package/dist/harness/skill-loader.js.map +1 -1
  92. package/dist/harness/todo-module.d.ts.map +1 -1
  93. package/dist/harness/todo-module.js +13 -6
  94. package/dist/harness/todo-module.js.map +1 -1
  95. package/dist/harness/types.d.ts +7 -0
  96. package/dist/harness/types.d.ts.map +1 -1
  97. package/dist/harness/types.js.map +1 -1
  98. package/dist/harness/validate-args.js +18 -3
  99. package/dist/harness/validate-args.js.map +1 -1
  100. package/dist/hitl/interrupt.d.ts +2 -2
  101. package/dist/hitl/interrupt.d.ts.map +1 -1
  102. package/dist/hitl/interrupt.js +6 -4
  103. package/dist/hitl/interrupt.js.map +1 -1
  104. package/dist/hitl/resume.d.ts +10 -0
  105. package/dist/hitl/resume.d.ts.map +1 -1
  106. package/dist/hitl/resume.js +31 -0
  107. package/dist/hitl/resume.js.map +1 -1
  108. package/dist/index.js +1 -1
  109. package/dist/injected.d.ts.map +1 -1
  110. package/dist/injected.js.map +1 -1
  111. package/dist/inspect.d.ts.map +1 -1
  112. package/dist/inspect.js +28 -8
  113. package/dist/inspect.js.map +1 -1
  114. package/dist/lsp/client.d.ts +2 -0
  115. package/dist/lsp/client.d.ts.map +1 -1
  116. package/dist/lsp/client.js +62 -17
  117. package/dist/lsp/client.js.map +1 -1
  118. package/dist/lsp/index.d.ts.map +1 -1
  119. package/dist/lsp/index.js.map +1 -1
  120. package/dist/mcp/client.d.ts +2 -0
  121. package/dist/mcp/client.d.ts.map +1 -1
  122. package/dist/mcp/client.js +45 -14
  123. package/dist/mcp/client.js.map +1 -1
  124. package/dist/mcp/convert.js +1 -1
  125. package/dist/mcp/convert.js.map +1 -1
  126. package/dist/mcp/transport.d.ts +2 -0
  127. package/dist/mcp/transport.d.ts.map +1 -1
  128. package/dist/mcp/transport.js +33 -8
  129. package/dist/mcp/transport.js.map +1 -1
  130. package/dist/messages/index.d.ts.map +1 -1
  131. package/dist/messages/index.js +7 -1
  132. package/dist/messages/index.js.map +1 -1
  133. package/dist/models/anthropic.d.ts.map +1 -1
  134. package/dist/models/anthropic.js +25 -15
  135. package/dist/models/anthropic.js.map +1 -1
  136. package/dist/models/google.d.ts.map +1 -1
  137. package/dist/models/google.js +23 -7
  138. package/dist/models/google.js.map +1 -1
  139. package/dist/models/ollama.d.ts.map +1 -1
  140. package/dist/models/ollama.js +11 -1
  141. package/dist/models/ollama.js.map +1 -1
  142. package/dist/models/openai.d.ts.map +1 -1
  143. package/dist/models/openai.js +15 -3
  144. package/dist/models/openai.js.map +1 -1
  145. package/dist/models/openrouter.d.ts.map +1 -1
  146. package/dist/models/openrouter.js +14 -3
  147. package/dist/models/openrouter.js.map +1 -1
  148. package/dist/prebuilt/react-agent.d.ts.map +1 -1
  149. package/dist/prebuilt/react-agent.js +1 -0
  150. package/dist/prebuilt/react-agent.js.map +1 -1
  151. package/dist/pregel.d.ts +11 -6
  152. package/dist/pregel.d.ts.map +1 -1
  153. package/dist/pregel.js +473 -349
  154. package/dist/pregel.js.map +1 -1
  155. package/dist/retry.d.ts.map +1 -1
  156. package/dist/retry.js +7 -6
  157. package/dist/retry.js.map +1 -1
  158. package/dist/store/index.d.ts +1 -1
  159. package/dist/store/index.d.ts.map +1 -1
  160. package/dist/store/index.js +63 -13
  161. package/dist/store/index.js.map +1 -1
  162. package/dist/stream-events.d.ts.map +1 -1
  163. package/dist/stream-events.js +3 -9
  164. package/dist/stream-events.js.map +1 -1
  165. package/dist/streaming.d.ts +5 -2
  166. package/dist/streaming.d.ts.map +1 -1
  167. package/dist/streaming.js +9 -8
  168. package/dist/streaming.js.map +1 -1
  169. package/dist/swarm/graph.d.ts +16 -2
  170. package/dist/swarm/graph.d.ts.map +1 -1
  171. package/dist/swarm/graph.js +204 -53
  172. package/dist/swarm/graph.js.map +1 -1
  173. package/dist/swarm/index.d.ts +2 -1
  174. package/dist/swarm/index.d.ts.map +1 -1
  175. package/dist/swarm/index.js.map +1 -1
  176. package/dist/swarm/mailbox.d.ts.map +1 -1
  177. package/dist/swarm/mailbox.js +3 -1
  178. package/dist/swarm/mailbox.js.map +1 -1
  179. package/dist/swarm/mermaid.d.ts +2 -1
  180. package/dist/swarm/mermaid.d.ts.map +1 -1
  181. package/dist/swarm/mermaid.js +6 -3
  182. package/dist/swarm/mermaid.js.map +1 -1
  183. package/dist/swarm/pool.d.ts.map +1 -1
  184. package/dist/swarm/pool.js +30 -5
  185. package/dist/swarm/pool.js.map +1 -1
  186. package/dist/swarm/registry.d.ts.map +1 -1
  187. package/dist/swarm/registry.js +7 -0
  188. package/dist/swarm/registry.js.map +1 -1
  189. package/dist/swarm/scaling.d.ts +10 -1
  190. package/dist/swarm/scaling.d.ts.map +1 -1
  191. package/dist/swarm/scaling.js +85 -14
  192. package/dist/swarm/scaling.js.map +1 -1
  193. package/dist/swarm/snapshot.d.ts.map +1 -1
  194. package/dist/swarm/snapshot.js +10 -1
  195. package/dist/swarm/snapshot.js.map +1 -1
  196. package/dist/swarm/supervisor.js +20 -12
  197. package/dist/swarm/supervisor.js.map +1 -1
  198. package/dist/swarm/tracer.d.ts +3 -1
  199. package/dist/swarm/tracer.d.ts.map +1 -1
  200. package/dist/swarm/tracer.js +66 -15
  201. package/dist/swarm/tracer.js.map +1 -1
  202. package/dist/swarm/types.d.ts +1 -6
  203. package/dist/swarm/types.d.ts.map +1 -1
  204. package/dist/testing/index.d.ts +2 -2
  205. package/dist/testing/index.d.ts.map +1 -1
  206. package/dist/testing/index.js.map +1 -1
  207. package/dist/tools/define.d.ts.map +1 -1
  208. package/dist/tools/define.js +1 -0
  209. package/dist/tools/define.js.map +1 -1
  210. package/dist/tools/types.d.ts +2 -0
  211. package/dist/tools/types.d.ts.map +1 -1
  212. package/dist/types.d.ts +3 -1
  213. package/dist/types.d.ts.map +1 -1
  214. package/package.json +7 -1
package/dist/pregel.js CHANGED
@@ -3,18 +3,21 @@
3
3
  // New: interrupt() context management, HITL resume, token streaming
4
4
  // ============================================================
5
5
  import { START, END, Send, Command, } from "./types.js";
6
- import { RecursionLimitError, NodeNotFoundError, ONIInterrupt, NodeTimeoutError, ONIError, NodeExecutionError } from "./errors.js";
6
+ import { RecursionLimitError, NodeNotFoundError, ONIInterrupt, NodeTimeoutError, ONIError, NodeExecutionError, CircuitBreakerOpenError } from "./errors.js";
7
7
  import { CircuitBreaker } from "./circuit-breaker.js";
8
8
  import { withRetry } from "./retry.js";
9
9
  import { NamespacedCheckpointer } from "./checkpointers/namespaced.js";
10
10
  import { _runWithContext } from "./context.js";
11
- import { StreamWriterImpl, _installTokenHandler, _clearTokenHandler } from "./streaming.js";
11
+ import { StreamWriterImpl, _withTokenHandler } from "./streaming.js";
12
12
  import { NodeInterruptSignal, HITLInterruptException, HITLSessionStore, _installInterruptContext, _clearInterruptContext, } from "./hitl/index.js";
13
13
  import { EventBus } from "./events/bus.js";
14
14
  import { AuditLog } from "./guardrails/audit.js";
15
15
  import { BudgetTracker } from "./guardrails/budget.js";
16
+ import { runFilters } from "./guardrails/filters.js";
16
17
  import { ONITracer } from "./telemetry.js";
17
18
  const DEFAULT_RECURSION_LIMIT = 25;
19
+ /** Maximum entries in the node result cache — oldest entries evicted FIFO when full. */
20
+ const NODE_CACHE_MAX_SIZE = 256;
18
21
  export class ONIPregelRunner {
19
22
  nodes;
20
23
  edges;
@@ -27,13 +30,17 @@ export class ONIPregelRunner {
27
30
  hitlStore = new HITLSessionStore();
28
31
  nodeCache = new Map();
29
32
  circuitBreakers = new Map();
30
- /** Set to true when this runner is being invoked as a subgraph */
31
- _isSubgraph = false;
32
- /** Accumulated parent updates from Command.PARENT during subgraph execution */
33
- _parentUpdates = [];
33
+ /** Count of concurrent subgraph invocations active on this runner. >0 means running as subgraph. */
34
+ _subgraphRefCount = 0;
35
+ /** Per-invocation parent updates from Command.PARENT, keyed by parent threadId. */
36
+ _perInvocationParentUpdates = new Map();
37
+ /** Per-invocation checkpointer override for subgraph isolation, keyed by threadId. */
38
+ _perInvocationCheckpointer = new Map();
34
39
  eventBus;
35
40
  auditLog;
36
41
  budgetTracker;
42
+ contentFilters;
43
+ toolPermissions;
37
44
  tracer;
38
45
  /** Pre-indexed edges by source node — O(1) lookup instead of O(n) filter */
39
46
  _edgesBySource;
@@ -51,6 +58,8 @@ export class ONIPregelRunner {
51
58
  this.eventBus = new EventBus(listeners);
52
59
  this.auditLog = guardrails?.audit ? new AuditLog() : null;
53
60
  this.budgetTracker = guardrails?.budget ? new BudgetTracker(guardrails.budget) : null;
61
+ this.contentFilters = guardrails?.filters ?? [];
62
+ this.toolPermissions = guardrails?.toolPermissions;
54
63
  this.tracer = new ONITracer(tracer ?? null);
55
64
  // Pre-index edges by source for O(1) lookups in getNextNodes
56
65
  this._edgesBySource = new Map();
@@ -143,6 +152,7 @@ export class ONIPregelRunner {
143
152
  this.nodeCache.delete(cacheKey);
144
153
  }
145
154
  }
155
+ const _tid = config?.threadId ?? "unknown";
146
156
  const ctx = {
147
157
  config: config ?? {},
148
158
  store: this.store,
@@ -152,14 +162,41 @@ export class ONIPregelRunner {
152
162
  parentUpdates: [],
153
163
  step: step ?? 0,
154
164
  recursionLimit: recursionLimit ?? DEFAULT_RECURSION_LIMIT,
165
+ toolPermissions: this.toolPermissions,
166
+ _recordUsage: (agentName, modelId, usage) => {
167
+ if (!this.budgetTracker)
168
+ return;
169
+ const entries = this.budgetTracker.record(agentName, modelId, usage);
170
+ for (const e of entries)
171
+ this.auditLog?.record(_tid, e);
172
+ },
173
+ _emitEvent: (event) => this.eventBus.emit(event),
174
+ _auditRecord: (entry) => this.auditLog?.record(_tid, entry),
155
175
  };
156
176
  return _runWithContext(ctx, async () => {
157
177
  _installInterruptContext({
158
178
  nodeName: nodeDef.name,
159
- resumeValue: resumeValue,
160
- hasResume: hasResume ?? false,
179
+ resumeValues: hasResume ? [resumeValue] : [],
161
180
  });
162
181
  try {
182
+ // Content filter — input direction
183
+ if (this.contentFilters.length > 0) {
184
+ const inputStr = JSON.stringify(state);
185
+ const inputCheck = runFilters(this.contentFilters, inputStr, "input");
186
+ if (!inputCheck.passed) {
187
+ const threadId = config?.threadId ?? "unknown";
188
+ this.eventBus.emit({ type: "filter.blocked", filter: inputCheck.blockedBy, agent: nodeDef.name, direction: "input", reason: inputCheck.reason, timestamp: Date.now() });
189
+ this.auditLog?.record(threadId, { timestamp: Date.now(), agent: nodeDef.name, action: "filter.blocked", data: { filter: inputCheck.blockedBy, direction: "input", reason: inputCheck.reason } });
190
+ throw new Error(`Content blocked by filter "${inputCheck.blockedBy}" on input to node "${nodeDef.name}": ${inputCheck.reason}`);
191
+ }
192
+ // Apply redaction if content was rewritten by a redacting filter
193
+ if (inputCheck.content !== inputStr) {
194
+ try {
195
+ state = JSON.parse(inputCheck.content);
196
+ }
197
+ catch { /* leave state unchanged on parse failure */ }
198
+ }
199
+ }
163
200
  const run = () => Promise.resolve(nodeDef.fn(state, config));
164
201
  // Core execute call: retry-aware
165
202
  const executeCall = async () => {
@@ -204,15 +241,44 @@ export class ONIPregelRunner {
204
241
  // Pass through interrupt signals (thrown by interrupt() inside nodes)
205
242
  if (err instanceof NodeInterruptSignal)
206
243
  throw err;
207
- // Pass through structured ONI errors (NodeExecutionError from retry, NodeTimeoutError, etc.)
208
- if (err instanceof ONIError)
209
- throw err;
210
- // Wrap raw errors and non-Error throws in NodeExecutionError
211
- const cause = err instanceof Error ? err : new Error(String(err));
212
- throw new NodeExecutionError(nodeDef.name, cause);
244
+ // Circuit breaker open — invoke user fallback with real state + error
245
+ if (err instanceof CircuitBreakerOpenError && nodeDef.circuitBreaker?.fallback) {
246
+ result = nodeDef.circuitBreaker.fallback(state, err);
247
+ }
248
+ else {
249
+ // Pass through structured ONI errors (NodeExecutionError from retry, NodeTimeoutError, etc.)
250
+ if (err instanceof ONIError)
251
+ throw err;
252
+ // Wrap raw errors and non-Error throws in NodeExecutionError
253
+ const cause = err instanceof Error ? err : new Error(String(err));
254
+ throw new NodeExecutionError(nodeDef.name, cause);
255
+ }
256
+ }
257
+ // Content filter — output direction
258
+ if (this.contentFilters.length > 0 && result != null) {
259
+ const outputStr = JSON.stringify(result);
260
+ const outputCheck = runFilters(this.contentFilters, outputStr, "output");
261
+ if (!outputCheck.passed) {
262
+ const threadId = config?.threadId ?? "unknown";
263
+ this.eventBus.emit({ type: "filter.blocked", filter: outputCheck.blockedBy, agent: nodeDef.name, direction: "output", reason: outputCheck.reason, timestamp: Date.now() });
264
+ this.auditLog?.record(threadId, { timestamp: Date.now(), agent: nodeDef.name, action: "filter.blocked", data: { filter: outputCheck.blockedBy, direction: "output", reason: outputCheck.reason } });
265
+ throw new Error(`Content blocked by filter "${outputCheck.blockedBy}" on output of node "${nodeDef.name}": ${outputCheck.reason}`);
266
+ }
267
+ // Apply redaction if content was rewritten by a redacting filter
268
+ if (outputCheck.content !== outputStr) {
269
+ try {
270
+ result = JSON.parse(outputCheck.content);
271
+ }
272
+ catch { /* leave result unchanged on parse failure */ }
273
+ }
213
274
  }
214
- // Store in cache (reuse key computed above)
275
+ // Store in cache (reuse key computed above); evict oldest entry when full
215
276
  if (nodeDef.cache && cacheKey) {
277
+ if (this.nodeCache.size >= NODE_CACHE_MAX_SIZE) {
278
+ const oldest = this.nodeCache.keys().next().value;
279
+ if (oldest !== undefined)
280
+ this.nodeCache.delete(oldest);
281
+ }
216
282
  this.nodeCache.set(cacheKey, { result, timestamp: Date.now() });
217
283
  }
218
284
  return result;
@@ -253,20 +319,29 @@ export class ONIPregelRunner {
253
319
  const modeMessages = modes.has("messages");
254
320
  // Telemetry: graph-level span
255
321
  const graphSpan = this.tracer.startGraphSpan("invoke", { threadId, agentId });
256
- // Load resume values from config (set by resume() call)
257
- const resumeMap = config?.__resumeValues ?? {};
258
- // Load or init state
259
- let state;
260
- let step = 0;
261
- let pendingNodes = [];
262
- let pendingSends = [];
263
- if (this.checkpointer && config?.threadId) {
264
- const cp = await this.checkpointer.get(threadId);
265
- if (cp) {
266
- state = this.applyUpdate(cp.state, input);
267
- step = cp.step;
268
- pendingNodes = cp.nextNodes;
269
- pendingSends = cp.pendingSends ?? [];
322
+ let step = 0; // declared before try so finally can read it for setAttribute
323
+ try {
324
+ // Load resume values from config (set by resume() call)
325
+ const resumeMap = config?.__resumeValues ?? {};
326
+ // Load or init state
327
+ let state;
328
+ let pendingNodes = [];
329
+ let pendingSends = [];
330
+ const effectiveCheckpointer = (this._perInvocationCheckpointer.get(threadId) ?? this.checkpointer);
331
+ if (effectiveCheckpointer && config?.threadId) {
332
+ const cp = await effectiveCheckpointer.get(threadId);
333
+ if (cp) {
334
+ state = this.applyUpdate(cp.state, input);
335
+ step = cp.step;
336
+ pendingNodes = cp.nextNodes;
337
+ pendingSends = cp.pendingSends ?? [];
338
+ }
339
+ else {
340
+ state = this.applyUpdate(this.buildInitialState(), input);
341
+ const init = this.getNextNodes(START, state, config);
342
+ pendingNodes = init.nodes;
343
+ pendingSends = init.sends;
344
+ }
270
345
  }
271
346
  else {
272
347
  state = this.applyUpdate(this.buildInitialState(), input);
@@ -274,58 +349,318 @@ export class ONIPregelRunner {
274
349
  pendingNodes = init.nodes;
275
350
  pendingSends = init.sends;
276
351
  }
277
- }
278
- else {
279
- state = this.applyUpdate(this.buildInitialState(), input);
280
- const init = this.getNextNodes(START, state, config);
281
- pendingNodes = init.nodes;
282
- pendingSends = init.sends;
283
- }
284
- if (modeValues)
285
- yield tag(this.evt("state_update", state, step, agentId), "values");
286
- // ---- Main superstep loop ----
287
- while (true) {
288
- const nextNodes = [];
289
- const nextSends = [];
290
- state = this.resetEphemeral(state);
291
- // Drain sends — group by target node for parallel fan-out execution
292
- const sendGroups = new Map();
293
- for (const send of pendingSends) {
294
- if (!sendGroups.has(send.node))
295
- sendGroups.set(send.node, []);
296
- sendGroups.get(send.node).push(send);
297
- if (modeDebug)
298
- yield tag(this.evt("send", send, step, agentId, send.node), "debug");
299
- }
300
- // Execute fan-out sends (each Send → separate node execution with its own state)
301
- if (sendGroups.size > 0) {
302
- // Build promises directly — avoids spread+flatMap intermediate arrays
303
- const sendPromises = [];
304
- for (const [node, sends] of sendGroups) {
305
- const nodeDef = this.nodes.get(node);
352
+ if (modeValues)
353
+ yield tag(this.evt("state_update", state, step, agentId), "values");
354
+ // ---- Main superstep loop ----
355
+ while (true) {
356
+ const nextNodes = [];
357
+ const nextSends = [];
358
+ state = this.resetEphemeral(state);
359
+ // Drain sends — group by target node for parallel fan-out execution
360
+ const sendGroups = new Map();
361
+ for (const send of pendingSends) {
362
+ if (!sendGroups.has(send.node))
363
+ sendGroups.set(send.node, []);
364
+ sendGroups.get(send.node).push(send);
365
+ if (modeDebug)
366
+ yield tag(this.evt("send", send, step, agentId, send.node), "debug");
367
+ }
368
+ // Recursion limit guard — must fire before any sends execute so that
369
+ // node side-effects are not applied to a step that will be discarded.
370
+ if (step >= recursionLimit)
371
+ throw new RecursionLimitError(recursionLimit);
372
+ // Execute fan-out sends (each Send → separate node execution with its own state)
373
+ if (sendGroups.size > 0) {
374
+ // Build promises directly — avoids spread+flatMap intermediate arrays
375
+ const sendPromises = [];
376
+ for (const [node, sends] of sendGroups) {
377
+ const nodeDef = this.nodes.get(node);
378
+ if (!nodeDef)
379
+ throw new NodeNotFoundError(node);
380
+ for (const send of sends) {
381
+ sendPromises.push((async () => {
382
+ const sendState = this.applyUpdate(state, send.args);
383
+ const result = await this.executeNode(nodeDef, sendState, config, undefined, undefined, undefined, step, recursionLimit);
384
+ return { name: node, result };
385
+ })());
386
+ }
387
+ }
388
+ const sendResults = await Promise.all(sendPromises);
389
+ // Reduce all send results through channels
390
+ for (const { name, result } of sendResults) {
391
+ if (result instanceof Command) {
392
+ if (result.update)
393
+ state = this.applyUpdate(state, result.update);
394
+ const gotos = result.goto
395
+ ? (Array.isArray(result.goto) ? result.goto : [result.goto])
396
+ : this.getNextNodes(name, state, config).nodes;
397
+ nextNodes.push(...gotos);
398
+ }
399
+ else if (result && typeof result === "object") {
400
+ state = this.applyUpdate(state, result);
401
+ const { nodes, sends } = this.getNextNodes(name, state, config);
402
+ nextNodes.push(...nodes);
403
+ nextSends.push(...sends);
404
+ }
405
+ else {
406
+ const { nodes, sends } = this.getNextNodes(name, state, config);
407
+ nextNodes.push(...nodes);
408
+ nextSends.push(...sends);
409
+ }
410
+ if (modeUpdates || modeDebug) {
411
+ const delta = result instanceof Command ? (result.update ?? {}) : (result ?? {});
412
+ if (modeUpdates)
413
+ yield tag(this.evt("node_end", delta, step, agentId, name), "updates");
414
+ if (modeDebug)
415
+ yield tag(this.evt("node_end", delta, step, agentId, name), "debug");
416
+ }
417
+ }
418
+ }
419
+ pendingSends = [];
420
+ // Filter executable nodes (non-END), excluding nodes already handled by sends
421
+ const executableNodes = pendingNodes.filter((n) => n !== END && !sendGroups.has(n));
422
+ if (executableNodes.length === 0 && sendGroups.size === 0)
423
+ break;
424
+ // Emit debug node_start events before parallel execution
425
+ if (modeDebug) {
426
+ for (const nodeName of executableNodes) {
427
+ const name = nodeName;
428
+ if (!this.nodes.has(name))
429
+ throw new NodeNotFoundError(name);
430
+ // Static interrupt BEFORE (check before emitting start)
431
+ if (this.interruptConfig.interruptBefore?.includes(name))
432
+ throw new ONIInterrupt(name, "before", state);
433
+ this.checkDynamicInterrupt(name, "before", state, config);
434
+ yield tag(this.evt("node_start", {}, step, agentId, name), "debug");
435
+ }
436
+ }
437
+ // Execute all active nodes in parallel
438
+ const allCustomEvents = [];
439
+ const allMessageEvents = [];
440
+ const allSubgraphEvents = [];
441
+ const nodeWriters = new Map();
442
+ // Track the first HITL interrupt across all parallel nodes. We use
443
+ // allSettled (not Promise.all) so that when one node raises an interrupt,
444
+ // all other in-flight nodes complete before the interrupt is surfaced.
445
+ // This prevents orphaned background executions that would apply side
446
+ // effects without being checkpointed, causing double-application on resume.
447
+ let pendingInterrupt = null;
448
+ const allSettledResults = await Promise.allSettled(executableNodes.map(async (nodeName) => {
449
+ const name = nodeName;
450
+ const nodeDef = this.nodes.get(name);
306
451
  if (!nodeDef)
307
- throw new NodeNotFoundError(node);
308
- for (const send of sends) {
309
- sendPromises.push((async () => {
310
- const sendState = this.applyUpdate(state, send.args);
311
- const result = await this.executeNode(nodeDef, sendState, config, undefined, undefined, undefined, step, recursionLimit);
312
- return { name: node, result };
313
- })());
452
+ throw new NodeNotFoundError(name);
453
+ // Static interrupt BEFORE (non-debug mode)
454
+ if (!modeDebug) {
455
+ if (this.interruptConfig.interruptBefore?.includes(name))
456
+ throw new ONIInterrupt(name, "before", state);
457
+ this.checkDynamicInterrupt(name, "before", state, config);
458
+ }
459
+ // Create a StreamWriter for this node
460
+ const messageId = `msg-${threadId}-${step}-${name}`;
461
+ const customEvents = [];
462
+ const messageEvents = [];
463
+ const writerImpl = new StreamWriterImpl((evt) => customEvents.push(evt), (_token) => { }, (evt) => messageEvents.push(evt), name, step, messageId, agentId);
464
+ nodeWriters.set(name, writerImpl);
465
+ // Check if this node has a pending resume value
466
+ const resumeValue = resumeMap[name];
467
+ const hasResume = name in resumeMap;
468
+ // Emit agent.start lifecycle event
469
+ const nodeStartTime = Date.now();
470
+ this.eventBus.emit({ type: "agent.start", agent: name, timestamp: nodeStartTime, step });
471
+ // Telemetry: node-level span
472
+ const nodeSpan = this.tracer.startNodeSpan(name, { threadId, step, agentId });
473
+ let result;
474
+ let subParentUpdates = [];
475
+ try {
476
+ // Scope emitToken to this node's async context via ALS — parallel nodes each
477
+ // get their own handler so tokens are never dropped or misrouted.
478
+ result = await _withTokenHandler((token) => writerImpl.token(token), async () => {
479
+ if (nodeDef.subgraph) {
480
+ const childRunner = nodeDef.subgraph._runner;
481
+ // Per-invocation key for concurrent-safe state isolation
482
+ const invocationKey = threadId;
483
+ if (childRunner) {
484
+ childRunner._subgraphRefCount++;
485
+ childRunner._perInvocationParentUpdates.set(invocationKey, []);
486
+ }
487
+ // Install a namespaced checkpointer per invocation instead of swapping a shared field
488
+ if (this.checkpointer && childRunner) {
489
+ childRunner._perInvocationCheckpointer.set(invocationKey, new NamespacedCheckpointer(this.checkpointer, name));
490
+ }
491
+ // Stream the subgraph — always clean up per-invocation state, even on throw/interrupt
492
+ let subFinalState;
493
+ try {
494
+ const childStreamMode = ["debug", "values"];
495
+ for await (const evt of nodeDef.subgraph.stream(state, {
496
+ ...config,
497
+ // Pass the parent's effective threadId explicitly so the child's
498
+ // _perInvocationParentUpdates lookup at Command.PARENT time uses
499
+ // the same key that was registered in invocationKey above.
500
+ threadId: invocationKey,
501
+ parentRunId: config?.threadId,
502
+ streamMode: childStreamMode,
503
+ })) {
504
+ // Namespace-prefix the node name
505
+ allSubgraphEvents.push({
506
+ ...evt,
507
+ node: evt.node ? `${name}:${evt.node}` : name,
508
+ });
509
+ // Track the last state_update as the final subgraph state
510
+ if (evt.event === "state_update") {
511
+ subFinalState = evt.data;
512
+ }
513
+ }
514
+ if (childRunner) {
515
+ subParentUpdates = childRunner._perInvocationParentUpdates.get(invocationKey) ?? [];
516
+ }
517
+ }
518
+ finally {
519
+ // Clean up per-invocation state — decrement ref count, remove Maps entries
520
+ if (childRunner) {
521
+ childRunner._subgraphRefCount--;
522
+ childRunner._perInvocationParentUpdates.delete(invocationKey);
523
+ childRunner._perInvocationCheckpointer.delete(invocationKey);
524
+ }
525
+ }
526
+ return subFinalState ?? {};
527
+ }
528
+ else {
529
+ return this.executeNode(nodeDef, state, config, resumeValue, hasResume, writerImpl, step, recursionLimit);
530
+ }
531
+ });
532
+ }
533
+ catch (err) {
534
+ // Catch interrupt() signals thrown from inside nodes
535
+ if (err instanceof NodeInterruptSignal) {
536
+ const iv = {
537
+ value: err.value,
538
+ node: name,
539
+ resumeId: err.resumeId,
540
+ timestamp: Date.now(),
541
+ };
542
+ const exc = new HITLInterruptException(threadId, iv, state);
543
+ // Claim the first-interrupt slot SYNCHRONOUSLY before any await.
544
+ // Both concurrent interrupt handlers check this flag before yielding,
545
+ // so whichever catch block runs first exclusively owns the checkpoint
546
+ // save. Without this guard, the last saveCheckpoint wins and its
547
+ // nextNodes diverge from pendingInterrupt's node, corrupting resume().
548
+ const isFirstInterrupt = !pendingInterrupt;
549
+ if (isFirstInterrupt)
550
+ pendingInterrupt = exc;
551
+ // Only save checkpoint for the first interrupt — the stored nextNodes
552
+ // must match pendingInterrupt's node so resume() restores correctly.
553
+ if (isFirstInterrupt) {
554
+ await this.saveCheckpoint(threadId, step, state, [name], pendingSends, agentId, config?.metadata);
555
+ // Record HITL session if checkpointer exists
556
+ if (effectiveCheckpointer) {
557
+ const cp = await effectiveCheckpointer.get(threadId);
558
+ if (cp)
559
+ this.hitlStore.record(threadId, iv, cp);
560
+ }
561
+ }
562
+ throw exc; // marks this node's settled result as rejected
563
+ }
564
+ // Record to DLQ before re-throwing — use original cause if wrapped
565
+ if (this.dlq && err instanceof Error) {
566
+ const dlqErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
567
+ this.dlq.record(threadId, name, state, dlqErr, nodeDef.retry?.maxAttempts ?? 1);
568
+ }
569
+ // Telemetry: record error on node span — use original cause if wrapped
570
+ if (err instanceof Error) {
571
+ const telErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
572
+ this.tracer.recordError(nodeSpan, telErr);
573
+ }
574
+ this.tracer.endSpan(nodeSpan);
575
+ // Lifecycle event: emit error for non-interrupt failures
576
+ if (err instanceof Error) {
577
+ this.eventBus.emit({ type: "error", agent: name, error: err, timestamp: Date.now() });
578
+ }
579
+ throw err;
580
+ }
581
+ // Telemetry: end node span
582
+ this.tracer.endSpan(nodeSpan);
583
+ // Emit agent.end lifecycle event
584
+ this.eventBus.emit({ type: "agent.end", agent: name, timestamp: Date.now(), step, duration: Date.now() - nodeStartTime });
585
+ // Collect events for yielding after parallel execution
586
+ allCustomEvents.push(...customEvents);
587
+ allMessageEvents.push(...messageEvents);
588
+ return { name, result, subParentUpdates };
589
+ }));
590
+ // Extract results now that all nodes have settled.
591
+ // Re-throw the first non-interrupt error (DLQ/telemetry already handled
592
+ // inside each node's catch block), then surface any HITL interrupt.
593
+ const nodeResults = [];
594
+ for (const settled of allSettledResults) {
595
+ if (settled.status === "fulfilled") {
596
+ nodeResults.push(settled.value);
597
+ }
598
+ else if (!(settled.reason instanceof HITLInterruptException)) {
599
+ throw settled.reason; // first non-interrupt error
600
+ }
601
+ }
602
+ if (pendingInterrupt)
603
+ throw pendingInterrupt;
604
+ // Yield buffered subgraph events — filtered by parent's active modes
605
+ for (const evt of allSubgraphEvents) {
606
+ const e = evt;
607
+ if (modeDebug) {
608
+ yield tag(e, "debug");
609
+ }
610
+ else if (modeUpdates && (e.event === "node_end")) {
611
+ yield tag(e, "updates");
612
+ }
613
+ else if (modeValues && e.event === "state_update") {
614
+ yield tag(e, "values");
615
+ }
616
+ // Custom and message events from subgraphs are forwarded if those modes are active
617
+ if (modeCustom && e.event === "custom") {
618
+ yield tag(e, "custom");
619
+ }
620
+ if (modeMessages && (e.event === "messages" || e.event === "messages/complete")) {
621
+ yield tag(e, "messages");
314
622
  }
315
623
  }
316
- const sendResults = await Promise.all(sendPromises);
317
- // Reduce all send results through channels
318
- for (const { name, result } of sendResults) {
624
+ // Apply results
625
+ const stepWrites = [];
626
+ for (const { name, result, subParentUpdates: parentUpdates } of nodeResults) {
319
627
  if (result instanceof Command) {
320
- if (result.update)
321
- state = this.applyUpdate(state, result.update);
322
- const gotos = result.goto
323
- ? (Array.isArray(result.goto) ? result.goto : [result.goto])
324
- : this.getNextNodes(name, state, config).nodes;
325
- nextNodes.push(...gotos);
628
+ if (result.graph === Command.PARENT) {
629
+ // Push update to parent — do NOT apply locally
630
+ if (!this._subgraphRefCount) {
631
+ throw new Error("Command.PARENT used but graph is not running as a subgraph");
632
+ }
633
+ if (result.update) {
634
+ const myParentUpdates = this._perInvocationParentUpdates.get(threadId);
635
+ if (myParentUpdates)
636
+ myParentUpdates.push(result.update);
637
+ }
638
+ // Still resolve next nodes normally
639
+ const { nodes, sends } = this.getNextNodes(name, state, config);
640
+ nextNodes.push(...nodes);
641
+ nextSends.push(...sends);
642
+ }
643
+ else {
644
+ if (result.update) {
645
+ state = this.applyUpdate(state, result.update);
646
+ if (Object.keys(result.update).length > 0) {
647
+ stepWrites.push({ nodeId: name, writes: result.update });
648
+ }
649
+ }
650
+ const gotos = result.goto
651
+ ? (Array.isArray(result.goto) ? result.goto : [result.goto])
652
+ : this.getNextNodes(name, state, config).nodes;
653
+ nextNodes.push(...gotos);
654
+ if (result.send)
655
+ nextSends.push(...result.send.map((s) => ({ node: s.node, args: s.args })));
656
+ }
326
657
  }
327
658
  else if (result && typeof result === "object") {
328
659
  state = this.applyUpdate(state, result);
660
+ const writes = result;
661
+ if (Object.keys(writes).length > 0) {
662
+ stepWrites.push({ nodeId: name, writes });
663
+ }
329
664
  const { nodes, sends } = this.getNextNodes(name, state, config);
330
665
  nextNodes.push(...nodes);
331
666
  nextSends.push(...sends);
@@ -335,6 +670,10 @@ export class ONIPregelRunner {
335
670
  nextNodes.push(...nodes);
336
671
  nextSends.push(...sends);
337
672
  }
673
+ // Apply parent updates from subgraph Command.PARENT (after normal result)
674
+ for (const pu of parentUpdates) {
675
+ state = this.applyUpdate(state, pu);
676
+ }
338
677
  if (modeUpdates || modeDebug) {
339
678
  const delta = result instanceof Command ? (result.update ?? {}) : (result ?? {});
340
679
  if (modeUpdates)
@@ -342,286 +681,69 @@ export class ONIPregelRunner {
342
681
  if (modeDebug)
343
682
  yield tag(this.evt("node_end", delta, step, agentId, name), "debug");
344
683
  }
345
- }
346
- }
347
- pendingSends = [];
348
- // Filter executable nodes (non-END), excluding nodes already handled by sends
349
- const executableNodes = pendingNodes.filter((n) => n !== END && !sendGroups.has(n));
350
- if (executableNodes.length === 0 && sendGroups.size === 0)
351
- break;
352
- if (step >= recursionLimit)
353
- throw new RecursionLimitError(recursionLimit);
354
- // Emit debug node_start events before parallel execution
355
- if (modeDebug) {
356
- for (const nodeName of executableNodes) {
357
- const name = nodeName;
358
- if (!this.nodes.has(name))
359
- throw new NodeNotFoundError(name);
360
- // Static interrupt BEFORE (check before emitting start)
361
- if (this.interruptConfig.interruptBefore?.includes(name))
362
- throw new ONIInterrupt(name, "before", state);
363
- this.checkDynamicInterrupt(name, "before", state, config);
364
- yield tag(this.evt("node_start", {}, step, agentId, name), "debug");
365
- }
366
- }
367
- // Execute all active nodes in parallel
368
- const allCustomEvents = [];
369
- const allMessageEvents = [];
370
- const allSubgraphEvents = [];
371
- const nodeWriters = new Map();
372
- const nodeResults = await Promise.all(executableNodes.map(async (nodeName) => {
373
- const name = nodeName;
374
- const nodeDef = this.nodes.get(name);
375
- if (!nodeDef)
376
- throw new NodeNotFoundError(name);
377
- // Static interrupt BEFORE (non-debug mode)
378
- if (!modeDebug) {
379
- if (this.interruptConfig.interruptBefore?.includes(name))
380
- throw new ONIInterrupt(name, "before", state);
381
- this.checkDynamicInterrupt(name, "before", state, config);
382
- }
383
- // Create a StreamWriter for this node
384
- const messageId = `msg-${threadId}-${step}-${name}`;
385
- const customEvents = [];
386
- const messageEvents = [];
387
- const writerImpl = new StreamWriterImpl((evt) => customEvents.push(evt), (token) => { }, (evt) => messageEvents.push(evt), name, step, messageId, agentId);
388
- nodeWriters.set(name, writerImpl);
389
- // Install global emitToken handler so emitToken() routes to this node's writer
390
- _installTokenHandler((token) => writerImpl.token(token));
391
- // Check if this node has a pending resume value
392
- const resumeValue = resumeMap[name];
393
- const hasResume = name in resumeMap;
394
- // Emit agent.start lifecycle event
395
- const nodeStartTime = Date.now();
396
- this.eventBus.emit({ type: "agent.start", agent: name, timestamp: nodeStartTime, step });
397
- // Telemetry: node-level span
398
- const nodeSpan = this.tracer.startNodeSpan(name, { threadId, step, agentId });
399
- let result;
400
- let subParentUpdates = [];
401
- try {
402
- if (nodeDef.subgraph) {
403
- // Mark child runner as a subgraph so Command.PARENT works
404
- const childRunner = nodeDef.subgraph._runner;
405
- if (childRunner) {
406
- childRunner._isSubgraph = true;
407
- childRunner._parentUpdates = [];
408
- }
409
- // Namespace the subgraph's checkpointer for isolation
410
- if (this.checkpointer && childRunner) {
411
- childRunner.checkpointer = new NamespacedCheckpointer(this.checkpointer, name);
412
- }
413
- // Stream the subgraph instead of invoke — buffer events for yielding after Promise.all
414
- // Use debug+values so we capture ALL event types AND state_update for final state
415
- let subFinalState;
416
- const childStreamMode = ["debug", "values"];
417
- for await (const evt of nodeDef.subgraph.stream(state, {
418
- ...config,
419
- parentRunId: config?.threadId,
420
- streamMode: childStreamMode,
421
- })) {
422
- // Namespace-prefix the node name
423
- allSubgraphEvents.push({
424
- ...evt,
425
- node: evt.node ? `${name}:${evt.node}` : name,
426
- });
427
- // Track the last state_update as the final subgraph state
428
- if (evt.event === "state_update") {
429
- subFinalState = evt.data;
430
- }
431
- }
432
- // Collect parent updates from child
433
- if (childRunner) {
434
- subParentUpdates = childRunner._parentUpdates;
435
- childRunner._isSubgraph = false;
436
- childRunner._parentUpdates = [];
437
- }
438
- result = subFinalState ?? {};
439
- }
440
- else {
441
- result = await this.executeNode(nodeDef, state, config, resumeValue, hasResume, writerImpl, step, recursionLimit);
684
+ // Static interrupt AFTER
685
+ if (this.interruptConfig.interruptAfter?.includes(name)) {
686
+ await this.saveCheckpoint(threadId, step, state, nextNodes, nextSends, agentId, config?.metadata);
687
+ throw new ONIInterrupt(name, "after", state);
442
688
  }
689
+ this.checkDynamicInterrupt(name, "after", state, config);
443
690
  }
444
- catch (err) {
445
- // Catch interrupt() signals thrown from inside nodes
446
- if (err instanceof NodeInterruptSignal) {
447
- const iv = {
448
- value: err.value,
449
- node: name,
450
- resumeId: err.resumeId,
451
- timestamp: Date.now(),
452
- };
453
- // Save checkpoint before surfacing interrupt
454
- await this.saveCheckpoint(threadId, step, state, [name], pendingSends, agentId, config?.metadata);
455
- // Record HITL session if checkpointer exists
456
- if (this.checkpointer) {
457
- const cp = await this.checkpointer.get(threadId);
458
- if (cp)
459
- this.hitlStore.record(threadId, iv, cp);
460
- }
461
- throw new HITLInterruptException(threadId, iv, state);
462
- }
463
- // Record to DLQ before re-throwing — use original cause if wrapped
464
- if (this.dlq && err instanceof Error) {
465
- const dlqErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
466
- this.dlq.record(threadId, name, state, dlqErr, nodeDef.retry?.maxAttempts ?? 1);
467
- }
468
- // Telemetry: record error on node span — use original cause if wrapped
469
- if (err instanceof Error) {
470
- const telErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
471
- this.tracer.recordError(nodeSpan, telErr);
691
+ // Yield buffered custom/message events based on stream mode
692
+ if (modeCustom || modeDebug) {
693
+ for (const evt of allCustomEvents) {
694
+ if (modeCustom)
695
+ yield tag(evt, "custom");
696
+ if (modeDebug)
697
+ yield tag(evt, "debug");
472
698
  }
473
- this.tracer.endSpan(nodeSpan);
474
- _clearTokenHandler();
475
- throw err;
476
- }
477
- // Clear emitToken handler after node completes
478
- _clearTokenHandler();
479
- // Telemetry: end node span
480
- this.tracer.endSpan(nodeSpan);
481
- // Emit agent.end lifecycle event
482
- this.eventBus.emit({ type: "agent.end", agent: name, timestamp: Date.now(), step, duration: Date.now() - nodeStartTime });
483
- // Collect events for yielding after parallel execution
484
- allCustomEvents.push(...customEvents);
485
- allMessageEvents.push(...messageEvents);
486
- return { name, result, subParentUpdates };
487
- }));
488
- // Yield buffered subgraph events — filtered by parent's active modes
489
- for (const evt of allSubgraphEvents) {
490
- const e = evt;
491
- if (modeDebug) {
492
- yield tag(e, "debug");
493
- }
494
- else if (modeUpdates && (e.event === "node_end")) {
495
- yield tag(e, "updates");
496
- }
497
- else if (modeValues && e.event === "state_update") {
498
- yield tag(e, "values");
499
- }
500
- // Custom and message events from subgraphs are forwarded if those modes are active
501
- if (modeCustom && e.event === "custom") {
502
- yield tag(e, "custom");
503
699
  }
504
- if (modeMessages && (e.event === "messages" || e.event === "messages/complete")) {
505
- yield tag(e, "messages");
506
- }
507
- }
508
- // Apply results
509
- const stepWrites = [];
510
- for (const { name, result, subParentUpdates: parentUpdates } of nodeResults) {
511
- if (result instanceof Command) {
512
- if (result.graph === Command.PARENT) {
513
- // Push update to parent — do NOT apply locally
514
- if (!this._isSubgraph) {
515
- throw new Error("Command.PARENT used but graph is not running as a subgraph");
516
- }
517
- if (result.update)
518
- this._parentUpdates.push(result.update);
519
- // Still resolve next nodes normally
520
- const { nodes, sends } = this.getNextNodes(name, state, config);
521
- nextNodes.push(...nodes);
522
- nextSends.push(...sends);
700
+ if (modeMessages || modeDebug) {
701
+ for (const evt of allMessageEvents) {
702
+ if (modeMessages)
703
+ yield tag(evt, "messages");
704
+ if (modeDebug)
705
+ yield tag(evt, "debug");
523
706
  }
524
- else {
525
- if (result.update) {
526
- state = this.applyUpdate(state, result.update);
527
- if (Object.keys(result.update).length > 0) {
528
- stepWrites.push({ nodeId: name, writes: result.update });
529
- }
707
+ // Emit messages/complete for each node that produced tokens
708
+ for (const [, writer] of nodeWriters) {
709
+ const complete = writer._complete();
710
+ if (complete) {
711
+ if (modeMessages)
712
+ yield tag(complete, "messages");
713
+ if (modeDebug)
714
+ yield tag(complete, "debug");
530
715
  }
531
- const gotos = result.goto
532
- ? (Array.isArray(result.goto) ? result.goto : [result.goto])
533
- : this.getNextNodes(name, state, config).nodes;
534
- nextNodes.push(...gotos);
535
- if (result.send)
536
- nextSends.push(...result.send.map((s) => ({ node: s.node, args: s.args })));
537
716
  }
538
717
  }
539
- else if (result && typeof result === "object") {
540
- state = this.applyUpdate(state, result);
541
- const writes = result;
542
- if (Object.keys(writes).length > 0) {
543
- stepWrites.push({ nodeId: name, writes });
544
- }
545
- const { nodes, sends } = this.getNextNodes(name, state, config);
546
- nextNodes.push(...nodes);
547
- nextSends.push(...sends);
718
+ if (modeValues)
719
+ yield tag(this.evt("state_update", state, step, agentId), "values");
720
+ // Deduplicate nextNodes — avoid Set+spread when no dupes (common case)
721
+ if (nextNodes.length <= 1) {
722
+ pendingNodes = nextNodes;
548
723
  }
549
724
  else {
550
- const { nodes, sends } = this.getNextNodes(name, state, config);
551
- nextNodes.push(...nodes);
552
- nextSends.push(...sends);
553
- }
554
- // Apply parent updates from subgraph Command.PARENT (after normal result)
555
- for (const pu of parentUpdates) {
556
- state = this.applyUpdate(state, pu);
557
- }
558
- if (modeUpdates || modeDebug) {
559
- const delta = result instanceof Command ? (result.update ?? {}) : (result ?? {});
560
- if (modeUpdates)
561
- yield tag(this.evt("node_end", delta, step, agentId, name), "updates");
562
- if (modeDebug)
563
- yield tag(this.evt("node_end", delta, step, agentId, name), "debug");
564
- }
565
- // Static interrupt AFTER
566
- if (this.interruptConfig.interruptAfter?.includes(name)) {
567
- await this.saveCheckpoint(threadId, step, state, nextNodes, nextSends, agentId, config?.metadata);
568
- throw new ONIInterrupt(name, "after", state);
569
- }
570
- this.checkDynamicInterrupt(name, "after", state, config);
571
- }
572
- // Yield buffered custom/message events based on stream mode
573
- if (modeCustom || modeDebug) {
574
- for (const evt of allCustomEvents) {
575
- if (modeCustom)
576
- yield tag(evt, "custom");
577
- if (modeDebug)
578
- yield tag(evt, "debug");
579
- }
580
- }
581
- if (modeMessages || modeDebug) {
582
- for (const evt of allMessageEvents) {
583
- if (modeMessages)
584
- yield tag(evt, "messages");
585
- if (modeDebug)
586
- yield tag(evt, "debug");
587
- }
588
- // Emit messages/complete for each node that produced tokens
589
- for (const [, writer] of nodeWriters) {
590
- const complete = writer._complete();
591
- if (complete) {
592
- if (modeMessages)
593
- yield tag(complete, "messages");
594
- if (modeDebug)
595
- yield tag(complete, "debug");
725
+ const seen = new Set();
726
+ pendingNodes = [];
727
+ for (const n of nextNodes) {
728
+ const key = n;
729
+ if (!seen.has(key)) {
730
+ seen.add(key);
731
+ pendingNodes.push(n);
732
+ }
596
733
  }
597
734
  }
735
+ pendingSends = nextSends;
736
+ step++;
737
+ await this.saveCheckpoint(threadId, step, state, pendingNodes, pendingSends, agentId, config?.metadata, stepWrites);
598
738
  }
599
739
  if (modeValues)
600
740
  yield tag(this.evt("state_update", state, step, agentId), "values");
601
- // Deduplicate nextNodes — avoid Set+spread when no dupes (common case)
602
- if (nextNodes.length <= 1) {
603
- pendingNodes = nextNodes;
604
- }
605
- else {
606
- const seen = new Set();
607
- pendingNodes = [];
608
- for (const n of nextNodes) {
609
- const key = n;
610
- if (!seen.has(key)) {
611
- seen.add(key);
612
- pendingNodes.push(n);
613
- }
614
- }
615
- }
616
- pendingSends = nextSends;
617
- step++;
618
- await this.saveCheckpoint(threadId, step, state, pendingNodes, pendingSends, agentId, config?.metadata, stepWrites);
619
741
  }
620
- if (modeValues)
621
- yield tag(this.evt("state_update", state, step, agentId), "values");
622
- // Telemetry: end graph span
623
- graphSpan.setAttribute("oni.steps", step);
624
- this.tracer.endSpan(graphSpan);
742
+ finally {
743
+ // Telemetry: end graph span — always runs, even on error or interrupt
744
+ graphSpan.setAttribute("oni.steps", step);
745
+ this.tracer.endSpan(graphSpan);
746
+ }
625
747
  }
626
748
  // ----------------------------------------------------------------
627
749
  // Public API
@@ -699,9 +821,6 @@ export class ONIPregelRunner {
699
821
  cb = new CircuitBreaker({
700
822
  threshold: nodeDef.circuitBreaker.threshold,
701
823
  resetAfter: nodeDef.circuitBreaker.resetAfter,
702
- fallback: nodeDef.circuitBreaker.fallback
703
- ? () => nodeDef.circuitBreaker.fallback(undefined, undefined)
704
- : undefined,
705
824
  }, nodeDef.name);
706
825
  this.circuitBreakers.set(nodeDef.name, cb);
707
826
  }
@@ -713,16 +832,21 @@ export class ONIPregelRunner {
713
832
  }
714
833
  // ---- Helpers ----
715
834
  async saveCheckpoint(threadId, step, state, nextNodes, pendingSends, agentId, metadata, pendingWrites) {
716
- if (!this.checkpointer)
835
+ const cp = (this._perInvocationCheckpointer.get(threadId) ?? this.checkpointer);
836
+ if (!cp)
717
837
  return;
718
838
  const cpSpan = this.tracer.startCheckpointSpan("put", { threadId });
719
- await this.checkpointer.put({
720
- threadId, step, state, agentId, metadata, pendingWrites,
721
- nextNodes: nextNodes.map(String),
722
- pendingSends: pendingSends,
723
- timestamp: Date.now(),
724
- });
725
- this.tracer.endSpan(cpSpan);
839
+ try {
840
+ await cp.put({
841
+ threadId, step, state, agentId, metadata, pendingWrites,
842
+ nextNodes: nextNodes.map(String),
843
+ pendingSends: pendingSends,
844
+ timestamp: Date.now(),
845
+ });
846
+ }
847
+ finally {
848
+ this.tracer.endSpan(cpSpan);
849
+ }
726
850
  }
727
851
  evt(event, data, step, agentId, node) {
728
852
  return { event, data, step, timestamp: Date.now(), agentId, node };