npm - @oni.bot/core - Versions diffs - 1.0.1 → 1.0.3 - Mend

@oni.bot/core 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (214) hide show

package/CHANGELOG.md +146 -126
package/SECURITY.md +2 -2
package/dist/agents/context.d.ts +0 -2
package/dist/agents/context.d.ts.map +1 -1
package/dist/agents/context.js +1 -3
package/dist/agents/context.js.map +1 -1
package/dist/agents/define-agent.d.ts.map +1 -1
package/dist/agents/define-agent.js +99 -41
package/dist/agents/define-agent.js.map +1 -1
package/dist/agents/functional-agent.d.ts.map +1 -1
package/dist/agents/functional-agent.js +0 -7
package/dist/agents/functional-agent.js.map +1 -1
package/dist/agents/types.d.ts +0 -2
package/dist/agents/types.d.ts.map +1 -1
package/dist/checkpoint.d.ts.map +1 -1
package/dist/checkpoint.js +7 -2
package/dist/checkpoint.js.map +1 -1
package/dist/checkpointers/postgres.d.ts.map +1 -1
package/dist/checkpointers/postgres.js +47 -31
package/dist/checkpointers/postgres.js.map +1 -1
package/dist/checkpointers/sqlite.js +4 -4
package/dist/checkpointers/sqlite.js.map +1 -1
package/dist/circuit-breaker.d.ts +1 -0
package/dist/circuit-breaker.d.ts.map +1 -1
package/dist/circuit-breaker.js +13 -0
package/dist/circuit-breaker.js.map +1 -1
package/dist/cli/dev.d.ts.map +1 -1
package/dist/cli/dev.js +0 -1
package/dist/cli/dev.js.map +1 -1
package/dist/cli/router.js +1 -1
package/dist/cli/run.d.ts.map +1 -1
package/dist/cli/run.js +0 -1
package/dist/cli/run.js.map +1 -1
package/dist/config/loader.d.ts +1 -1
package/dist/config/loader.d.ts.map +1 -1
package/dist/config/loader.js +35 -5
package/dist/config/loader.js.map +1 -1
package/dist/context.d.ts +20 -0
package/dist/context.d.ts.map +1 -1
package/dist/context.js.map +1 -1
package/dist/coordination/pubsub.d.ts +1 -0
package/dist/coordination/pubsub.d.ts.map +1 -1
package/dist/coordination/pubsub.js +31 -16
package/dist/coordination/pubsub.js.map +1 -1
package/dist/coordination/request-reply.d.ts +7 -0
package/dist/coordination/request-reply.d.ts.map +1 -1
package/dist/coordination/request-reply.js +65 -11
package/dist/coordination/request-reply.js.map +1 -1
package/dist/events/bus.d.ts +1 -0
package/dist/events/bus.d.ts.map +1 -1
package/dist/events/bus.js +16 -10
package/dist/events/bus.js.map +1 -1
package/dist/functional.d.ts.map +1 -1
package/dist/functional.js +3 -0
package/dist/functional.js.map +1 -1
package/dist/graph.d.ts.map +1 -1
package/dist/graph.js +21 -3
package/dist/graph.js.map +1 -1
package/dist/guardrails/audit.d.ts +4 -1
package/dist/guardrails/audit.d.ts.map +1 -1
package/dist/guardrails/audit.js +18 -1
package/dist/guardrails/audit.js.map +1 -1
package/dist/harness/agent-loop.d.ts.map +1 -1
package/dist/harness/agent-loop.js +471 -352
package/dist/harness/agent-loop.js.map +1 -1
package/dist/harness/context-compactor.d.ts +1 -0
package/dist/harness/context-compactor.d.ts.map +1 -1
package/dist/harness/context-compactor.js +43 -1
package/dist/harness/context-compactor.js.map +1 -1
package/dist/harness/harness.d.ts +6 -0
package/dist/harness/harness.d.ts.map +1 -1
package/dist/harness/harness.js +32 -5
package/dist/harness/harness.js.map +1 -1
package/dist/harness/hooks-engine.d.ts.map +1 -1
package/dist/harness/hooks-engine.js +12 -10
package/dist/harness/hooks-engine.js.map +1 -1
package/dist/harness/index.d.ts +3 -1
package/dist/harness/index.d.ts.map +1 -1
package/dist/harness/index.js +2 -0
package/dist/harness/index.js.map +1 -1
package/dist/harness/memory-loader.d.ts +150 -0
package/dist/harness/memory-loader.d.ts.map +1 -0
package/dist/harness/memory-loader.js +714 -0
package/dist/harness/memory-loader.js.map +1 -0
package/dist/harness/safety-gate.d.ts.map +1 -1
package/dist/harness/safety-gate.js +47 -26
package/dist/harness/safety-gate.js.map +1 -1
package/dist/harness/skill-loader.d.ts +7 -0
package/dist/harness/skill-loader.d.ts.map +1 -1
package/dist/harness/skill-loader.js +24 -8
package/dist/harness/skill-loader.js.map +1 -1
package/dist/harness/todo-module.d.ts.map +1 -1
package/dist/harness/todo-module.js +13 -6
package/dist/harness/todo-module.js.map +1 -1
package/dist/harness/types.d.ts +7 -0
package/dist/harness/types.d.ts.map +1 -1
package/dist/harness/types.js.map +1 -1
package/dist/harness/validate-args.js +18 -3
package/dist/harness/validate-args.js.map +1 -1
package/dist/hitl/interrupt.d.ts +2 -2
package/dist/hitl/interrupt.d.ts.map +1 -1
package/dist/hitl/interrupt.js +6 -4
package/dist/hitl/interrupt.js.map +1 -1
package/dist/hitl/resume.d.ts +10 -0
package/dist/hitl/resume.d.ts.map +1 -1
package/dist/hitl/resume.js +31 -0
package/dist/hitl/resume.js.map +1 -1
package/dist/index.js +1 -1
package/dist/injected.d.ts.map +1 -1
package/dist/injected.js.map +1 -1
package/dist/inspect.d.ts.map +1 -1
package/dist/inspect.js +28 -8
package/dist/inspect.js.map +1 -1
package/dist/lsp/client.d.ts +2 -0
package/dist/lsp/client.d.ts.map +1 -1
package/dist/lsp/client.js +62 -17
package/dist/lsp/client.js.map +1 -1
package/dist/lsp/index.d.ts.map +1 -1
package/dist/lsp/index.js.map +1 -1
package/dist/mcp/client.d.ts +2 -0
package/dist/mcp/client.d.ts.map +1 -1
package/dist/mcp/client.js +45 -14
package/dist/mcp/client.js.map +1 -1
package/dist/mcp/convert.js +1 -1
package/dist/mcp/convert.js.map +1 -1
package/dist/mcp/transport.d.ts +2 -0
package/dist/mcp/transport.d.ts.map +1 -1
package/dist/mcp/transport.js +33 -8
package/dist/mcp/transport.js.map +1 -1
package/dist/messages/index.d.ts.map +1 -1
package/dist/messages/index.js +7 -1
package/dist/messages/index.js.map +1 -1
package/dist/models/anthropic.d.ts.map +1 -1
package/dist/models/anthropic.js +25 -15
package/dist/models/anthropic.js.map +1 -1
package/dist/models/google.d.ts.map +1 -1
package/dist/models/google.js +23 -7
package/dist/models/google.js.map +1 -1
package/dist/models/ollama.d.ts.map +1 -1
package/dist/models/ollama.js +11 -1
package/dist/models/ollama.js.map +1 -1
package/dist/models/openai.d.ts.map +1 -1
package/dist/models/openai.js +15 -3
package/dist/models/openai.js.map +1 -1
package/dist/models/openrouter.d.ts.map +1 -1
package/dist/models/openrouter.js +14 -3
package/dist/models/openrouter.js.map +1 -1
package/dist/prebuilt/react-agent.d.ts.map +1 -1
package/dist/prebuilt/react-agent.js +1 -0
package/dist/prebuilt/react-agent.js.map +1 -1
package/dist/pregel.d.ts +11 -6
package/dist/pregel.d.ts.map +1 -1
package/dist/pregel.js +473 -349
package/dist/pregel.js.map +1 -1
package/dist/retry.d.ts.map +1 -1
package/dist/retry.js +7 -6
package/dist/retry.js.map +1 -1
package/dist/store/index.d.ts +1 -1
package/dist/store/index.d.ts.map +1 -1
package/dist/store/index.js +63 -13
package/dist/store/index.js.map +1 -1
package/dist/stream-events.d.ts.map +1 -1
package/dist/stream-events.js +3 -9
package/dist/stream-events.js.map +1 -1
package/dist/streaming.d.ts +5 -2
package/dist/streaming.d.ts.map +1 -1
package/dist/streaming.js +9 -8
package/dist/streaming.js.map +1 -1
package/dist/swarm/graph.d.ts +16 -2
package/dist/swarm/graph.d.ts.map +1 -1
package/dist/swarm/graph.js +204 -53
package/dist/swarm/graph.js.map +1 -1
package/dist/swarm/index.d.ts +2 -1
package/dist/swarm/index.d.ts.map +1 -1
package/dist/swarm/index.js.map +1 -1
package/dist/swarm/mailbox.d.ts.map +1 -1
package/dist/swarm/mailbox.js +3 -1
package/dist/swarm/mailbox.js.map +1 -1
package/dist/swarm/mermaid.d.ts +2 -1
package/dist/swarm/mermaid.d.ts.map +1 -1
package/dist/swarm/mermaid.js +6 -3
package/dist/swarm/mermaid.js.map +1 -1
package/dist/swarm/pool.d.ts.map +1 -1
package/dist/swarm/pool.js +30 -5
package/dist/swarm/pool.js.map +1 -1
package/dist/swarm/registry.d.ts.map +1 -1
package/dist/swarm/registry.js +7 -0
package/dist/swarm/registry.js.map +1 -1
package/dist/swarm/scaling.d.ts +10 -1
package/dist/swarm/scaling.d.ts.map +1 -1
package/dist/swarm/scaling.js +85 -14
package/dist/swarm/scaling.js.map +1 -1
package/dist/swarm/snapshot.d.ts.map +1 -1
package/dist/swarm/snapshot.js +10 -1
package/dist/swarm/snapshot.js.map +1 -1
package/dist/swarm/supervisor.js +20 -12
package/dist/swarm/supervisor.js.map +1 -1
package/dist/swarm/tracer.d.ts +3 -1
package/dist/swarm/tracer.d.ts.map +1 -1
package/dist/swarm/tracer.js +66 -15
package/dist/swarm/tracer.js.map +1 -1
package/dist/swarm/types.d.ts +1 -6
package/dist/swarm/types.d.ts.map +1 -1
package/dist/testing/index.d.ts +2 -2
package/dist/testing/index.d.ts.map +1 -1
package/dist/testing/index.js.map +1 -1
package/dist/tools/define.d.ts.map +1 -1
package/dist/tools/define.js +1 -0
package/dist/tools/define.js.map +1 -1
package/dist/tools/types.d.ts +2 -0
package/dist/tools/types.d.ts.map +1 -1
package/dist/types.d.ts +3 -1
package/dist/types.d.ts.map +1 -1
package/package.json +7 -1

package/dist/pregel.js CHANGED Viewed

@@ -3,18 +3,21 @@
 // New: interrupt() context management, HITL resume, token streaming
 // ============================================================
 import { START, END, Send, Command, } from "./types.js";
-import { RecursionLimitError, NodeNotFoundError, ONIInterrupt, NodeTimeoutError, ONIError, NodeExecutionError } from "./errors.js";
+import { RecursionLimitError, NodeNotFoundError, ONIInterrupt, NodeTimeoutError, ONIError, NodeExecutionError, CircuitBreakerOpenError } from "./errors.js";
 import { CircuitBreaker } from "./circuit-breaker.js";
 import { withRetry } from "./retry.js";
 import { NamespacedCheckpointer } from "./checkpointers/namespaced.js";
 import { _runWithContext } from "./context.js";
-import { StreamWriterImpl, _installTokenHandler, _clearTokenHandler } from "./streaming.js";
+import { StreamWriterImpl, _withTokenHandler } from "./streaming.js";
 import { NodeInterruptSignal, HITLInterruptException, HITLSessionStore, _installInterruptContext, _clearInterruptContext, } from "./hitl/index.js";
 import { EventBus } from "./events/bus.js";
 import { AuditLog } from "./guardrails/audit.js";
 import { BudgetTracker } from "./guardrails/budget.js";
+import { runFilters } from "./guardrails/filters.js";
 import { ONITracer } from "./telemetry.js";
 const DEFAULT_RECURSION_LIMIT = 25;
+/** Maximum entries in the node result cache — oldest entries evicted FIFO when full. */
+const NODE_CACHE_MAX_SIZE = 256;
 export class ONIPregelRunner {
     nodes;
     edges;
@@ -27,13 +30,17 @@ export class ONIPregelRunner {
     hitlStore = new HITLSessionStore();
     nodeCache = new Map();
     circuitBreakers = new Map();
-    /** Set to true when this runner is being invoked as a subgraph */
-    _isSubgraph = false;
-    /** Accumulated parent updates from Command.PARENT during subgraph execution */
-    _parentUpdates = [];
+    /** Count of concurrent subgraph invocations active on this runner. >0 means running as subgraph. */
+    _subgraphRefCount = 0;
+    /** Per-invocation parent updates from Command.PARENT, keyed by parent threadId. */
+    _perInvocationParentUpdates = new Map();
+    /** Per-invocation checkpointer override for subgraph isolation, keyed by threadId. */
+    _perInvocationCheckpointer = new Map();
     eventBus;
     auditLog;
     budgetTracker;
+    contentFilters;
+    toolPermissions;
     tracer;
     /** Pre-indexed edges by source node — O(1) lookup instead of O(n) filter */
     _edgesBySource;
@@ -51,6 +58,8 @@ export class ONIPregelRunner {
         this.eventBus = new EventBus(listeners);
         this.auditLog = guardrails?.audit ? new AuditLog() : null;
         this.budgetTracker = guardrails?.budget ? new BudgetTracker(guardrails.budget) : null;
+        this.contentFilters = guardrails?.filters ?? [];
+        this.toolPermissions = guardrails?.toolPermissions;
         this.tracer = new ONITracer(tracer ?? null);
         // Pre-index edges by source for O(1) lookups in getNextNodes
         this._edgesBySource = new Map();
@@ -143,6 +152,7 @@ export class ONIPregelRunner {
                 this.nodeCache.delete(cacheKey);
             }
         }
+        const _tid = config?.threadId ?? "unknown";
         const ctx = {
             config: config ?? {},
             store: this.store,
@@ -152,14 +162,41 @@ export class ONIPregelRunner {
             parentUpdates: [],
             step: step ?? 0,
             recursionLimit: recursionLimit ?? DEFAULT_RECURSION_LIMIT,
+            toolPermissions: this.toolPermissions,
+            _recordUsage: (agentName, modelId, usage) => {
+                if (!this.budgetTracker)
+                    return;
+                const entries = this.budgetTracker.record(agentName, modelId, usage);
+                for (const e of entries)
+                    this.auditLog?.record(_tid, e);
+            },
+            _emitEvent: (event) => this.eventBus.emit(event),
+            _auditRecord: (entry) => this.auditLog?.record(_tid, entry),
         };
         return _runWithContext(ctx, async () => {
             _installInterruptContext({
                 nodeName: nodeDef.name,
-                resumeValue: resumeValue,
-                hasResume: hasResume ?? false,
+                resumeValues: hasResume ? [resumeValue] : [],
             });
             try {
+                // Content filter — input direction
+                if (this.contentFilters.length > 0) {
+                    const inputStr = JSON.stringify(state);
+                    const inputCheck = runFilters(this.contentFilters, inputStr, "input");
+                    if (!inputCheck.passed) {
+                        const threadId = config?.threadId ?? "unknown";
+                        this.eventBus.emit({ type: "filter.blocked", filter: inputCheck.blockedBy, agent: nodeDef.name, direction: "input", reason: inputCheck.reason, timestamp: Date.now() });
+                        this.auditLog?.record(threadId, { timestamp: Date.now(), agent: nodeDef.name, action: "filter.blocked", data: { filter: inputCheck.blockedBy, direction: "input", reason: inputCheck.reason } });
+                        throw new Error(`Content blocked by filter "${inputCheck.blockedBy}" on input to node "${nodeDef.name}": ${inputCheck.reason}`);
+                    }
+                    // Apply redaction if content was rewritten by a redacting filter
+                    if (inputCheck.content !== inputStr) {
+                        try {
+                            state = JSON.parse(inputCheck.content);
+                        }
+                        catch { /* leave state unchanged on parse failure */ }
+                    }
+                }
                 const run = () => Promise.resolve(nodeDef.fn(state, config));
                 // Core execute call: retry-aware
                 const executeCall = async () => {
@@ -204,15 +241,44 @@ export class ONIPregelRunner {
                     // Pass through interrupt signals (thrown by interrupt() inside nodes)
                     if (err instanceof NodeInterruptSignal)
                         throw err;
-                    // Pass through structured ONI errors (NodeExecutionError from retry, NodeTimeoutError, etc.)
-                    if (err instanceof ONIError)
-                        throw err;
-                    // Wrap raw errors and non-Error throws in NodeExecutionError
-                    const cause = err instanceof Error ? err : new Error(String(err));
-                    throw new NodeExecutionError(nodeDef.name, cause);
+                    // Circuit breaker open — invoke user fallback with real state + error
+                    if (err instanceof CircuitBreakerOpenError && nodeDef.circuitBreaker?.fallback) {
+                        result = nodeDef.circuitBreaker.fallback(state, err);
+                    }
+                    else {
+                        // Pass through structured ONI errors (NodeExecutionError from retry, NodeTimeoutError, etc.)
+                        if (err instanceof ONIError)
+                            throw err;
+                        // Wrap raw errors and non-Error throws in NodeExecutionError
+                        const cause = err instanceof Error ? err : new Error(String(err));
+                        throw new NodeExecutionError(nodeDef.name, cause);
+                    }
+                }
+                // Content filter — output direction
+                if (this.contentFilters.length > 0 && result != null) {
+                    const outputStr = JSON.stringify(result);
+                    const outputCheck = runFilters(this.contentFilters, outputStr, "output");
+                    if (!outputCheck.passed) {
+                        const threadId = config?.threadId ?? "unknown";
+                        this.eventBus.emit({ type: "filter.blocked", filter: outputCheck.blockedBy, agent: nodeDef.name, direction: "output", reason: outputCheck.reason, timestamp: Date.now() });
+                        this.auditLog?.record(threadId, { timestamp: Date.now(), agent: nodeDef.name, action: "filter.blocked", data: { filter: outputCheck.blockedBy, direction: "output", reason: outputCheck.reason } });
+                        throw new Error(`Content blocked by filter "${outputCheck.blockedBy}" on output of node "${nodeDef.name}": ${outputCheck.reason}`);
+                    }
+                    // Apply redaction if content was rewritten by a redacting filter
+                    if (outputCheck.content !== outputStr) {
+                        try {
+                            result = JSON.parse(outputCheck.content);
+                        }
+                        catch { /* leave result unchanged on parse failure */ }
+                    }
                 }
-                // Store in cache (reuse key computed above)
+                // Store in cache (reuse key computed above); evict oldest entry when full
                 if (nodeDef.cache && cacheKey) {
+                    if (this.nodeCache.size >= NODE_CACHE_MAX_SIZE) {
+                        const oldest = this.nodeCache.keys().next().value;
+                        if (oldest !== undefined)
+                            this.nodeCache.delete(oldest);
+                    }
                     this.nodeCache.set(cacheKey, { result, timestamp: Date.now() });
                 }
                 return result;
@@ -253,20 +319,29 @@ export class ONIPregelRunner {
         const modeMessages = modes.has("messages");
         // Telemetry: graph-level span
         const graphSpan = this.tracer.startGraphSpan("invoke", { threadId, agentId });
-        // Load resume values from config (set by resume() call)
-        const resumeMap = config?.__resumeValues ?? {};
-        // Load or init state
-        let state;
-        let step = 0;
-        let pendingNodes = [];
-        let pendingSends = [];
-        if (this.checkpointer && config?.threadId) {
-            const cp = await this.checkpointer.get(threadId);
-            if (cp) {
-                state = this.applyUpdate(cp.state, input);
-                step = cp.step;
-                pendingNodes = cp.nextNodes;
-                pendingSends = cp.pendingSends ?? [];
+        let step = 0; // declared before try so finally can read it for setAttribute
+        try {
+            // Load resume values from config (set by resume() call)
+            const resumeMap = config?.__resumeValues ?? {};
+            // Load or init state
+            let state;
+            let pendingNodes = [];
+            let pendingSends = [];
+            const effectiveCheckpointer = (this._perInvocationCheckpointer.get(threadId) ?? this.checkpointer);
+            if (effectiveCheckpointer && config?.threadId) {
+                const cp = await effectiveCheckpointer.get(threadId);
+                if (cp) {
+                    state = this.applyUpdate(cp.state, input);
+                    step = cp.step;
+                    pendingNodes = cp.nextNodes;
+                    pendingSends = cp.pendingSends ?? [];
+                }
+                else {
+                    state = this.applyUpdate(this.buildInitialState(), input);
+                    const init = this.getNextNodes(START, state, config);
+                    pendingNodes = init.nodes;
+                    pendingSends = init.sends;
+                }
             }
             else {
                 state = this.applyUpdate(this.buildInitialState(), input);
@@ -274,58 +349,318 @@ export class ONIPregelRunner {
                 pendingNodes = init.nodes;
                 pendingSends = init.sends;
             }
-        }
-        else {
-            state = this.applyUpdate(this.buildInitialState(), input);
-            const init = this.getNextNodes(START, state, config);
-            pendingNodes = init.nodes;
-            pendingSends = init.sends;
-        }
-        if (modeValues)
-            yield tag(this.evt("state_update", state, step, agentId), "values");
-        // ---- Main superstep loop ----
-        while (true) {
-            const nextNodes = [];
-            const nextSends = [];
-            state = this.resetEphemeral(state);
-            // Drain sends — group by target node for parallel fan-out execution
-            const sendGroups = new Map();
-            for (const send of pendingSends) {
-                if (!sendGroups.has(send.node))
-                    sendGroups.set(send.node, []);
-                sendGroups.get(send.node).push(send);
-                if (modeDebug)
-                    yield tag(this.evt("send", send, step, agentId, send.node), "debug");
-            }
-            // Execute fan-out sends (each Send → separate node execution with its own state)
-            if (sendGroups.size > 0) {
-                // Build promises directly — avoids spread+flatMap intermediate arrays
-                const sendPromises = [];
-                for (const [node, sends] of sendGroups) {
-                    const nodeDef = this.nodes.get(node);
+            if (modeValues)
+                yield tag(this.evt("state_update", state, step, agentId), "values");
+            // ---- Main superstep loop ----
+            while (true) {
+                const nextNodes = [];
+                const nextSends = [];
+                state = this.resetEphemeral(state);
+                // Drain sends — group by target node for parallel fan-out execution
+                const sendGroups = new Map();
+                for (const send of pendingSends) {
+                    if (!sendGroups.has(send.node))
+                        sendGroups.set(send.node, []);
+                    sendGroups.get(send.node).push(send);
+                    if (modeDebug)
+                        yield tag(this.evt("send", send, step, agentId, send.node), "debug");
+                }
+                // Recursion limit guard — must fire before any sends execute so that
+                // node side-effects are not applied to a step that will be discarded.
+                if (step >= recursionLimit)
+                    throw new RecursionLimitError(recursionLimit);
+                // Execute fan-out sends (each Send → separate node execution with its own state)
+                if (sendGroups.size > 0) {
+                    // Build promises directly — avoids spread+flatMap intermediate arrays
+                    const sendPromises = [];
+                    for (const [node, sends] of sendGroups) {
+                        const nodeDef = this.nodes.get(node);
+                        if (!nodeDef)
+                            throw new NodeNotFoundError(node);
+                        for (const send of sends) {
+                            sendPromises.push((async () => {
+                                const sendState = this.applyUpdate(state, send.args);
+                                const result = await this.executeNode(nodeDef, sendState, config, undefined, undefined, undefined, step, recursionLimit);
+                                return { name: node, result };
+                            })());
+                        }
+                    }
+                    const sendResults = await Promise.all(sendPromises);
+                    // Reduce all send results through channels
+                    for (const { name, result } of sendResults) {
+                        if (result instanceof Command) {
+                            if (result.update)
+                                state = this.applyUpdate(state, result.update);
+                            const gotos = result.goto
+                                ? (Array.isArray(result.goto) ? result.goto : [result.goto])
+                                : this.getNextNodes(name, state, config).nodes;
+                            nextNodes.push(...gotos);
+                        }
+                        else if (result && typeof result === "object") {
+                            state = this.applyUpdate(state, result);
+                            const { nodes, sends } = this.getNextNodes(name, state, config);
+                            nextNodes.push(...nodes);
+                            nextSends.push(...sends);
+                        }
+                        else {
+                            const { nodes, sends } = this.getNextNodes(name, state, config);
+                            nextNodes.push(...nodes);
+                            nextSends.push(...sends);
+                        }
+                        if (modeUpdates || modeDebug) {
+                            const delta = result instanceof Command ? (result.update ?? {}) : (result ?? {});
+                            if (modeUpdates)
+                                yield tag(this.evt("node_end", delta, step, agentId, name), "updates");
+                            if (modeDebug)
+                                yield tag(this.evt("node_end", delta, step, agentId, name), "debug");
+                        }
+                    }
+                }
+                pendingSends = [];
+                // Filter executable nodes (non-END), excluding nodes already handled by sends
+                const executableNodes = pendingNodes.filter((n) => n !== END && !sendGroups.has(n));
+                if (executableNodes.length === 0 && sendGroups.size === 0)
+                    break;
+                // Emit debug node_start events before parallel execution
+                if (modeDebug) {
+                    for (const nodeName of executableNodes) {
+                        const name = nodeName;
+                        if (!this.nodes.has(name))
+                            throw new NodeNotFoundError(name);
+                        // Static interrupt BEFORE (check before emitting start)
+                        if (this.interruptConfig.interruptBefore?.includes(name))
+                            throw new ONIInterrupt(name, "before", state);
+                        this.checkDynamicInterrupt(name, "before", state, config);
+                        yield tag(this.evt("node_start", {}, step, agentId, name), "debug");
+                    }
+                }
+                // Execute all active nodes in parallel
+                const allCustomEvents = [];
+                const allMessageEvents = [];
+                const allSubgraphEvents = [];
+                const nodeWriters = new Map();
+                // Track the first HITL interrupt across all parallel nodes. We use
+                // allSettled (not Promise.all) so that when one node raises an interrupt,
+                // all other in-flight nodes complete before the interrupt is surfaced.
+                // This prevents orphaned background executions that would apply side
+                // effects without being checkpointed, causing double-application on resume.
+                let pendingInterrupt = null;
+                const allSettledResults = await Promise.allSettled(executableNodes.map(async (nodeName) => {
+                    const name = nodeName;
+                    const nodeDef = this.nodes.get(name);
                     if (!nodeDef)
-                        throw new NodeNotFoundError(node);
-                    for (const send of sends) {
-                        sendPromises.push((async () => {
-                            const sendState = this.applyUpdate(state, send.args);
-                            const result = await this.executeNode(nodeDef, sendState, config, undefined, undefined, undefined, step, recursionLimit);
-                            return { name: node, result };
-                        })());
+                        throw new NodeNotFoundError(name);
+                    // Static interrupt BEFORE (non-debug mode)
+                    if (!modeDebug) {
+                        if (this.interruptConfig.interruptBefore?.includes(name))
+                            throw new ONIInterrupt(name, "before", state);
+                        this.checkDynamicInterrupt(name, "before", state, config);
+                    }
+                    // Create a StreamWriter for this node
+                    const messageId = `msg-${threadId}-${step}-${name}`;
+                    const customEvents = [];
+                    const messageEvents = [];
+                    const writerImpl = new StreamWriterImpl((evt) => customEvents.push(evt), (_token) => { }, (evt) => messageEvents.push(evt), name, step, messageId, agentId);
+                    nodeWriters.set(name, writerImpl);
+                    // Check if this node has a pending resume value
+                    const resumeValue = resumeMap[name];
+                    const hasResume = name in resumeMap;
+                    // Emit agent.start lifecycle event
+                    const nodeStartTime = Date.now();
+                    this.eventBus.emit({ type: "agent.start", agent: name, timestamp: nodeStartTime, step });
+                    // Telemetry: node-level span
+                    const nodeSpan = this.tracer.startNodeSpan(name, { threadId, step, agentId });
+                    let result;
+                    let subParentUpdates = [];
+                    try {
+                        // Scope emitToken to this node's async context via ALS — parallel nodes each
+                        // get their own handler so tokens are never dropped or misrouted.
+                        result = await _withTokenHandler((token) => writerImpl.token(token), async () => {
+                            if (nodeDef.subgraph) {
+                                const childRunner = nodeDef.subgraph._runner;
+                                // Per-invocation key for concurrent-safe state isolation
+                                const invocationKey = threadId;
+                                if (childRunner) {
+                                    childRunner._subgraphRefCount++;
+                                    childRunner._perInvocationParentUpdates.set(invocationKey, []);
+                                }
+                                // Install a namespaced checkpointer per invocation instead of swapping a shared field
+                                if (this.checkpointer && childRunner) {
+                                    childRunner._perInvocationCheckpointer.set(invocationKey, new NamespacedCheckpointer(this.checkpointer, name));
+                                }
+                                // Stream the subgraph — always clean up per-invocation state, even on throw/interrupt
+                                let subFinalState;
+                                try {
+                                    const childStreamMode = ["debug", "values"];
+                                    for await (const evt of nodeDef.subgraph.stream(state, {
+                                        ...config,
+                                        // Pass the parent's effective threadId explicitly so the child's
+                                        // _perInvocationParentUpdates lookup at Command.PARENT time uses
+                                        // the same key that was registered in invocationKey above.
+                                        threadId: invocationKey,
+                                        parentRunId: config?.threadId,
+                                        streamMode: childStreamMode,
+                                    })) {
+                                        // Namespace-prefix the node name
+                                        allSubgraphEvents.push({
+                                            ...evt,
+                                            node: evt.node ? `${name}:${evt.node}` : name,
+                                        });
+                                        // Track the last state_update as the final subgraph state
+                                        if (evt.event === "state_update") {
+                                            subFinalState = evt.data;
+                                        }
+                                    }
+                                    if (childRunner) {
+                                        subParentUpdates = childRunner._perInvocationParentUpdates.get(invocationKey) ?? [];
+                                    }
+                                }
+                                finally {
+                                    // Clean up per-invocation state — decrement ref count, remove Maps entries
+                                    if (childRunner) {
+                                        childRunner._subgraphRefCount--;
+                                        childRunner._perInvocationParentUpdates.delete(invocationKey);
+                                        childRunner._perInvocationCheckpointer.delete(invocationKey);
+                                    }
+                                }
+                                return subFinalState ?? {};
+                            }
+                            else {
+                                return this.executeNode(nodeDef, state, config, resumeValue, hasResume, writerImpl, step, recursionLimit);
+                            }
+                        });
+                    }
+                    catch (err) {
+                        // Catch interrupt() signals thrown from inside nodes
+                        if (err instanceof NodeInterruptSignal) {
+                            const iv = {
+                                value: err.value,
+                                node: name,
+                                resumeId: err.resumeId,
+                                timestamp: Date.now(),
+                            };
+                            const exc = new HITLInterruptException(threadId, iv, state);
+                            // Claim the first-interrupt slot SYNCHRONOUSLY before any await.
+                            // Both concurrent interrupt handlers check this flag before yielding,
+                            // so whichever catch block runs first exclusively owns the checkpoint
+                            // save. Without this guard, the last saveCheckpoint wins and its
+                            // nextNodes diverge from pendingInterrupt's node, corrupting resume().
+                            const isFirstInterrupt = !pendingInterrupt;
+                            if (isFirstInterrupt)
+                                pendingInterrupt = exc;
+                            // Only save checkpoint for the first interrupt — the stored nextNodes
+                            // must match pendingInterrupt's node so resume() restores correctly.
+                            if (isFirstInterrupt) {
+                                await this.saveCheckpoint(threadId, step, state, [name], pendingSends, agentId, config?.metadata);
+                                // Record HITL session if checkpointer exists
+                                if (effectiveCheckpointer) {
+                                    const cp = await effectiveCheckpointer.get(threadId);
+                                    if (cp)
+                                        this.hitlStore.record(threadId, iv, cp);
+                                }
+                            }
+                            throw exc; // marks this node's settled result as rejected
+                        }
+                        // Record to DLQ before re-throwing — use original cause if wrapped
+                        if (this.dlq && err instanceof Error) {
+                            const dlqErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
+                            this.dlq.record(threadId, name, state, dlqErr, nodeDef.retry?.maxAttempts ?? 1);
+                        }
+                        // Telemetry: record error on node span — use original cause if wrapped
+                        if (err instanceof Error) {
+                            const telErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
+                            this.tracer.recordError(nodeSpan, telErr);
+                        }
+                        this.tracer.endSpan(nodeSpan);
+                        // Lifecycle event: emit error for non-interrupt failures
+                        if (err instanceof Error) {
+                            this.eventBus.emit({ type: "error", agent: name, error: err, timestamp: Date.now() });
+                        }
+                        throw err;
+                    }
+                    // Telemetry: end node span
+                    this.tracer.endSpan(nodeSpan);
+                    // Emit agent.end lifecycle event
+                    this.eventBus.emit({ type: "agent.end", agent: name, timestamp: Date.now(), step, duration: Date.now() - nodeStartTime });
+                    // Collect events for yielding after parallel execution
+                    allCustomEvents.push(...customEvents);
+                    allMessageEvents.push(...messageEvents);
+                    return { name, result, subParentUpdates };
+                }));
+                // Extract results now that all nodes have settled.
+                // Re-throw the first non-interrupt error (DLQ/telemetry already handled
+                // inside each node's catch block), then surface any HITL interrupt.
+                const nodeResults = [];
+                for (const settled of allSettledResults) {
+                    if (settled.status === "fulfilled") {
+                        nodeResults.push(settled.value);
+                    }
+                    else if (!(settled.reason instanceof HITLInterruptException)) {
+                        throw settled.reason; // first non-interrupt error
+                    }
+                }
+                if (pendingInterrupt)
+                    throw pendingInterrupt;
+                // Yield buffered subgraph events — filtered by parent's active modes
+                for (const evt of allSubgraphEvents) {
+                    const e = evt;
+                    if (modeDebug) {
+                        yield tag(e, "debug");
+                    }
+                    else if (modeUpdates && (e.event === "node_end")) {
+                        yield tag(e, "updates");
+                    }
+                    else if (modeValues && e.event === "state_update") {
+                        yield tag(e, "values");
+                    }
+                    // Custom and message events from subgraphs are forwarded if those modes are active
+                    if (modeCustom && e.event === "custom") {
+                        yield tag(e, "custom");
+                    }
+                    if (modeMessages && (e.event === "messages" || e.event === "messages/complete")) {
+                        yield tag(e, "messages");
                     }
                 }
-                const sendResults = await Promise.all(sendPromises);
-                // Reduce all send results through channels
-                for (const { name, result } of sendResults) {
+                // Apply results
+                const stepWrites = [];
+                for (const { name, result, subParentUpdates: parentUpdates } of nodeResults) {
                     if (result instanceof Command) {
-                        if (result.update)
-                            state = this.applyUpdate(state, result.update);
-                        const gotos = result.goto
-                            ? (Array.isArray(result.goto) ? result.goto : [result.goto])
-                            : this.getNextNodes(name, state, config).nodes;
-                        nextNodes.push(...gotos);
+                        if (result.graph === Command.PARENT) {
+                            // Push update to parent — do NOT apply locally
+                            if (!this._subgraphRefCount) {
+                                throw new Error("Command.PARENT used but graph is not running as a subgraph");
+                            }
+                            if (result.update) {
+                                const myParentUpdates = this._perInvocationParentUpdates.get(threadId);
+                                if (myParentUpdates)
+                                    myParentUpdates.push(result.update);
+                            }
+                            // Still resolve next nodes normally
+                            const { nodes, sends } = this.getNextNodes(name, state, config);
+                            nextNodes.push(...nodes);
+                            nextSends.push(...sends);
+                        }
+                        else {
+                            if (result.update) {
+                                state = this.applyUpdate(state, result.update);
+                                if (Object.keys(result.update).length > 0) {
+                                    stepWrites.push({ nodeId: name, writes: result.update });
+                                }
+                            }
+                            const gotos = result.goto
+                                ? (Array.isArray(result.goto) ? result.goto : [result.goto])
+                                : this.getNextNodes(name, state, config).nodes;
+                            nextNodes.push(...gotos);
+                            if (result.send)
+                                nextSends.push(...result.send.map((s) => ({ node: s.node, args: s.args })));
+                        }
                     }
                     else if (result && typeof result === "object") {
                         state = this.applyUpdate(state, result);
+                        const writes = result;
+                        if (Object.keys(writes).length > 0) {
+                            stepWrites.push({ nodeId: name, writes });
+                        }
                         const { nodes, sends } = this.getNextNodes(name, state, config);
                         nextNodes.push(...nodes);
                         nextSends.push(...sends);
@@ -335,6 +670,10 @@ export class ONIPregelRunner {
                         nextNodes.push(...nodes);
                         nextSends.push(...sends);
                     }
+                    // Apply parent updates from subgraph Command.PARENT (after normal result)
+                    for (const pu of parentUpdates) {
+                        state = this.applyUpdate(state, pu);
+                    }
                     if (modeUpdates || modeDebug) {
                         const delta = result instanceof Command ? (result.update ?? {}) : (result ?? {});
                         if (modeUpdates)
@@ -342,286 +681,69 @@ export class ONIPregelRunner {
                         if (modeDebug)
                             yield tag(this.evt("node_end", delta, step, agentId, name), "debug");
                     }
-                }
-            }
-            pendingSends = [];
-            // Filter executable nodes (non-END), excluding nodes already handled by sends
-            const executableNodes = pendingNodes.filter((n) => n !== END && !sendGroups.has(n));
-            if (executableNodes.length === 0 && sendGroups.size === 0)
-                break;
-            if (step >= recursionLimit)
-                throw new RecursionLimitError(recursionLimit);
-            // Emit debug node_start events before parallel execution
-            if (modeDebug) {
-                for (const nodeName of executableNodes) {
-                    const name = nodeName;
-                    if (!this.nodes.has(name))
-                        throw new NodeNotFoundError(name);
-                    // Static interrupt BEFORE (check before emitting start)
-                    if (this.interruptConfig.interruptBefore?.includes(name))
-                        throw new ONIInterrupt(name, "before", state);
-                    this.checkDynamicInterrupt(name, "before", state, config);
-                    yield tag(this.evt("node_start", {}, step, agentId, name), "debug");
-                }
-            }
-            // Execute all active nodes in parallel
-            const allCustomEvents = [];
-            const allMessageEvents = [];
-            const allSubgraphEvents = [];
-            const nodeWriters = new Map();
-            const nodeResults = await Promise.all(executableNodes.map(async (nodeName) => {
-                const name = nodeName;
-                const nodeDef = this.nodes.get(name);
-                if (!nodeDef)
-                    throw new NodeNotFoundError(name);
-                // Static interrupt BEFORE (non-debug mode)
-                if (!modeDebug) {
-                    if (this.interruptConfig.interruptBefore?.includes(name))
-                        throw new ONIInterrupt(name, "before", state);
-                    this.checkDynamicInterrupt(name, "before", state, config);
-                }
-                // Create a StreamWriter for this node
-                const messageId = `msg-${threadId}-${step}-${name}`;
-                const customEvents = [];
-                const messageEvents = [];
-                const writerImpl = new StreamWriterImpl((evt) => customEvents.push(evt), (token) => { }, (evt) => messageEvents.push(evt), name, step, messageId, agentId);
-                nodeWriters.set(name, writerImpl);
-                // Install global emitToken handler so emitToken() routes to this node's writer
-                _installTokenHandler((token) => writerImpl.token(token));
-                // Check if this node has a pending resume value
-                const resumeValue = resumeMap[name];
-                const hasResume = name in resumeMap;
-                // Emit agent.start lifecycle event
-                const nodeStartTime = Date.now();
-                this.eventBus.emit({ type: "agent.start", agent: name, timestamp: nodeStartTime, step });
-                // Telemetry: node-level span
-                const nodeSpan = this.tracer.startNodeSpan(name, { threadId, step, agentId });
-                let result;
-                let subParentUpdates = [];
-                try {
-                    if (nodeDef.subgraph) {
-                        // Mark child runner as a subgraph so Command.PARENT works
-                        const childRunner = nodeDef.subgraph._runner;
-                        if (childRunner) {
-                            childRunner._isSubgraph = true;
-                            childRunner._parentUpdates = [];
-                        }
-                        // Namespace the subgraph's checkpointer for isolation
-                        if (this.checkpointer && childRunner) {
-                            childRunner.checkpointer = new NamespacedCheckpointer(this.checkpointer, name);
-                        }
-                        // Stream the subgraph instead of invoke — buffer events for yielding after Promise.all
-                        // Use debug+values so we capture ALL event types AND state_update for final state
-                        let subFinalState;
-                        const childStreamMode = ["debug", "values"];
-                        for await (const evt of nodeDef.subgraph.stream(state, {
-                            ...config,
-                            parentRunId: config?.threadId,
-                            streamMode: childStreamMode,
-                        })) {
-                            // Namespace-prefix the node name
-                            allSubgraphEvents.push({
-                                ...evt,
-                                node: evt.node ? `${name}:${evt.node}` : name,
-                            });
-                            // Track the last state_update as the final subgraph state
-                            if (evt.event === "state_update") {
-                                subFinalState = evt.data;
-                            }
-                        }
-                        // Collect parent updates from child
-                        if (childRunner) {
-                            subParentUpdates = childRunner._parentUpdates;
-                            childRunner._isSubgraph = false;
-                            childRunner._parentUpdates = [];
-                        }
-                        result = subFinalState ?? {};
-                    }
-                    else {
-                        result = await this.executeNode(nodeDef, state, config, resumeValue, hasResume, writerImpl, step, recursionLimit);
+                    // Static interrupt AFTER
+                    if (this.interruptConfig.interruptAfter?.includes(name)) {
+                        await this.saveCheckpoint(threadId, step, state, nextNodes, nextSends, agentId, config?.metadata);
+                        throw new ONIInterrupt(name, "after", state);
                     }
+                    this.checkDynamicInterrupt(name, "after", state, config);
                 }
-                catch (err) {
-                    // Catch interrupt() signals thrown from inside nodes
-                    if (err instanceof NodeInterruptSignal) {
-                        const iv = {
-                            value: err.value,
-                            node: name,
-                            resumeId: err.resumeId,
-                            timestamp: Date.now(),
-                        };
-                        // Save checkpoint before surfacing interrupt
-                        await this.saveCheckpoint(threadId, step, state, [name], pendingSends, agentId, config?.metadata);
-                        // Record HITL session if checkpointer exists
-                        if (this.checkpointer) {
-                            const cp = await this.checkpointer.get(threadId);
-                            if (cp)
-                                this.hitlStore.record(threadId, iv, cp);
-                        }
-                        throw new HITLInterruptException(threadId, iv, state);
-                    }
-                    // Record to DLQ before re-throwing — use original cause if wrapped
-                    if (this.dlq && err instanceof Error) {
-                        const dlqErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
-                        this.dlq.record(threadId, name, state, dlqErr, nodeDef.retry?.maxAttempts ?? 1);
-                    }
-                    // Telemetry: record error on node span — use original cause if wrapped
-                    if (err instanceof Error) {
-                        const telErr = (err instanceof NodeExecutionError && err.cause instanceof Error) ? err.cause : err;
-                        this.tracer.recordError(nodeSpan, telErr);
+                // Yield buffered custom/message events based on stream mode
+                if (modeCustom || modeDebug) {
+                    for (const evt of allCustomEvents) {
+                        if (modeCustom)
+                            yield tag(evt, "custom");
+                        if (modeDebug)
+                            yield tag(evt, "debug");
                     }
-                    this.tracer.endSpan(nodeSpan);
-                    _clearTokenHandler();
-                    throw err;
-                }
-                // Clear emitToken handler after node completes
-                _clearTokenHandler();
-                // Telemetry: end node span
-                this.tracer.endSpan(nodeSpan);
-                // Emit agent.end lifecycle event
-                this.eventBus.emit({ type: "agent.end", agent: name, timestamp: Date.now(), step, duration: Date.now() - nodeStartTime });
-                // Collect events for yielding after parallel execution
-                allCustomEvents.push(...customEvents);
-                allMessageEvents.push(...messageEvents);
-                return { name, result, subParentUpdates };
-            }));
-            // Yield buffered subgraph events — filtered by parent's active modes
-            for (const evt of allSubgraphEvents) {
-                const e = evt;
-                if (modeDebug) {
-                    yield tag(e, "debug");
-                }
-                else if (modeUpdates && (e.event === "node_end")) {
-                    yield tag(e, "updates");
-                }
-                else if (modeValues && e.event === "state_update") {
-                    yield tag(e, "values");
-                }
-                // Custom and message events from subgraphs are forwarded if those modes are active
-                if (modeCustom && e.event === "custom") {
-                    yield tag(e, "custom");
                 }
-                if (modeMessages && (e.event === "messages" || e.event === "messages/complete")) {
-                    yield tag(e, "messages");
-                }
-            }
-            // Apply results
-            const stepWrites = [];
-            for (const { name, result, subParentUpdates: parentUpdates } of nodeResults) {
-                if (result instanceof Command) {
-                    if (result.graph === Command.PARENT) {
-                        // Push update to parent — do NOT apply locally
-                        if (!this._isSubgraph) {
-                            throw new Error("Command.PARENT used but graph is not running as a subgraph");
-                        }
-                        if (result.update)
-                            this._parentUpdates.push(result.update);
-                        // Still resolve next nodes normally
-                        const { nodes, sends } = this.getNextNodes(name, state, config);
-                        nextNodes.push(...nodes);
-                        nextSends.push(...sends);
+                if (modeMessages || modeDebug) {
+                    for (const evt of allMessageEvents) {
+                        if (modeMessages)
+                            yield tag(evt, "messages");
+                        if (modeDebug)
+                            yield tag(evt, "debug");
                     }
-                    else {
-                        if (result.update) {
-                            state = this.applyUpdate(state, result.update);
-                            if (Object.keys(result.update).length > 0) {
-                                stepWrites.push({ nodeId: name, writes: result.update });
-                            }
+                    // Emit messages/complete for each node that produced tokens
+                    for (const [, writer] of nodeWriters) {
+                        const complete = writer._complete();
+                        if (complete) {
+                            if (modeMessages)
+                                yield tag(complete, "messages");
+                            if (modeDebug)
+                                yield tag(complete, "debug");
                         }
-                        const gotos = result.goto
-                            ? (Array.isArray(result.goto) ? result.goto : [result.goto])
-                            : this.getNextNodes(name, state, config).nodes;
-                        nextNodes.push(...gotos);
-                        if (result.send)
-                            nextSends.push(...result.send.map((s) => ({ node: s.node, args: s.args })));
                     }
                 }
-                else if (result && typeof result === "object") {
-                    state = this.applyUpdate(state, result);
-                    const writes = result;
-                    if (Object.keys(writes).length > 0) {
-                        stepWrites.push({ nodeId: name, writes });
-                    }
-                    const { nodes, sends } = this.getNextNodes(name, state, config);
-                    nextNodes.push(...nodes);
-                    nextSends.push(...sends);
+                if (modeValues)
+                    yield tag(this.evt("state_update", state, step, agentId), "values");
+                // Deduplicate nextNodes — avoid Set+spread when no dupes (common case)
+                if (nextNodes.length <= 1) {
+                    pendingNodes = nextNodes;
                 }
                 else {
-                    const { nodes, sends } = this.getNextNodes(name, state, config);
-                    nextNodes.push(...nodes);
-                    nextSends.push(...sends);
-                }
-                // Apply parent updates from subgraph Command.PARENT (after normal result)
-                for (const pu of parentUpdates) {
-                    state = this.applyUpdate(state, pu);
-                }
-                if (modeUpdates || modeDebug) {
-                    const delta = result instanceof Command ? (result.update ?? {}) : (result ?? {});
-                    if (modeUpdates)
-                        yield tag(this.evt("node_end", delta, step, agentId, name), "updates");
-                    if (modeDebug)
-                        yield tag(this.evt("node_end", delta, step, agentId, name), "debug");
-                }
-                // Static interrupt AFTER
-                if (this.interruptConfig.interruptAfter?.includes(name)) {
-                    await this.saveCheckpoint(threadId, step, state, nextNodes, nextSends, agentId, config?.metadata);
-                    throw new ONIInterrupt(name, "after", state);
-                }
-                this.checkDynamicInterrupt(name, "after", state, config);
-            }
-            // Yield buffered custom/message events based on stream mode
-            if (modeCustom || modeDebug) {
-                for (const evt of allCustomEvents) {
-                    if (modeCustom)
-                        yield tag(evt, "custom");
-                    if (modeDebug)
-                        yield tag(evt, "debug");
-                }
-            }
-            if (modeMessages || modeDebug) {
-                for (const evt of allMessageEvents) {
-                    if (modeMessages)
-                        yield tag(evt, "messages");
-                    if (modeDebug)
-                        yield tag(evt, "debug");
-                }
-                // Emit messages/complete for each node that produced tokens
-                for (const [, writer] of nodeWriters) {
-                    const complete = writer._complete();
-                    if (complete) {
-                        if (modeMessages)
-                            yield tag(complete, "messages");
-                        if (modeDebug)
-                            yield tag(complete, "debug");
+                    const seen = new Set();
+                    pendingNodes = [];
+                    for (const n of nextNodes) {
+                        const key = n;
+                        if (!seen.has(key)) {
+                            seen.add(key);
+                            pendingNodes.push(n);
+                        }
                     }
                 }
+                pendingSends = nextSends;
+                step++;
+                await this.saveCheckpoint(threadId, step, state, pendingNodes, pendingSends, agentId, config?.metadata, stepWrites);
             }
             if (modeValues)
                 yield tag(this.evt("state_update", state, step, agentId), "values");
-            // Deduplicate nextNodes — avoid Set+spread when no dupes (common case)
-            if (nextNodes.length <= 1) {
-                pendingNodes = nextNodes;
-            }
-            else {
-                const seen = new Set();
-                pendingNodes = [];
-                for (const n of nextNodes) {
-                    const key = n;
-                    if (!seen.has(key)) {
-                        seen.add(key);
-                        pendingNodes.push(n);
-                    }
-                }
-            }
-            pendingSends = nextSends;
-            step++;
-            await this.saveCheckpoint(threadId, step, state, pendingNodes, pendingSends, agentId, config?.metadata, stepWrites);
         }
-        if (modeValues)
-            yield tag(this.evt("state_update", state, step, agentId), "values");
-        // Telemetry: end graph span
-        graphSpan.setAttribute("oni.steps", step);
-        this.tracer.endSpan(graphSpan);
+        finally {
+            // Telemetry: end graph span — always runs, even on error or interrupt
+            graphSpan.setAttribute("oni.steps", step);
+            this.tracer.endSpan(graphSpan);
+        }
     }
     // ----------------------------------------------------------------
     // Public API
@@ -699,9 +821,6 @@ export class ONIPregelRunner {
             cb = new CircuitBreaker({
                 threshold: nodeDef.circuitBreaker.threshold,
                 resetAfter: nodeDef.circuitBreaker.resetAfter,
-                fallback: nodeDef.circuitBreaker.fallback
-                    ? () => nodeDef.circuitBreaker.fallback(undefined, undefined)
-                    : undefined,
             }, nodeDef.name);
             this.circuitBreakers.set(nodeDef.name, cb);
         }
@@ -713,16 +832,21 @@ export class ONIPregelRunner {
     }
     // ---- Helpers ----
     async saveCheckpoint(threadId, step, state, nextNodes, pendingSends, agentId, metadata, pendingWrites) {
-        if (!this.checkpointer)
+        const cp = (this._perInvocationCheckpointer.get(threadId) ?? this.checkpointer);
+        if (!cp)
             return;
         const cpSpan = this.tracer.startCheckpointSpan("put", { threadId });
-        await this.checkpointer.put({
-            threadId, step, state, agentId, metadata, pendingWrites,
-            nextNodes: nextNodes.map(String),
-            pendingSends: pendingSends,
-            timestamp: Date.now(),
-        });
-        this.tracer.endSpan(cpSpan);
+        try {
+            await cp.put({
+                threadId, step, state, agentId, metadata, pendingWrites,
+                nextNodes: nextNodes.map(String),
+                pendingSends: pendingSends,
+                timestamp: Date.now(),
+            });
+        }
+        finally {
+            this.tracer.endSpan(cpSpan);
+        }
     }
     evt(event, data, step, agentId, node) {
         return { event, data, step, timestamp: Date.now(), agentId, node };