npm - @oh-my-pi/pi-coding-agent - Versions diffs - 15.13.2 → 15.13.3 - Mend

@oh-my-pi/pi-coding-agent 15.13.2 → 15.13.3

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (50)

package/CHANGELOG.md +22 -0
package/dist/cli.js +147 -122
package/dist/types/config/settings-schema.d.ts +31 -0
package/dist/types/eval/js/context-manager.d.ts +15 -0
package/dist/types/modes/interactive-mode.d.ts +1 -0
package/dist/types/modes/types.d.ts +6 -0
package/dist/types/session/unexpected-stop-classifier.d.ts +13 -0
package/dist/types/stt/asr-client.d.ts +1 -1
package/dist/types/tiny/title-client.d.ts +1 -1
package/dist/types/tools/job.d.ts +1 -0
package/dist/types/tts/tts-client.d.ts +1 -1
package/dist/types/utils/thinking-display.d.ts +1 -17
package/package.json +12 -12
package/src/cli.ts +25 -12
package/src/config/model-registry.ts +6 -2
package/src/config/settings-schema.ts +25 -0
package/src/eval/__tests__/agent-bridge.test.ts +106 -46
package/src/eval/__tests__/js-context-manager.test.ts +12 -2
package/src/eval/js/context-manager.ts +40 -3
package/src/eval/js/worker-entry.ts +7 -0
package/src/export/html/template.js +18 -22
package/src/internal-urls/docs-index.generated.ts +5 -3
package/src/main.ts +15 -5
package/src/modes/acp/acp-agent.ts +2 -2
package/src/modes/acp/acp-event-mapper.ts +2 -2
package/src/modes/components/agent-hub.ts +31 -7
package/src/modes/components/assistant-message.ts +24 -15
package/src/modes/components/snapcompact-shape-preview-doc.md +2 -2
package/src/modes/components/snapcompact-shape-preview.ts +2 -2
package/src/modes/components/tree-selector.ts +3 -2
package/src/modes/controllers/event-controller.ts +3 -3
package/src/modes/controllers/input-controller.ts +7 -1
package/src/modes/controllers/streaming-reveal.ts +4 -4
package/src/modes/interactive-mode.ts +2 -0
package/src/modes/types.ts +6 -0
package/src/modes/utils/ui-helpers.ts +3 -3
package/src/prompts/agents/oracle.md +0 -1
package/src/prompts/agents/reviewer.md +0 -1
package/src/prompts/system/unexpected-stop-classifier.md +17 -0
package/src/prompts/system/unexpected-stop-retry.md +4 -0
package/src/session/agent-session.ts +164 -10
package/src/session/session-dump-format.ts +8 -19
package/src/session/unexpected-stop-classifier.ts +129 -0
package/src/stt/asr-client.ts +1 -1
package/src/tiny/title-client.ts +1 -1
package/src/tools/browser/tab-supervisor.ts +1 -1
package/src/tools/browser/tab-worker-entry.ts +12 -4
package/src/tools/job.ts +1 -0
package/src/tts/tts-client.ts +1 -1
package/src/utils/thinking-display.ts +8 -34

package/dist/types/config/settings-schema.d.ts CHANGED Viewed

@@ -4416,6 +4416,37 @@ export declare const SETTINGS_SCHEMA: {
             })[];
         };
     };
+    readonly "features.unexpectedStopDetection": {
+        readonly type: "boolean";
+        readonly default: false;
+        readonly ui: {
+            readonly tab: "interaction";
+            readonly group: "Agent";
+            readonly label: "Detect unexpected stops";
+            readonly description: "Use a small model to detect when the assistant says it will continue but stops without tool calls; automatically prompt it to continue.";
+        };
+    };
+    readonly "providers.unexpectedStopModel": {
+        readonly type: "enum";
+        readonly values: readonly ["online", "qwen3-1.7b", "gemma-3-1b", "qwen2.5-1.5b", "lfm2-1.2b"];
+        readonly default: "online";
+        readonly ui: {
+            readonly tab: "providers";
+            readonly group: "Tiny Model";
+            readonly label: "Unexpected Stop Model";
+            readonly description: "Classifier for unexpected-stop detection: online smol by default, or a local on-device model.";
+            readonly condition: "unexpectedStopDetection";
+            readonly options: ({
+                value: "online";
+                label: string;
+                description: string;
+            } | {
+                value: "gemma-3-1b" | "lfm2-1.2b" | "qwen2.5-1.5b" | "qwen3-1.7b";
+                label: "Gemma 3 1B" | "LFM2 1.2B" | "Qwen2.5 1.5B" | "Qwen3 1.7B";
+                description: "Best consolidation/dedup; lighter footprint, but leaks small talk during extraction." | "Best extraction granularity (atomic facts); weaker consolidation." | "Fastest load; solid all-rounder, slightly noisier extraction labels." | "Recommended; most disciplined extraction (ignores chit-chat), good consolidation, about 1.1 GB cached.";
+            })[];
+        };
+    };
     readonly "providers.kimiApiFormat": {
         readonly type: "enum";
         readonly values: readonly ["openai", "anthropic"];

package/dist/types/eval/js/context-manager.d.ts CHANGED Viewed

@@ -7,6 +7,12 @@ export interface VmRunState {
     onText?: (chunk: string) => void;
     onDisplay?: (output: JsDisplayOutput) => void;
 }
+/**
+ * Test-only seam: override the graceful-close grace period (ms). Returns the
+ * previous value so callers can restore it. Production always uses
+ * {@link WORKER_CLOSE_TIMEOUT_MS}; never call this outside tests.
+ */
+export declare function setWorkerCloseTimeoutMsForTests(ms: number): number;
 export declare function executeInVmContext(options: {
     sessionKey: string;
     sessionId: string;
@@ -23,3 +29,12 @@ export declare function executeInVmContext(options: {
 }>;
 export declare function resetVmContext(sessionKey: string): Promise<void>;
 export declare function disposeAllVmContexts(): Promise<void>;
+/**
+ * Smoke probe: spawn the JS eval worker through the worker-host entry and prove
+ * it answers the `init` handshake on a real worker thread (not the inline
+ * fallback). Catches the silent worker-load and init-message-drop regressions
+ * that otherwise strand every cell on the init timeout in a distribution build —
+ * the failure mode that motivated `installWorkerInbox`. Wired into
+ * `omp --smoke-test` so binary / source / tarball installs all exercise it.
+ */
+export declare function smokeTestJsEvalWorker(): Promise<void>;

package/dist/types/modes/interactive-mode.d.ts CHANGED Viewed

@@ -186,6 +186,7 @@ export declare class InteractiveMode implements InteractiveModeContext {
         imageLinks?: (string | undefined)[];
         customType?: string;
         display?: boolean;
+        streamingBehavior?: "steer" | "followUp";
     }): SubmittedUserInput;
     cancelPendingSubmission(): boolean;
     markPendingSubmissionStarted(input: SubmittedUserInput): boolean;

package/dist/types/modes/types.d.ts CHANGED Viewed

@@ -46,6 +46,11 @@ export type SubmittedUserInput = {
      *  turn. Used by the `c`/`.` continue shortcut. */
     synthetic?: boolean;
     display?: boolean;
+    /** Queue intent if the session is (or becomes) busy when this submission is
+     *  dispatched: "steer" (interrupt the active turn) or "followUp" (process after
+     *  it). Normal user Enter carries "steer" to match the streaming-branch Enter;
+     *  background/continuation submits omit it and default to "followUp". */
+    streamingBehavior?: "steer" | "followUp";
     cancelled: boolean;
     started: boolean;
 };
@@ -199,6 +204,7 @@ export interface InteractiveModeContext {
         imageLinks?: (string | undefined)[];
         customType?: string;
         display?: boolean;
+        streamingBehavior?: "steer" | "followUp";
     }): SubmittedUserInput;
     cancelPendingSubmission(): boolean;
     markPendingSubmissionStarted(input: SubmittedUserInput): boolean;

package/dist/types/session/unexpected-stop-classifier.d.ts ADDED Viewed

@@ -0,0 +1,13 @@
+import { type AssistantMessage } from "@oh-my-pi/pi-ai";
+import type { ModelRegistry } from "../config/model-registry";
+import type { Settings } from "../config/settings";
+export interface ClassifyUnexpectedStopDeps {
+    settings: Settings;
+    registry: ModelRegistry;
+    sessionId: string;
+    metadataResolver?: (provider: string) => Record<string, unknown> | undefined;
+    signal?: AbortSignal;
+}
+export declare function isUnexpectedStopCandidate(message: AssistantMessage): boolean;
+export declare function classifyUnexpectedStop(text: string, deps: ClassifyUnexpectedStopDeps): Promise<boolean | undefined>;
+export declare function parseUnexpectedStopClassification(text: string): boolean | undefined;

package/dist/types/stt/asr-client.d.ts CHANGED Viewed

@@ -43,7 +43,7 @@ export interface SttStreamOptions {
  * Hidden subcommand on the main CLI that boots the speech-recognition worker in
  * the spawned subprocess. Kept in sync with the dispatch in `cli.ts`.
  */
-export declare const STT_WORKER_ARG = "__omp_stt_worker";
+export declare const STT_WORKER_ARG = "__omp_worker_stt";
 interface SpawnedSubprocess {
     proc: Subprocess<"ignore", "ignore", "ignore">;
     inbound: Set<(message: SttWorkerOutbound) => void>;

package/dist/types/tiny/title-client.d.ts CHANGED Viewed

@@ -32,7 +32,7 @@ export interface TinyTitleGenerateOptions {
  * Hidden subcommand on the main CLI that boots the tiny-model worker in the
  * spawned subprocess. Kept in sync with the dispatch in `cli.ts`.
  */
-export declare const TINY_WORKER_ARG = "--tiny-worker";
+export declare const TINY_WORKER_ARG = "__omp_worker_tiny_inference";
 /**
  * Decide which `PI_TINY_DEVICE` / `PI_TINY_DTYPE` vars to overlay onto the worker
  * env. A present env var wins (left untouched); otherwise the mapped persisted

package/dist/types/tools/job.d.ts CHANGED Viewed

@@ -48,6 +48,7 @@ export declare class JobTool implements AgentTool<typeof jobSchema, JobToolDetai
         list: z.ZodOptional<z.ZodBoolean>;
     }, z.core.$strip>;
     readonly strict = true;
+    readonly interruptible = true;
     readonly loadMode = "discoverable";
     constructor(session: ToolSession);
     execute(_toolCallId: string, params: JobParams, signal?: AbortSignal, onUpdate?: AgentToolUpdateCallback<JobToolDetails>, _context?: AgentToolContext): Promise<AgentToolResult<JobToolDetails>>;

package/dist/types/tts/tts-client.d.ts CHANGED Viewed

@@ -54,7 +54,7 @@ export interface TtsStreamHandle {
  * Hidden subcommand on the main CLI that boots the TTS worker in the spawned
  * subprocess. Kept in sync with the dispatch in `cli.ts` (Main-owned).
  */
-export declare const TTS_WORKER_ARG = "__omp_tts_worker";
+export declare const TTS_WORKER_ARG = "__omp_worker_tts";
 interface SpawnedSubprocess {
     proc: Subprocess<"ignore", "ignore", "ignore">;
     inbound: Set<(message: TtsWorkerOutbound) => void>;

package/dist/types/utils/thinking-display.d.ts CHANGED Viewed

@@ -1,17 +1 @@
-import type { AssistantMessage } from "@oh-my-pi/pi-ai";
-type AssistantContentBlock = AssistantMessage["content"][number];
-type ThinkingBlock = Extract<AssistantContentBlock, {
-    type: "thinking";
-}>;
-/**
- * Returns the operator-visible thinking text for a block.
- *
- * Some OpenAI-compatible reasoning gateways require a non-empty
- * `reasoning_content` field on historical assistant tool-call turns even when
- * the model did not emit any reasoning. The provider adapter uses a single dot
- * as the wire-only placeholder those gateways accept; if that value is later
- * replayed or echoed as a thinking block, it should not render as model thought.
- */
-export declare function getVisibleThinkingText(block: ThinkingBlock): string;
-export declare function hasVisibleThinking(block: ThinkingBlock): boolean;
-export {};
+export declare function canonicalizeMessage(text: string | null | undefined): string;

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
 	"type": "module",
 	"name": "@oh-my-pi/pi-coding-agent",
-	"version": "15.13.2",
+	"version": "15.13.3",
 	"description": "Coding agent CLI with read, bash, edit, write tools and session management",
 	"homepage": "https://omp.sh",
 	"author": "Can Boluk",
@@ -47,17 +47,17 @@
 		"@agentclientprotocol/sdk": "0.25.0",
 		"@babel/parser": "^7.29.7",
 		"@mozilla/readability": "^0.6.0",
-		"@oh-my-pi/hashline": "15.13.2",
-		"@oh-my-pi/omp-stats": "15.13.2",
-		"@oh-my-pi/pi-agent-core": "15.13.2",
-		"@oh-my-pi/pi-ai": "15.13.2",
-		"@oh-my-pi/pi-catalog": "15.13.2",
-		"@oh-my-pi/pi-mnemopi": "15.13.2",
-		"@oh-my-pi/pi-natives": "15.13.2",
-		"@oh-my-pi/pi-tui": "15.13.2",
-		"@oh-my-pi/pi-utils": "15.13.2",
-		"@oh-my-pi/pi-wire": "15.13.2",
-		"@oh-my-pi/snapcompact": "15.13.2",
+		"@oh-my-pi/hashline": "15.13.3",
+		"@oh-my-pi/omp-stats": "15.13.3",
+		"@oh-my-pi/pi-agent-core": "15.13.3",
+		"@oh-my-pi/pi-ai": "15.13.3",
+		"@oh-my-pi/pi-catalog": "15.13.3",
+		"@oh-my-pi/pi-mnemopi": "15.13.3",
+		"@oh-my-pi/pi-natives": "15.13.3",
+		"@oh-my-pi/pi-tui": "15.13.3",
+		"@oh-my-pi/pi-utils": "15.13.3",
+		"@oh-my-pi/pi-wire": "15.13.3",
+		"@oh-my-pi/snapcompact": "15.13.3",
 		"@opentelemetry/api": "^1.9.1",
 		"@opentelemetry/context-async-hooks": "^2.7.1",
 		"@opentelemetry/exporter-trace-otlp-proto": "^0.218.0",

package/src/cli.ts CHANGED Viewed

@@ -14,6 +14,7 @@ try {
  * CLI entry point — registers all commands explicitly and delegates to the
  * lightweight CLI runner from pi-utils.
  */
+import { parentPort } from "node:worker_threads";
 import type { CliConfig } from "@oh-my-pi/pi-utils/cli";
 import {
 	APP_NAME,
@@ -23,7 +24,7 @@ import {
 	setProfile,
 	VERSION,
 } from "@oh-my-pi/pi-utils/dirs";
-import { declareWorkerHostEntry } from "@oh-my-pi/pi-utils/worker-host";
+import { declareWorkerHostEntry, installWorkerInbox } from "@oh-my-pi/pi-utils/worker-host";
 import { installProfileAlias, resolveProfileAliasCommandFromProcess } from "./cli/profile-alias";
 import { extractProfileFlags } from "./cli/profile-bootstrap";
@@ -67,6 +68,7 @@ async function runSmokeTest(): Promise<void> {
 	const { smokeTestTinyTitleWorker } = await import("./tiny/title-client");
 	const { smokeTestSttWorker } = await import("./stt/asr-client");
 	const { smokeTestTtsWorker } = await import("./tts/tts-client");
+	const { smokeTestJsEvalWorker } = await import("./eval/js/context-manager");
 	await smokeTestSyncWorker();
 	const statsServer = await startServer(0);
@@ -83,18 +85,23 @@ async function runSmokeTest(): Promise<void> {
 	await smokeTestTinyTitleWorker();
 	await smokeTestSttWorker();
+	await smokeTestJsEvalWorker();
 	await smokeTestTtsWorker();
 	process.stdout.write("smoke-test: ok\n");
 }
-const TINY_WORKER_ARGS = new Set(["--tiny-worker", "__tiny_worker"]);
-const STATS_SYNC_WORKER_ARG = "__omp_stats_sync_worker";
-const TAB_WORKER_ARG = "__omp_tab_worker";
-const JS_EVAL_WORKER_ARG = "__omp_js_eval_worker";
-const STT_WORKER_ARG = "__omp_stt_worker";
-const TTS_WORKER_ARG = "__omp_tts_worker";
+const TINY_WORKER_ARG = "__omp_worker_tiny_inference";
+const STATS_SYNC_WORKER_ARG = "__omp_worker_stats_sync";
+const TAB_WORKER_ARG = "__omp_worker_tab";
+const JS_EVAL_WORKER_ARG = "__omp_worker_js_eval";
+const STT_WORKER_ARG = "__omp_worker_stt";
+const TTS_WORKER_ARG = "__omp_worker_tts";
 async function runWorkerEntrypoint(arg: string | undefined): Promise<boolean> {
+	if (arg === TINY_WORKER_ARG) {
+		await runTinyWorker();
+		return true;
+	}
 	if (arg === STATS_SYNC_WORKER_ARG) {
 		// The sync worker handles messages via `self.onmessage`, assigned during
 		// this *async* dynamic import. Bun flushes the worker's initial message
@@ -117,11 +124,20 @@ async function runWorkerEntrypoint(arg: string | undefined): Promise<boolean> {
 		}
 		return true;
 	}
+	// Bun flushes messages the parent posted before spawn once this entry's
+	// top-level evaluation completes, delivering them only to listeners present
+	// at that moment. These worker modules are imported dynamically below, so
+	// their own `parentPort.on("message")` lands after the flush and the parent's
+	// synchronous `init` is dropped. Install a buffering inbox synchronously here
+	// (still inside the entry's sync prefix) so the handshake survives; the worker
+	// module binds the real handler once loaded.
 	if (arg === TAB_WORKER_ARG) {
+		if (parentPort) installWorkerInbox(parentPort);
 		await import("./tools/browser/tab-worker-entry");
 		return true;
 	}
 	if (arg === JS_EVAL_WORKER_ARG) {
+		if (parentPort) installWorkerInbox(parentPort);
 		await import("./eval/js/worker-entry");
 		return true;
 	}
@@ -251,11 +267,8 @@ export async function runCli(argv: string[]): Promise<void> {
 	// synchronous prefix of `runWorkerEntrypoint`, and Bun flushes the
 	// worker's parked initial messages as soon as the entry module's
 	// top-level evaluation finishes.
-	if (TINY_WORKER_ARGS.has(resolvedArgv[0] ?? "")) {
-		await runTinyWorker();
-		return;
-	}
-	if (await runWorkerEntrypoint(resolvedArgv[0])) {
+	if (resolvedArgv[0]?.startsWith("__omp_worker_")) {
+		await runWorkerEntrypoint(resolvedArgv[0]);
 		return;
 	}

package/src/config/model-registry.ts CHANGED Viewed

@@ -59,7 +59,7 @@ import {
 	resolveCanonicalVariant,
 	resolveModelReference,
 } from "@oh-my-pi/pi-catalog/identity";
-import { isRecord, logger } from "@oh-my-pi/pi-utils";
+import { isBunTestRuntime, isRecord, logger } from "@oh-my-pi/pi-utils";
 import { parseModelString, resolveProviderModelReference } from "../config/model-resolver";
 import type { AuthStorage, OAuthCredential } from "../session/auth-storage";
 import { type ApiKeyResolverModel, type ApiKeyResolverOptions, createApiKeyResolver } from "./api-key-resolver";
@@ -690,7 +690,11 @@ export class ModelRegistry {
 		modelsPath?: string,
 		options?: { fetch?: FetchImpl },
 	) {
-		this.#fetch = options?.fetch ?? fetch;
+		this.#fetch =
+			options?.fetch ??
+			(isBunTestRuntime()
+				? () => Promise.reject(new Error("network disabled in model-registry runtime test"))
+				: fetch);
 		this.#modelsConfigFile = ModelsConfigFile.relocate(modelsPath);
 		this.#cacheDbPath = modelsPath ? path.join(path.dirname(modelsPath), "models.db") : undefined;
 		// Set up fallback resolver for custom provider API keys

package/src/config/settings-schema.ts CHANGED Viewed

@@ -116,6 +116,7 @@ export const TAB_GROUPS: Record<SettingTab, readonly string[]> = {
 		"Magic Keywords",
 		"Startup & Updates",
 		"Power (macOS)",
+		"Agent",
 	],
 	context: ["General", "Compaction", "Rules (TTSR)", "Experimental"],
 	memory: ["General", "Auto-Learn", "Mnemopi", "Hindsight"],
@@ -3993,6 +3994,30 @@ export const SETTINGS_SCHEMA = {
 			options: AUTO_THINKING_MODEL_OPTIONS,
 		},
 	},
+	"features.unexpectedStopDetection": {
+		type: "boolean",
+		default: false,
+		ui: {
+			tab: "interaction",
+			group: "Agent",
+			label: "Detect unexpected stops",
+			description:
+				"Use a small model to detect when the assistant says it will continue but stops without tool calls; automatically prompt it to continue.",
+		},
+	},
+	"providers.unexpectedStopModel": {
+		type: "enum",
+		values: TINY_MEMORY_MODEL_VALUES,
+		default: ONLINE_MEMORY_MODEL_KEY,
+		ui: {
+			tab: "providers",
+			group: "Tiny Model",
+			label: "Unexpected Stop Model",
+			description: "Classifier for unexpected-stop detection: online smol by default, or a local on-device model.",
+			condition: "unexpectedStopDetection",
+			options: TINY_MEMORY_MODEL_OPTIONS,
+		},
+	},
 	"providers.kimiApiFormat": {
 		type: "enum",

package/src/eval/__tests__/agent-bridge.test.ts CHANGED Viewed

@@ -121,6 +121,34 @@ function makeEvalSession(
 	return { session, sessionFile, sessionId: `${prefix}:${crypto.randomUUID()}` };
 }
+/**
+ * Spy `runSubprocess` so a `parallel()` fan-out overlaps deterministically: every
+ * bridge call parks until the pool saturates at `limit` concurrent calls in flight,
+ * then all proceed. Proves the pool reaches its ceiling without a wall-clock sleep —
+ * the pool itself caps how many run at once, so an unbounded pool would drive
+ * `maxInFlight` past `limit` and fail the bound.
+ */
+function spyConcurrencyBarrier(limit: number): { maxInFlight: () => number } {
+	let inFlight = 0;
+	let max = 0;
+	let saturate: (() => void) | undefined;
+	const saturated = new Promise<void>(resolve => {
+		saturate = resolve;
+	});
+	vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
+		inFlight++;
+		max = Math.max(max, inFlight);
+		if (inFlight >= limit) saturate?.();
+		try {
+			await saturated;
+			return singleResult(options, { output: options.assignment ?? "" });
+		} finally {
+			inFlight--;
+		}
+	});
+	return { maxInFlight: () => max };
+}
 describe("runEvalAgent", () => {
 	afterEach(() => {
 		vi.restoreAllMocks();
@@ -298,8 +326,17 @@ describe("runEvalAgent", () => {
 });
 describe("agent() through eval runtimes", () => {
+	// One shared JS worker backs every agent() JavaScript test below. Spawning a
+	// worker (thread + module-graph import) is fixed infrastructure cost, not
+	// behavior under test; reusing it keeps the suite fast. Each run still threads
+	// its own ToolSession (settings/mock are read live through the bridge per call)
+	// and top-level `const`/`let` are demoted to `var`, so reuse never leaks state
+	// these tests observe. Torn down in afterAll via disposeAllVmContexts().
+	const sharedJsSessionId = "agent-bridge-shared-js";
 	afterEach(() => {
 		vi.restoreAllMocks();
+		vi.useRealTimers();
 	});
 	afterAll(async () => {
@@ -309,7 +346,7 @@ describe("agent() through eval runtimes", () => {
 	it("exposes agent() in JavaScript and parses structured output", async () => {
 		using tempDir = TempDir.createSync("@omp-eval-agent-js-");
-		const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent");
+		const { session, sessionFile } = makeEvalSession(tempDir, "js-agent");
 		mockAgents();
 		vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options =>
 			singleResult(options, {
@@ -319,7 +356,7 @@ describe("agent() through eval runtimes", () => {
 		const result = await executeJs(
 			'const text = await agent("hi"); const data = await agent("json", { schema: { type: "object" } }); return JSON.stringify([text, data]);',
-			{ cwd: tempDir.path(), sessionId, session, sessionFile },
+			{ cwd: tempDir.path(), sessionId: sharedJsSessionId, session, sessionFile },
 		);
 		expect(result.exitCode).toBe(0);
@@ -334,35 +371,24 @@ describe("agent() through eval runtimes", () => {
 			"task.enableLsp": true,
 			"task.maxConcurrency": 2,
 		});
-		const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent-parallel", settings);
+		const { session, sessionFile } = makeEvalSession(tempDir, "js-agent-parallel", settings);
 		mockAgents();
-		let inFlight = 0;
-		let maxInFlight = 0;
-		vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
-			inFlight++;
-			maxInFlight = Math.max(maxInFlight, inFlight);
-			try {
-				await Bun.sleep(options.assignment === "a" ? 30 : 10);
-				return singleResult(options, { output: options.assignment ?? "" });
-			} finally {
-				inFlight--;
-			}
-		});
+		const barrier = spyConcurrencyBarrier(2);
 		const result = await executeJs(
 			'const values = await parallel(["a", "b", "c", "d"].map(name => () => agent(name))); return JSON.stringify(values);',
-			{ cwd: tempDir.path(), sessionId, session, sessionFile },
+			{ cwd: tempDir.path(), sessionId: sharedJsSessionId, session, sessionFile },
 		);
 		expect(result.exitCode).toBe(0);
 		expect(JSON.parse(result.output.trim())).toEqual(["a", "b", "c", "d"]);
-		expect(maxInFlight).toBeGreaterThan(1);
-		expect(maxInFlight).toBeLessThanOrEqual(2);
+		expect(barrier.maxInFlight()).toBeGreaterThan(1);
+		expect(barrier.maxInFlight()).toBeLessThanOrEqual(2);
 	});
 	it("propagates JavaScript parallel() rejections", async () => {
 		using tempDir = TempDir.createSync("@omp-eval-agent-js-reject-");
-		const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent-reject");
+		const { session, sessionFile } = makeEvalSession(tempDir, "js-agent-reject");
 		mockAgents();
 		vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
 			if (options.assignment === "bad") {
@@ -373,7 +399,7 @@ describe("agent() through eval runtimes", () => {
 		const result = await executeJs('await parallel([() => agent("ok"), () => agent("bad")]);', {
 			cwd: tempDir.path(),
-			sessionId,
+			sessionId: sharedJsSessionId,
 			session,
 			sessionFile,
 		});
@@ -416,18 +442,7 @@ describe("agent() through eval runtimes", () => {
 		});
 		const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "py-agent-parallel", settings);
 		mockAgents();
-		let inFlight = 0;
-		let maxInFlight = 0;
-		vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
-			inFlight++;
-			maxInFlight = Math.max(maxInFlight, inFlight);
-			try {
-				await Bun.sleep(options.assignment === "a" ? 30 : 10);
-				return singleResult(options, { output: options.assignment ?? "" });
-			} finally {
-				inFlight--;
-			}
-		});
+		const barrier = spyConcurrencyBarrier(2);
 		const result = await executePython(
 			'import json\nprint(json.dumps(parallel([lambda n=n: agent(n) for n in ["a", "b", "c", "d"]])))',
@@ -440,8 +455,8 @@ describe("agent() through eval runtimes", () => {
 		expect(result.exitCode).toBe(0);
 		expect(JSON.parse(result.output.trim())).toEqual(["a", "b", "c", "d"]);
-		expect(maxInFlight).toBeGreaterThan(1);
-		expect(maxInFlight).toBeLessThanOrEqual(2);
+		expect(barrier.maxInFlight()).toBeGreaterThan(1);
+		expect(barrier.maxInFlight()).toBeLessThanOrEqual(2);
 	});
 	it("interrupting a Python parallel() fan-out settles the kernel cleanly and preserves session state", async () => {
@@ -526,7 +541,7 @@ describe("agent() through eval runtimes", () => {
 	it("streams enriched agent progress through onStatus before the cell finishes", async () => {
 		using tempDir = TempDir.createSync("@omp-eval-agent-progress-");
-		const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent-progress");
+		const { session, sessionFile } = makeEvalSession(tempDir, "js-agent-progress");
 		mockAgents();
 		const makeProgress = (options: ExecutorOptions, overrides: Partial<AgentProgress>): AgentProgress => ({
@@ -580,7 +595,7 @@ describe("agent() through eval runtimes", () => {
 		const events: Array<{ op: string; [key: string]: unknown }> = [];
 		const result = await executeJs('await agent("investigate", { label: "Scout" });', {
 			cwd: tempDir.path(),
-			sessionId,
+			sessionId: sharedJsSessionId,
 			session,
 			sessionFile,
 			onStatus: event => events.push(event),
@@ -622,16 +637,28 @@ describe("agent() through eval runtimes", () => {
 		mockAgents();
 		// runSubprocess runs far past the eval timeout budget and emits NO progress
-		// of its own. The bridge pause must make that delegated time invisible to
-		// the watchdog.
+		// of its own; the bridge pause must make that delegated time invisible to
+		// the watchdog. Fake timers replace the real wait: the subprocess parks on
+		// `released` so the test can advance the clock past the budget while the
+		// bridge call is provably in flight, then release it deterministically.
+		let release: (() => void) | undefined;
+		const released = new Promise<void>(resolve => {
+			release = resolve;
+		});
+		let markInFlight: (() => void) | undefined;
+		const inFlight = new Promise<void>(resolve => {
+			markInFlight = resolve;
+		});
 		vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
-			await Bun.sleep(40);
+			markInFlight?.();
+			await released;
 			return singleResult(options, { output: "done" });
 		});
 		const ops: string[] = [];
+		vi.useFakeTimers();
 		using idle = new IdleTimeout(20);
-		const result = await runEvalAgent(
+		const resultPromise = runEvalAgent(
 			{ prompt: "investigate" },
 			{
 				session,
@@ -644,11 +671,22 @@ describe("agent() through eval runtimes", () => {
 			},
 		);
+		// The bridge paused the watchdog; the subprocess is now blocked in flight.
+		await inFlight;
+		// Burn far more than the 20ms budget while paused: the watchdog stays armed-off.
+		vi.advanceTimersByTime(1_000);
+		expect(idle.signal.aborted).toBe(false);
+		release?.();
+		const result = await resultPromise;
 		expect(result.text).toBe("done");
 		expect(ops).toEqual([EVAL_TIMEOUT_PAUSE_OP, EVAL_TIMEOUT_RESUME_OP]);
 		expect(idle.signal.aborted).toBe(false);
-		await Bun.sleep(60);
+		// RESUME re-armed a fresh window; once the runtime stays idle past it the
+		// watchdog finally fires.
+		vi.advanceTimersByTime(idle.idleMs + 5);
 		expect(idle.signal.aborted).toBe(true);
 	});
@@ -657,9 +695,20 @@ describe("agent() through eval runtimes", () => {
 		const { session } = makeEvalSession(tempDir, "js-agent-progress-timeout-pause");
 		mockAgents();
-		// Stream frequent progress snapshots (op:"agent") for well past the budget.
+		// Stream frequent progress snapshots (op:"agent") well past the budget.
 		// They render as status, but timeout accounting is controlled only by the
-		// bridge pause/resume events.
+		// bridge pause/resume events — so even a flood of snapshots must not re-arm
+		// the watchdog. Fake timers make "past the budget" deterministic: the
+		// subprocess emits its snapshots, parks on `released`, and the test advances
+		// the clock far past the window before releasing it.
+		let release: (() => void) | undefined;
+		const released = new Promise<void>(resolve => {
+			release = resolve;
+		});
+		let markInFlight: (() => void) | undefined;
+		const inFlight = new Promise<void>(resolve => {
+			markInFlight = resolve;
+		});
 		vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
 			for (let i = 0; i < 20; i++) {
 				options.onProgress?.({
@@ -679,15 +728,16 @@ describe("agent() through eval runtimes", () => {
 					cost: 0,
 					durationMs: i * 10,
 				});
-				await Bun.sleep(40);
 			}
+			markInFlight?.();
+			await released;
 			return singleResult(options, { output: "done" });
 		});
 		const ops: string[] = [];
-		// Timing invariant (keep, do not re-tighten): total mock work (20*40ms = 800ms) > idle window (250ms) > scheduling jitter (~tens of ms).
+		vi.useFakeTimers();
 		using idle = new IdleTimeout(250);
-		const result = await runEvalAgent(
+		const resultPromise = runEvalAgent(
 			{ prompt: "investigate" },
 			{
 				session,
@@ -700,6 +750,16 @@ describe("agent() through eval runtimes", () => {
 			},
 		);
+		// All snapshots have streamed and the subprocess is blocked in flight.
+		await inFlight;
+		// Far exceed the 250ms budget while paused: the snapshots already delivered
+		// must not have re-armed the watchdog.
+		vi.advanceTimersByTime(10_000);
+		expect(idle.signal.aborted).toBe(false);
+		release?.();
+		const result = await resultPromise;
 		expect(result.text).toBe("done");
 		expect(ops[0]).toBe(EVAL_TIMEOUT_PAUSE_OP);
 		expect(ops).toContain("agent");