@oh-my-pi/pi-coding-agent 15.13.2 → 15.13.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/CHANGELOG.md +22 -0
  2. package/dist/cli.js +147 -122
  3. package/dist/types/config/settings-schema.d.ts +31 -0
  4. package/dist/types/eval/js/context-manager.d.ts +15 -0
  5. package/dist/types/modes/interactive-mode.d.ts +1 -0
  6. package/dist/types/modes/types.d.ts +6 -0
  7. package/dist/types/session/unexpected-stop-classifier.d.ts +13 -0
  8. package/dist/types/stt/asr-client.d.ts +1 -1
  9. package/dist/types/tiny/title-client.d.ts +1 -1
  10. package/dist/types/tools/job.d.ts +1 -0
  11. package/dist/types/tts/tts-client.d.ts +1 -1
  12. package/dist/types/utils/thinking-display.d.ts +1 -17
  13. package/package.json +12 -12
  14. package/src/cli.ts +25 -12
  15. package/src/config/model-registry.ts +6 -2
  16. package/src/config/settings-schema.ts +25 -0
  17. package/src/eval/__tests__/agent-bridge.test.ts +106 -46
  18. package/src/eval/__tests__/js-context-manager.test.ts +12 -2
  19. package/src/eval/js/context-manager.ts +40 -3
  20. package/src/eval/js/worker-entry.ts +7 -0
  21. package/src/export/html/template.js +18 -22
  22. package/src/internal-urls/docs-index.generated.ts +5 -3
  23. package/src/main.ts +15 -5
  24. package/src/modes/acp/acp-agent.ts +2 -2
  25. package/src/modes/acp/acp-event-mapper.ts +2 -2
  26. package/src/modes/components/agent-hub.ts +31 -7
  27. package/src/modes/components/assistant-message.ts +24 -15
  28. package/src/modes/components/snapcompact-shape-preview-doc.md +2 -2
  29. package/src/modes/components/snapcompact-shape-preview.ts +2 -2
  30. package/src/modes/components/tree-selector.ts +3 -2
  31. package/src/modes/controllers/event-controller.ts +3 -3
  32. package/src/modes/controllers/input-controller.ts +7 -1
  33. package/src/modes/controllers/streaming-reveal.ts +4 -4
  34. package/src/modes/interactive-mode.ts +2 -0
  35. package/src/modes/types.ts +6 -0
  36. package/src/modes/utils/ui-helpers.ts +3 -3
  37. package/src/prompts/agents/oracle.md +0 -1
  38. package/src/prompts/agents/reviewer.md +0 -1
  39. package/src/prompts/system/unexpected-stop-classifier.md +17 -0
  40. package/src/prompts/system/unexpected-stop-retry.md +4 -0
  41. package/src/session/agent-session.ts +164 -10
  42. package/src/session/session-dump-format.ts +8 -19
  43. package/src/session/unexpected-stop-classifier.ts +129 -0
  44. package/src/stt/asr-client.ts +1 -1
  45. package/src/tiny/title-client.ts +1 -1
  46. package/src/tools/browser/tab-supervisor.ts +1 -1
  47. package/src/tools/browser/tab-worker-entry.ts +12 -4
  48. package/src/tools/job.ts +1 -0
  49. package/src/tts/tts-client.ts +1 -1
  50. package/src/utils/thinking-display.ts +8 -34
@@ -4416,6 +4416,37 @@ export declare const SETTINGS_SCHEMA: {
4416
4416
  })[];
4417
4417
  };
4418
4418
  };
4419
+ readonly "features.unexpectedStopDetection": {
4420
+ readonly type: "boolean";
4421
+ readonly default: false;
4422
+ readonly ui: {
4423
+ readonly tab: "interaction";
4424
+ readonly group: "Agent";
4425
+ readonly label: "Detect unexpected stops";
4426
+ readonly description: "Use a small model to detect when the assistant says it will continue but stops without tool calls; automatically prompt it to continue.";
4427
+ };
4428
+ };
4429
+ readonly "providers.unexpectedStopModel": {
4430
+ readonly type: "enum";
4431
+ readonly values: readonly ["online", "qwen3-1.7b", "gemma-3-1b", "qwen2.5-1.5b", "lfm2-1.2b"];
4432
+ readonly default: "online";
4433
+ readonly ui: {
4434
+ readonly tab: "providers";
4435
+ readonly group: "Tiny Model";
4436
+ readonly label: "Unexpected Stop Model";
4437
+ readonly description: "Classifier for unexpected-stop detection: online smol by default, or a local on-device model.";
4438
+ readonly condition: "unexpectedStopDetection";
4439
+ readonly options: ({
4440
+ value: "online";
4441
+ label: string;
4442
+ description: string;
4443
+ } | {
4444
+ value: "gemma-3-1b" | "lfm2-1.2b" | "qwen2.5-1.5b" | "qwen3-1.7b";
4445
+ label: "Gemma 3 1B" | "LFM2 1.2B" | "Qwen2.5 1.5B" | "Qwen3 1.7B";
4446
+ description: "Best consolidation/dedup; lighter footprint, but leaks small talk during extraction." | "Best extraction granularity (atomic facts); weaker consolidation." | "Fastest load; solid all-rounder, slightly noisier extraction labels." | "Recommended; most disciplined extraction (ignores chit-chat), good consolidation, about 1.1 GB cached.";
4447
+ })[];
4448
+ };
4449
+ };
4419
4450
  readonly "providers.kimiApiFormat": {
4420
4451
  readonly type: "enum";
4421
4452
  readonly values: readonly ["openai", "anthropic"];
@@ -7,6 +7,12 @@ export interface VmRunState {
7
7
  onText?: (chunk: string) => void;
8
8
  onDisplay?: (output: JsDisplayOutput) => void;
9
9
  }
10
+ /**
11
+ * Test-only seam: override the graceful-close grace period (ms). Returns the
12
+ * previous value so callers can restore it. Production always uses
13
+ * {@link WORKER_CLOSE_TIMEOUT_MS}; never call this outside tests.
14
+ */
15
+ export declare function setWorkerCloseTimeoutMsForTests(ms: number): number;
10
16
  export declare function executeInVmContext(options: {
11
17
  sessionKey: string;
12
18
  sessionId: string;
@@ -23,3 +29,12 @@ export declare function executeInVmContext(options: {
23
29
  }>;
24
30
  export declare function resetVmContext(sessionKey: string): Promise<void>;
25
31
  export declare function disposeAllVmContexts(): Promise<void>;
32
+ /**
33
+ * Smoke probe: spawn the JS eval worker through the worker-host entry and prove
34
+ * it answers the `init` handshake on a real worker thread (not the inline
35
+ * fallback). Catches the silent worker-load and init-message-drop regressions
36
+ * that otherwise strand every cell on the init timeout in a distribution build —
37
+ * the failure mode that motivated `installWorkerInbox`. Wired into
38
+ * `omp --smoke-test` so binary / source / tarball installs all exercise it.
39
+ */
40
+ export declare function smokeTestJsEvalWorker(): Promise<void>;
@@ -186,6 +186,7 @@ export declare class InteractiveMode implements InteractiveModeContext {
186
186
  imageLinks?: (string | undefined)[];
187
187
  customType?: string;
188
188
  display?: boolean;
189
+ streamingBehavior?: "steer" | "followUp";
189
190
  }): SubmittedUserInput;
190
191
  cancelPendingSubmission(): boolean;
191
192
  markPendingSubmissionStarted(input: SubmittedUserInput): boolean;
@@ -46,6 +46,11 @@ export type SubmittedUserInput = {
46
46
  * turn. Used by the `c`/`.` continue shortcut. */
47
47
  synthetic?: boolean;
48
48
  display?: boolean;
49
+ /** Queue intent if the session is (or becomes) busy when this submission is
50
+ * dispatched: "steer" (interrupt the active turn) or "followUp" (process after
51
+ * it). Normal user Enter carries "steer" to match the streaming-branch Enter;
52
+ * background/continuation submits omit it and default to "followUp". */
53
+ streamingBehavior?: "steer" | "followUp";
49
54
  cancelled: boolean;
50
55
  started: boolean;
51
56
  };
@@ -199,6 +204,7 @@ export interface InteractiveModeContext {
199
204
  imageLinks?: (string | undefined)[];
200
205
  customType?: string;
201
206
  display?: boolean;
207
+ streamingBehavior?: "steer" | "followUp";
202
208
  }): SubmittedUserInput;
203
209
  cancelPendingSubmission(): boolean;
204
210
  markPendingSubmissionStarted(input: SubmittedUserInput): boolean;
@@ -0,0 +1,13 @@
1
+ import { type AssistantMessage } from "@oh-my-pi/pi-ai";
2
+ import type { ModelRegistry } from "../config/model-registry";
3
+ import type { Settings } from "../config/settings";
4
+ export interface ClassifyUnexpectedStopDeps {
5
+ settings: Settings;
6
+ registry: ModelRegistry;
7
+ sessionId: string;
8
+ metadataResolver?: (provider: string) => Record<string, unknown> | undefined;
9
+ signal?: AbortSignal;
10
+ }
11
+ export declare function isUnexpectedStopCandidate(message: AssistantMessage): boolean;
12
+ export declare function classifyUnexpectedStop(text: string, deps: ClassifyUnexpectedStopDeps): Promise<boolean | undefined>;
13
+ export declare function parseUnexpectedStopClassification(text: string): boolean | undefined;
@@ -43,7 +43,7 @@ export interface SttStreamOptions {
43
43
  * Hidden subcommand on the main CLI that boots the speech-recognition worker in
44
44
  * the spawned subprocess. Kept in sync with the dispatch in `cli.ts`.
45
45
  */
46
- export declare const STT_WORKER_ARG = "__omp_stt_worker";
46
+ export declare const STT_WORKER_ARG = "__omp_worker_stt";
47
47
  interface SpawnedSubprocess {
48
48
  proc: Subprocess<"ignore", "ignore", "ignore">;
49
49
  inbound: Set<(message: SttWorkerOutbound) => void>;
@@ -32,7 +32,7 @@ export interface TinyTitleGenerateOptions {
32
32
  * Hidden subcommand on the main CLI that boots the tiny-model worker in the
33
33
  * spawned subprocess. Kept in sync with the dispatch in `cli.ts`.
34
34
  */
35
- export declare const TINY_WORKER_ARG = "--tiny-worker";
35
+ export declare const TINY_WORKER_ARG = "__omp_worker_tiny_inference";
36
36
  /**
37
37
  * Decide which `PI_TINY_DEVICE` / `PI_TINY_DTYPE` vars to overlay onto the worker
38
38
  * env. A present env var wins (left untouched); otherwise the mapped persisted
@@ -48,6 +48,7 @@ export declare class JobTool implements AgentTool<typeof jobSchema, JobToolDetai
48
48
  list: z.ZodOptional<z.ZodBoolean>;
49
49
  }, z.core.$strip>;
50
50
  readonly strict = true;
51
+ readonly interruptible = true;
51
52
  readonly loadMode = "discoverable";
52
53
  constructor(session: ToolSession);
53
54
  execute(_toolCallId: string, params: JobParams, signal?: AbortSignal, onUpdate?: AgentToolUpdateCallback<JobToolDetails>, _context?: AgentToolContext): Promise<AgentToolResult<JobToolDetails>>;
@@ -54,7 +54,7 @@ export interface TtsStreamHandle {
54
54
  * Hidden subcommand on the main CLI that boots the TTS worker in the spawned
55
55
  * subprocess. Kept in sync with the dispatch in `cli.ts` (Main-owned).
56
56
  */
57
- export declare const TTS_WORKER_ARG = "__omp_tts_worker";
57
+ export declare const TTS_WORKER_ARG = "__omp_worker_tts";
58
58
  interface SpawnedSubprocess {
59
59
  proc: Subprocess<"ignore", "ignore", "ignore">;
60
60
  inbound: Set<(message: TtsWorkerOutbound) => void>;
@@ -1,17 +1 @@
1
- import type { AssistantMessage } from "@oh-my-pi/pi-ai";
2
- type AssistantContentBlock = AssistantMessage["content"][number];
3
- type ThinkingBlock = Extract<AssistantContentBlock, {
4
- type: "thinking";
5
- }>;
6
- /**
7
- * Returns the operator-visible thinking text for a block.
8
- *
9
- * Some OpenAI-compatible reasoning gateways require a non-empty
10
- * `reasoning_content` field on historical assistant tool-call turns even when
11
- * the model did not emit any reasoning. The provider adapter uses a single dot
12
- * as the wire-only placeholder those gateways accept; if that value is later
13
- * replayed or echoed as a thinking block, it should not render as model thought.
14
- */
15
- export declare function getVisibleThinkingText(block: ThinkingBlock): string;
16
- export declare function hasVisibleThinking(block: ThinkingBlock): boolean;
17
- export {};
1
+ export declare function canonicalizeMessage(text: string | null | undefined): string;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-coding-agent",
4
- "version": "15.13.2",
4
+ "version": "15.13.3",
5
5
  "description": "Coding agent CLI with read, bash, edit, write tools and session management",
6
6
  "homepage": "https://omp.sh",
7
7
  "author": "Can Boluk",
@@ -47,17 +47,17 @@
47
47
  "@agentclientprotocol/sdk": "0.25.0",
48
48
  "@babel/parser": "^7.29.7",
49
49
  "@mozilla/readability": "^0.6.0",
50
- "@oh-my-pi/hashline": "15.13.2",
51
- "@oh-my-pi/omp-stats": "15.13.2",
52
- "@oh-my-pi/pi-agent-core": "15.13.2",
53
- "@oh-my-pi/pi-ai": "15.13.2",
54
- "@oh-my-pi/pi-catalog": "15.13.2",
55
- "@oh-my-pi/pi-mnemopi": "15.13.2",
56
- "@oh-my-pi/pi-natives": "15.13.2",
57
- "@oh-my-pi/pi-tui": "15.13.2",
58
- "@oh-my-pi/pi-utils": "15.13.2",
59
- "@oh-my-pi/pi-wire": "15.13.2",
60
- "@oh-my-pi/snapcompact": "15.13.2",
50
+ "@oh-my-pi/hashline": "15.13.3",
51
+ "@oh-my-pi/omp-stats": "15.13.3",
52
+ "@oh-my-pi/pi-agent-core": "15.13.3",
53
+ "@oh-my-pi/pi-ai": "15.13.3",
54
+ "@oh-my-pi/pi-catalog": "15.13.3",
55
+ "@oh-my-pi/pi-mnemopi": "15.13.3",
56
+ "@oh-my-pi/pi-natives": "15.13.3",
57
+ "@oh-my-pi/pi-tui": "15.13.3",
58
+ "@oh-my-pi/pi-utils": "15.13.3",
59
+ "@oh-my-pi/pi-wire": "15.13.3",
60
+ "@oh-my-pi/snapcompact": "15.13.3",
61
61
  "@opentelemetry/api": "^1.9.1",
62
62
  "@opentelemetry/context-async-hooks": "^2.7.1",
63
63
  "@opentelemetry/exporter-trace-otlp-proto": "^0.218.0",
package/src/cli.ts CHANGED
@@ -14,6 +14,7 @@ try {
14
14
  * CLI entry point — registers all commands explicitly and delegates to the
15
15
  * lightweight CLI runner from pi-utils.
16
16
  */
17
+ import { parentPort } from "node:worker_threads";
17
18
  import type { CliConfig } from "@oh-my-pi/pi-utils/cli";
18
19
  import {
19
20
  APP_NAME,
@@ -23,7 +24,7 @@ import {
23
24
  setProfile,
24
25
  VERSION,
25
26
  } from "@oh-my-pi/pi-utils/dirs";
26
- import { declareWorkerHostEntry } from "@oh-my-pi/pi-utils/worker-host";
27
+ import { declareWorkerHostEntry, installWorkerInbox } from "@oh-my-pi/pi-utils/worker-host";
27
28
  import { installProfileAlias, resolveProfileAliasCommandFromProcess } from "./cli/profile-alias";
28
29
  import { extractProfileFlags } from "./cli/profile-bootstrap";
29
30
 
@@ -67,6 +68,7 @@ async function runSmokeTest(): Promise<void> {
67
68
  const { smokeTestTinyTitleWorker } = await import("./tiny/title-client");
68
69
  const { smokeTestSttWorker } = await import("./stt/asr-client");
69
70
  const { smokeTestTtsWorker } = await import("./tts/tts-client");
71
+ const { smokeTestJsEvalWorker } = await import("./eval/js/context-manager");
70
72
  await smokeTestSyncWorker();
71
73
 
72
74
  const statsServer = await startServer(0);
@@ -83,18 +85,23 @@ async function runSmokeTest(): Promise<void> {
83
85
 
84
86
  await smokeTestTinyTitleWorker();
85
87
  await smokeTestSttWorker();
88
+ await smokeTestJsEvalWorker();
86
89
  await smokeTestTtsWorker();
87
90
  process.stdout.write("smoke-test: ok\n");
88
91
  }
89
92
 
90
- const TINY_WORKER_ARGS = new Set(["--tiny-worker", "__tiny_worker"]);
91
- const STATS_SYNC_WORKER_ARG = "__omp_stats_sync_worker";
92
- const TAB_WORKER_ARG = "__omp_tab_worker";
93
- const JS_EVAL_WORKER_ARG = "__omp_js_eval_worker";
94
- const STT_WORKER_ARG = "__omp_stt_worker";
95
- const TTS_WORKER_ARG = "__omp_tts_worker";
93
+ const TINY_WORKER_ARG = "__omp_worker_tiny_inference";
94
+ const STATS_SYNC_WORKER_ARG = "__omp_worker_stats_sync";
95
+ const TAB_WORKER_ARG = "__omp_worker_tab";
96
+ const JS_EVAL_WORKER_ARG = "__omp_worker_js_eval";
97
+ const STT_WORKER_ARG = "__omp_worker_stt";
98
+ const TTS_WORKER_ARG = "__omp_worker_tts";
96
99
 
97
100
  async function runWorkerEntrypoint(arg: string | undefined): Promise<boolean> {
101
+ if (arg === TINY_WORKER_ARG) {
102
+ await runTinyWorker();
103
+ return true;
104
+ }
98
105
  if (arg === STATS_SYNC_WORKER_ARG) {
99
106
  // The sync worker handles messages via `self.onmessage`, assigned during
100
107
  // this *async* dynamic import. Bun flushes the worker's initial message
@@ -117,11 +124,20 @@ async function runWorkerEntrypoint(arg: string | undefined): Promise<boolean> {
117
124
  }
118
125
  return true;
119
126
  }
127
+ // Bun flushes messages the parent posted before spawn once this entry's
128
+ // top-level evaluation completes, delivering them only to listeners present
129
+ // at that moment. These worker modules are imported dynamically below, so
130
+ // their own `parentPort.on("message")` lands after the flush and the parent's
131
+ // synchronous `init` is dropped. Install a buffering inbox synchronously here
132
+ // (still inside the entry's sync prefix) so the handshake survives; the worker
133
+ // module binds the real handler once loaded.
120
134
  if (arg === TAB_WORKER_ARG) {
135
+ if (parentPort) installWorkerInbox(parentPort);
121
136
  await import("./tools/browser/tab-worker-entry");
122
137
  return true;
123
138
  }
124
139
  if (arg === JS_EVAL_WORKER_ARG) {
140
+ if (parentPort) installWorkerInbox(parentPort);
125
141
  await import("./eval/js/worker-entry");
126
142
  return true;
127
143
  }
@@ -251,11 +267,8 @@ export async function runCli(argv: string[]): Promise<void> {
251
267
  // synchronous prefix of `runWorkerEntrypoint`, and Bun flushes the
252
268
  // worker's parked initial messages as soon as the entry module's
253
269
  // top-level evaluation finishes.
254
- if (TINY_WORKER_ARGS.has(resolvedArgv[0] ?? "")) {
255
- await runTinyWorker();
256
- return;
257
- }
258
- if (await runWorkerEntrypoint(resolvedArgv[0])) {
270
+ if (resolvedArgv[0]?.startsWith("__omp_worker_")) {
271
+ await runWorkerEntrypoint(resolvedArgv[0]);
259
272
  return;
260
273
  }
261
274
 
@@ -59,7 +59,7 @@ import {
59
59
  resolveCanonicalVariant,
60
60
  resolveModelReference,
61
61
  } from "@oh-my-pi/pi-catalog/identity";
62
- import { isRecord, logger } from "@oh-my-pi/pi-utils";
62
+ import { isBunTestRuntime, isRecord, logger } from "@oh-my-pi/pi-utils";
63
63
  import { parseModelString, resolveProviderModelReference } from "../config/model-resolver";
64
64
  import type { AuthStorage, OAuthCredential } from "../session/auth-storage";
65
65
  import { type ApiKeyResolverModel, type ApiKeyResolverOptions, createApiKeyResolver } from "./api-key-resolver";
@@ -690,7 +690,11 @@ export class ModelRegistry {
690
690
  modelsPath?: string,
691
691
  options?: { fetch?: FetchImpl },
692
692
  ) {
693
- this.#fetch = options?.fetch ?? fetch;
693
+ this.#fetch =
694
+ options?.fetch ??
695
+ (isBunTestRuntime()
696
+ ? () => Promise.reject(new Error("network disabled in model-registry runtime test"))
697
+ : fetch);
694
698
  this.#modelsConfigFile = ModelsConfigFile.relocate(modelsPath);
695
699
  this.#cacheDbPath = modelsPath ? path.join(path.dirname(modelsPath), "models.db") : undefined;
696
700
  // Set up fallback resolver for custom provider API keys
@@ -116,6 +116,7 @@ export const TAB_GROUPS: Record<SettingTab, readonly string[]> = {
116
116
  "Magic Keywords",
117
117
  "Startup & Updates",
118
118
  "Power (macOS)",
119
+ "Agent",
119
120
  ],
120
121
  context: ["General", "Compaction", "Rules (TTSR)", "Experimental"],
121
122
  memory: ["General", "Auto-Learn", "Mnemopi", "Hindsight"],
@@ -3993,6 +3994,30 @@ export const SETTINGS_SCHEMA = {
3993
3994
  options: AUTO_THINKING_MODEL_OPTIONS,
3994
3995
  },
3995
3996
  },
3997
+ "features.unexpectedStopDetection": {
3998
+ type: "boolean",
3999
+ default: false,
4000
+ ui: {
4001
+ tab: "interaction",
4002
+ group: "Agent",
4003
+ label: "Detect unexpected stops",
4004
+ description:
4005
+ "Use a small model to detect when the assistant says it will continue but stops without tool calls; automatically prompt it to continue.",
4006
+ },
4007
+ },
4008
+ "providers.unexpectedStopModel": {
4009
+ type: "enum",
4010
+ values: TINY_MEMORY_MODEL_VALUES,
4011
+ default: ONLINE_MEMORY_MODEL_KEY,
4012
+ ui: {
4013
+ tab: "providers",
4014
+ group: "Tiny Model",
4015
+ label: "Unexpected Stop Model",
4016
+ description: "Classifier for unexpected-stop detection: online smol by default, or a local on-device model.",
4017
+ condition: "unexpectedStopDetection",
4018
+ options: TINY_MEMORY_MODEL_OPTIONS,
4019
+ },
4020
+ },
3996
4021
 
3997
4022
  "providers.kimiApiFormat": {
3998
4023
  type: "enum",
@@ -121,6 +121,34 @@ function makeEvalSession(
121
121
  return { session, sessionFile, sessionId: `${prefix}:${crypto.randomUUID()}` };
122
122
  }
123
123
 
124
+ /**
125
+ * Spy `runSubprocess` so a `parallel()` fan-out overlaps deterministically: every
126
+ * bridge call parks until the pool saturates at `limit` concurrent calls in flight,
127
+ * then all proceed. Proves the pool reaches its ceiling without a wall-clock sleep —
128
+ * the pool itself caps how many run at once, so an unbounded pool would drive
129
+ * `maxInFlight` past `limit` and fail the bound.
130
+ */
131
+ function spyConcurrencyBarrier(limit: number): { maxInFlight: () => number } {
132
+ let inFlight = 0;
133
+ let max = 0;
134
+ let saturate: (() => void) | undefined;
135
+ const saturated = new Promise<void>(resolve => {
136
+ saturate = resolve;
137
+ });
138
+ vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
139
+ inFlight++;
140
+ max = Math.max(max, inFlight);
141
+ if (inFlight >= limit) saturate?.();
142
+ try {
143
+ await saturated;
144
+ return singleResult(options, { output: options.assignment ?? "" });
145
+ } finally {
146
+ inFlight--;
147
+ }
148
+ });
149
+ return { maxInFlight: () => max };
150
+ }
151
+
124
152
  describe("runEvalAgent", () => {
125
153
  afterEach(() => {
126
154
  vi.restoreAllMocks();
@@ -298,8 +326,17 @@ describe("runEvalAgent", () => {
298
326
  });
299
327
 
300
328
  describe("agent() through eval runtimes", () => {
329
+ // One shared JS worker backs every agent() JavaScript test below. Spawning a
330
+ // worker (thread + module-graph import) is fixed infrastructure cost, not
331
+ // behavior under test; reusing it keeps the suite fast. Each run still threads
332
+ // its own ToolSession (settings/mock are read live through the bridge per call)
333
+ // and top-level `const`/`let` are demoted to `var`, so reuse never leaks state
334
+ // these tests observe. Torn down in afterAll via disposeAllVmContexts().
335
+ const sharedJsSessionId = "agent-bridge-shared-js";
336
+
301
337
  afterEach(() => {
302
338
  vi.restoreAllMocks();
339
+ vi.useRealTimers();
303
340
  });
304
341
 
305
342
  afterAll(async () => {
@@ -309,7 +346,7 @@ describe("agent() through eval runtimes", () => {
309
346
 
310
347
  it("exposes agent() in JavaScript and parses structured output", async () => {
311
348
  using tempDir = TempDir.createSync("@omp-eval-agent-js-");
312
- const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent");
349
+ const { session, sessionFile } = makeEvalSession(tempDir, "js-agent");
313
350
  mockAgents();
314
351
  vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options =>
315
352
  singleResult(options, {
@@ -319,7 +356,7 @@ describe("agent() through eval runtimes", () => {
319
356
 
320
357
  const result = await executeJs(
321
358
  'const text = await agent("hi"); const data = await agent("json", { schema: { type: "object" } }); return JSON.stringify([text, data]);',
322
- { cwd: tempDir.path(), sessionId, session, sessionFile },
359
+ { cwd: tempDir.path(), sessionId: sharedJsSessionId, session, sessionFile },
323
360
  );
324
361
 
325
362
  expect(result.exitCode).toBe(0);
@@ -334,35 +371,24 @@ describe("agent() through eval runtimes", () => {
334
371
  "task.enableLsp": true,
335
372
  "task.maxConcurrency": 2,
336
373
  });
337
- const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent-parallel", settings);
374
+ const { session, sessionFile } = makeEvalSession(tempDir, "js-agent-parallel", settings);
338
375
  mockAgents();
339
- let inFlight = 0;
340
- let maxInFlight = 0;
341
- vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
342
- inFlight++;
343
- maxInFlight = Math.max(maxInFlight, inFlight);
344
- try {
345
- await Bun.sleep(options.assignment === "a" ? 30 : 10);
346
- return singleResult(options, { output: options.assignment ?? "" });
347
- } finally {
348
- inFlight--;
349
- }
350
- });
376
+ const barrier = spyConcurrencyBarrier(2);
351
377
 
352
378
  const result = await executeJs(
353
379
  'const values = await parallel(["a", "b", "c", "d"].map(name => () => agent(name))); return JSON.stringify(values);',
354
- { cwd: tempDir.path(), sessionId, session, sessionFile },
380
+ { cwd: tempDir.path(), sessionId: sharedJsSessionId, session, sessionFile },
355
381
  );
356
382
 
357
383
  expect(result.exitCode).toBe(0);
358
384
  expect(JSON.parse(result.output.trim())).toEqual(["a", "b", "c", "d"]);
359
- expect(maxInFlight).toBeGreaterThan(1);
360
- expect(maxInFlight).toBeLessThanOrEqual(2);
385
+ expect(barrier.maxInFlight()).toBeGreaterThan(1);
386
+ expect(barrier.maxInFlight()).toBeLessThanOrEqual(2);
361
387
  });
362
388
 
363
389
  it("propagates JavaScript parallel() rejections", async () => {
364
390
  using tempDir = TempDir.createSync("@omp-eval-agent-js-reject-");
365
- const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent-reject");
391
+ const { session, sessionFile } = makeEvalSession(tempDir, "js-agent-reject");
366
392
  mockAgents();
367
393
  vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
368
394
  if (options.assignment === "bad") {
@@ -373,7 +399,7 @@ describe("agent() through eval runtimes", () => {
373
399
 
374
400
  const result = await executeJs('await parallel([() => agent("ok"), () => agent("bad")]);', {
375
401
  cwd: tempDir.path(),
376
- sessionId,
402
+ sessionId: sharedJsSessionId,
377
403
  session,
378
404
  sessionFile,
379
405
  });
@@ -416,18 +442,7 @@ describe("agent() through eval runtimes", () => {
416
442
  });
417
443
  const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "py-agent-parallel", settings);
418
444
  mockAgents();
419
- let inFlight = 0;
420
- let maxInFlight = 0;
421
- vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
422
- inFlight++;
423
- maxInFlight = Math.max(maxInFlight, inFlight);
424
- try {
425
- await Bun.sleep(options.assignment === "a" ? 30 : 10);
426
- return singleResult(options, { output: options.assignment ?? "" });
427
- } finally {
428
- inFlight--;
429
- }
430
- });
445
+ const barrier = spyConcurrencyBarrier(2);
431
446
 
432
447
  const result = await executePython(
433
448
  'import json\nprint(json.dumps(parallel([lambda n=n: agent(n) for n in ["a", "b", "c", "d"]])))',
@@ -440,8 +455,8 @@ describe("agent() through eval runtimes", () => {
440
455
 
441
456
  expect(result.exitCode).toBe(0);
442
457
  expect(JSON.parse(result.output.trim())).toEqual(["a", "b", "c", "d"]);
443
- expect(maxInFlight).toBeGreaterThan(1);
444
- expect(maxInFlight).toBeLessThanOrEqual(2);
458
+ expect(barrier.maxInFlight()).toBeGreaterThan(1);
459
+ expect(barrier.maxInFlight()).toBeLessThanOrEqual(2);
445
460
  });
446
461
 
447
462
  it("interrupting a Python parallel() fan-out settles the kernel cleanly and preserves session state", async () => {
@@ -526,7 +541,7 @@ describe("agent() through eval runtimes", () => {
526
541
 
527
542
  it("streams enriched agent progress through onStatus before the cell finishes", async () => {
528
543
  using tempDir = TempDir.createSync("@omp-eval-agent-progress-");
529
- const { session, sessionFile, sessionId } = makeEvalSession(tempDir, "js-agent-progress");
544
+ const { session, sessionFile } = makeEvalSession(tempDir, "js-agent-progress");
530
545
  mockAgents();
531
546
 
532
547
  const makeProgress = (options: ExecutorOptions, overrides: Partial<AgentProgress>): AgentProgress => ({
@@ -580,7 +595,7 @@ describe("agent() through eval runtimes", () => {
580
595
  const events: Array<{ op: string; [key: string]: unknown }> = [];
581
596
  const result = await executeJs('await agent("investigate", { label: "Scout" });', {
582
597
  cwd: tempDir.path(),
583
- sessionId,
598
+ sessionId: sharedJsSessionId,
584
599
  session,
585
600
  sessionFile,
586
601
  onStatus: event => events.push(event),
@@ -622,16 +637,28 @@ describe("agent() through eval runtimes", () => {
622
637
  mockAgents();
623
638
 
624
639
  // runSubprocess runs far past the eval timeout budget and emits NO progress
625
- // of its own. The bridge pause must make that delegated time invisible to
626
- // the watchdog.
640
+ // of its own; the bridge pause must make that delegated time invisible to
641
+ // the watchdog. Fake timers replace the real wait: the subprocess parks on
642
+ // `released` so the test can advance the clock past the budget while the
643
+ // bridge call is provably in flight, then release it deterministically.
644
+ let release: (() => void) | undefined;
645
+ const released = new Promise<void>(resolve => {
646
+ release = resolve;
647
+ });
648
+ let markInFlight: (() => void) | undefined;
649
+ const inFlight = new Promise<void>(resolve => {
650
+ markInFlight = resolve;
651
+ });
627
652
  vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
628
- await Bun.sleep(40);
653
+ markInFlight?.();
654
+ await released;
629
655
  return singleResult(options, { output: "done" });
630
656
  });
631
657
 
632
658
  const ops: string[] = [];
659
+ vi.useFakeTimers();
633
660
  using idle = new IdleTimeout(20);
634
- const result = await runEvalAgent(
661
+ const resultPromise = runEvalAgent(
635
662
  { prompt: "investigate" },
636
663
  {
637
664
  session,
@@ -644,11 +671,22 @@ describe("agent() through eval runtimes", () => {
644
671
  },
645
672
  );
646
673
 
674
+ // The bridge paused the watchdog; the subprocess is now blocked in flight.
675
+ await inFlight;
676
+ // Burn far more than the 20ms budget while paused: the watchdog stays armed-off.
677
+ vi.advanceTimersByTime(1_000);
678
+ expect(idle.signal.aborted).toBe(false);
679
+
680
+ release?.();
681
+ const result = await resultPromise;
682
+
647
683
  expect(result.text).toBe("done");
648
684
  expect(ops).toEqual([EVAL_TIMEOUT_PAUSE_OP, EVAL_TIMEOUT_RESUME_OP]);
649
685
  expect(idle.signal.aborted).toBe(false);
650
686
 
651
- await Bun.sleep(60);
687
+ // RESUME re-armed a fresh window; once the runtime stays idle past it the
688
+ // watchdog finally fires.
689
+ vi.advanceTimersByTime(idle.idleMs + 5);
652
690
  expect(idle.signal.aborted).toBe(true);
653
691
  });
654
692
 
@@ -657,9 +695,20 @@ describe("agent() through eval runtimes", () => {
657
695
  const { session } = makeEvalSession(tempDir, "js-agent-progress-timeout-pause");
658
696
  mockAgents();
659
697
 
660
- // Stream frequent progress snapshots (op:"agent") for well past the budget.
698
+ // Stream frequent progress snapshots (op:"agent") well past the budget.
661
699
  // They render as status, but timeout accounting is controlled only by the
662
- // bridge pause/resume events.
700
+ // bridge pause/resume events — so even a flood of snapshots must not re-arm
701
+ // the watchdog. Fake timers make "past the budget" deterministic: the
702
+ // subprocess emits its snapshots, parks on `released`, and the test advances
703
+ // the clock far past the window before releasing it.
704
+ let release: (() => void) | undefined;
705
+ const released = new Promise<void>(resolve => {
706
+ release = resolve;
707
+ });
708
+ let markInFlight: (() => void) | undefined;
709
+ const inFlight = new Promise<void>(resolve => {
710
+ markInFlight = resolve;
711
+ });
663
712
  vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => {
664
713
  for (let i = 0; i < 20; i++) {
665
714
  options.onProgress?.({
@@ -679,15 +728,16 @@ describe("agent() through eval runtimes", () => {
679
728
  cost: 0,
680
729
  durationMs: i * 10,
681
730
  });
682
- await Bun.sleep(40);
683
731
  }
732
+ markInFlight?.();
733
+ await released;
684
734
  return singleResult(options, { output: "done" });
685
735
  });
686
736
 
687
737
  const ops: string[] = [];
688
- // Timing invariant (keep, do not re-tighten): total mock work (20*40ms = 800ms) > idle window (250ms) > scheduling jitter (~tens of ms).
738
+ vi.useFakeTimers();
689
739
  using idle = new IdleTimeout(250);
690
- const result = await runEvalAgent(
740
+ const resultPromise = runEvalAgent(
691
741
  { prompt: "investigate" },
692
742
  {
693
743
  session,
@@ -700,6 +750,16 @@ describe("agent() through eval runtimes", () => {
700
750
  },
701
751
  );
702
752
 
753
+ // All snapshots have streamed and the subprocess is blocked in flight.
754
+ await inFlight;
755
+ // Far exceed the 250ms budget while paused: the snapshots already delivered
756
+ // must not have re-armed the watchdog.
757
+ vi.advanceTimersByTime(10_000);
758
+ expect(idle.signal.aborted).toBe(false);
759
+
760
+ release?.();
761
+ const result = await resultPromise;
762
+
703
763
  expect(result.text).toBe("done");
704
764
  expect(ops[0]).toBe(EVAL_TIMEOUT_PAUSE_OP);
705
765
  expect(ops).toContain("agent");