@oh-my-pi/pi-coding-agent 15.10.3 → 15.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/dist/types/eval/__tests__/js-context-manager.test.d.ts +1 -0
- package/dist/types/eval/bridge-timeout.d.ts +1 -1
- package/dist/types/eval/{llm-bridge.d.ts → completion-bridge.d.ts} +8 -8
- package/dist/types/eval/idle-timeout.d.ts +1 -1
- package/package.json +9 -9
- package/src/eval/__tests__/agent-bridge.test.ts +13 -0
- package/src/eval/__tests__/{llm-bridge.test.ts → completion-bridge.test.ts} +60 -54
- package/src/eval/__tests__/js-context-manager.test.ts +241 -0
- package/src/eval/agent-bridge.ts +6 -1
- package/src/eval/bridge-timeout.ts +1 -1
- package/src/eval/{llm-bridge.ts → completion-bridge.ts} +30 -27
- package/src/eval/idle-timeout.ts +1 -1
- package/src/eval/js/context-manager.ts +66 -6
- package/src/eval/js/shared/prelude.txt +4 -4
- package/src/eval/js/tool-bridge.ts +3 -3
- package/src/eval/js/worker-entry.ts +6 -0
- package/src/eval/py/prelude.py +3 -3
- package/src/internal-urls/docs-index.generated.ts +4 -3
- package/src/modes/components/tips.txt +1 -1
- package/src/prompts/system/tiny-title-system.md +1 -1
- package/src/prompts/system/title-system.md +16 -3
- package/src/prompts/system/workflow-notice.md +1 -1
- package/src/prompts/tools/eval.md +3 -3
- package/src/tools/eval-render.ts +2 -2
- package/src/tools/eval.ts +1 -1
- package/src/utils/title-generator.ts +2 -2
- /package/dist/types/eval/__tests__/{llm-bridge.test.d.ts → completion-bridge.test.d.ts} +0 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import { afterEach, describe, expect, it } from "bun:test";
|
|
2
|
+
import { TempDir } from "@oh-my-pi/pi-utils";
|
|
3
|
+
import { Settings } from "../../config/settings";
|
|
4
|
+
import type { ToolSession } from "../../tools";
|
|
5
|
+
import { disposeAllVmContexts } from "../js/context-manager";
|
|
6
|
+
import { executeJs } from "../js/executor";
|
|
7
|
+
|
|
8
|
+
// Snapshot of the real global Worker constructor, taken at module load before any test
// installs a fake; used to spawn a genuine worker and to restore the global after each test.
const originalWorker = globalThis.Worker;
|
|
9
|
+
|
|
10
|
+
/** Counters the fake worker bumps so a test can assert how a session was shut down. */
interface FakeWorkerStats {
	/** Number of times `terminate()` was invoked on a fake worker. */
	terminateCalls: number;
	/** Number of `{ type: "close" }` messages posted to a fake worker. */
	closeRequests: number;
}
|
|
14
|
+
|
|
15
|
+
/** Switches that script how the fake worker reacts to messages posted by the host. */
interface FakeWorkerBehavior {
	/** When true, a `run` message carrying a `runId` is answered with a successful `result`. */
	settleRuns: boolean;
	/** When true, the fake emits its `close` event right after acknowledging a `close` request. */
	exitOnClose: boolean;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Build the stub `ToolSession` handed to `executeJs` by these tests.
 *
 * Settings come from `Settings.isolated(...)` with async execution disabled,
 * task isolation set to "none" and task LSP enabled; every accessor returns a
 * fixed value.
 *
 * @param cwd Working directory reported by the session.
 * @returns A session object with canned identifiers and model strings.
 */
function makeSession(cwd: string): ToolSession {
	const settings = Settings.isolated({
		"async.enabled": false,
		"task.isolation.mode": "none",
		"task.enableLsp": true,
	});

	const session: ToolSession = {
		cwd,
		hasUI: false,
		settings,
		taskDepth: 0,
		enableLsp: true,
		getSessionFile: () => null,
		getSessionSpawns: () => "*",
		getActiveModelString: () => "p/active",
		getModelString: () => "p/fallback",
		getArtifactsDir: () => null,
		getSessionId: () => "test-session",
		getEvalSessionId: () => "test-eval-session",
	};
	return session;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Await `promise`, but give up after `ms` milliseconds.
 *
 * @param promise Work to wait for; its fulfilment value or rejection is passed through.
 * @param ms Deadline in milliseconds.
 * @param label Prefix of the timeout error message (`"<label> timed out"`).
 * @returns The value `promise` fulfils with.
 * @throws Error with message `"<label> timed out"` when the deadline elapses first.
 */
async function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
	let timer: NodeJS.Timeout | undefined;
	// The executor runs synchronously, so `timer` is set before the race starts.
	const deadline = new Promise<never>((_resolve, reject) => {
		timer = setTimeout(() => {
			reject(new Error(`${label} timed out`));
		}, ms);
	});

	try {
		return await Promise.race([promise, deadline]);
	} finally {
		// Whichever side won, drop the pending timer so it cannot fire later.
		if (timer) clearTimeout(timer);
	}
}
|
|
54
|
+
|
|
55
|
+
/**
 * Spawn a genuine worker from `../js/worker-entry.ts`, leave a ref'ed interval
 * running inside it, then request a graceful close and wait until the worker
 * has both acknowledged it (`closed` message) and actually exited (`close`
 * event).
 *
 * Every phase is bounded to one second through `withTimeout`, and the worker
 * is always terminated on the way out so a failed phase cannot leak it.
 *
 * @param cwd Working directory placed in the snapshot sent with the run.
 * @throws Error when a phase times out, or when the worker reports the run as failed.
 */
async function waitForRealWorkerExitAfterClose(cwd: string): Promise<void> {
	const worker = new originalWorker(new URL("../js/worker-entry.ts", import.meta.url).href, { type: "module" });
	const ready = Promise.withResolvers<void>();
	const runComplete = Promise.withResolvers<void>();
	const closedAck = Promise.withResolvers<void>();
	const workerClosed = Promise.withResolvers<void>();
	const runId = `keep-alive:${crypto.randomUUID()}`;
	const snapshot = { cwd, sessionId: `worker-exit:${crypto.randomUUID()}` };

	worker.addEventListener("message", event => {
		const msg = event.data as { type?: string; runId?: string; ok?: boolean; error?: unknown };
		if (msg.type === "ready") {
			ready.resolve();
		} else if (msg.type === "result" && msg.runId === runId) {
			// Surface a failed run immediately with the worker's own error payload
			// instead of letting it degrade into an uninformative "worker run timed out".
			if (msg.ok) runComplete.resolve();
			else runComplete.reject(new Error(`worker run failed: ${JSON.stringify(msg.error ?? null)}`));
		} else if (msg.type === "closed") {
			closedAck.resolve();
		}
	});
	worker.addEventListener("close", () => workerClosed.resolve());

	try {
		await withTimeout(ready.promise, 1_000, "worker ready");
		worker.postMessage({
			type: "run",
			runId,
			// A ref'ed interval that is never cleared: the handle under test.
			code: "globalThis.__keepAlive = setInterval(() => {}, 1000);\nundefined;",
			filename: "keep-alive.js",
			snapshot,
		});
		await withTimeout(runComplete.promise, 1_000, "worker run");
		worker.postMessage({ type: "close" });
		await withTimeout(closedAck.promise, 1_000, "worker closed ack");
		await withTimeout(workerClosed.promise, 1_000, "worker close event");
	} finally {
		worker.terminate();
	}
}
|
|
89
|
+
|
|
90
|
+
/**
 * Replace `globalThis.Worker` with an in-process fake that scripts the
 * worker side of the message protocol.
 *
 * The fake:
 * - queues a `ready` message once, when the first `message` listener is added;
 * - answers `run` messages that carry a `runId` with `{ type: "result", ok: true }`
 *   when `behavior.settleRuns` is set, and otherwise never answers them;
 * - counts `close` requests, acknowledges them with `closed`, and emits the
 *   `close` event afterwards only when `behavior.exitOnClose` is set;
 * - counts `terminate()` calls and emits the `close` event (at most once overall).
 *
 * All replies are delivered from microtasks, never synchronously.
 *
 * @param stats Counters mutated by the fake.
 * @param behavior Switches controlling the fake's replies.
 */
function installFakeWorker(stats: FakeWorkerStats, behavior: FakeWorkerBehavior): void {
	type MessageListener = (event: MessageEvent) => void;
	type CloseListener = (event: Event) => void;

	class FakeWorker {
		#onMessage = new Set<MessageListener>();
		#onClose = new Set<CloseListener>();
		#readyAnnounced = false;
		#hasExited = false;

		postMessage(message: unknown): void {
			if (typeof message !== "object" || message === null) return;
			const request = message as { type?: string; runId?: string };

			if (request.type === "close") {
				stats.closeRequests += 1;
				queueMicrotask(() => {
					this.#dispatchMessage({ type: "closed" });
					if (behavior.exitOnClose) this.#dispatchClose();
				});
				return;
			}

			const shouldSettle = request.type === "run" && Boolean(request.runId) && behavior.settleRuns;
			if (shouldSettle) {
				queueMicrotask(() => this.#dispatchMessage({ type: "result", runId: request.runId, ok: true }));
			}
		}

		addEventListener(type: string, listener: (event: MessageEvent | Event) => void): void {
			switch (type) {
				case "close":
					this.#onClose.add(listener as CloseListener);
					break;
				case "message":
					this.#onMessage.add(listener as MessageListener);
					// The first message subscriber triggers the one-time `ready` handshake.
					if (this.#readyAnnounced) break;
					this.#readyAnnounced = true;
					queueMicrotask(() => this.#dispatchMessage({ type: "ready" }));
					break;
			}
		}

		removeEventListener(type: string, listener: (event: MessageEvent | Event) => void): void {
			switch (type) {
				case "close":
					this.#onClose.delete(listener as CloseListener);
					break;
				case "message":
					this.#onMessage.delete(listener as MessageListener);
					break;
			}
		}

		terminate(): void {
			stats.terminateCalls += 1;
			this.#dispatchClose();
		}

		#dispatchMessage(data: unknown): void {
			const event = new MessageEvent("message", { data });
			this.#onMessage.forEach(listener => listener(event));
		}

		#dispatchClose(): void {
			// The close event is emitted at most once per fake worker.
			if (this.#hasExited) return;
			this.#hasExited = true;
			const event = new Event("close");
			this.#onClose.forEach(listener => listener(event));
		}
	}

	Object.defineProperty(globalThis, "Worker", {
		value: FakeWorker as unknown as typeof Worker,
		writable: true,
		configurable: true,
	});
}
|
|
159
|
+
|
|
160
|
+
/**
 * Lifecycle tests for the JS eval worker: graceful close on reset, fallback to
 * terminate when the worker does not exit, and hard termination on abort.
 */
describe("JavaScript eval worker lifecycle", () => {
	afterEach(async () => {
		try {
			await disposeAllVmContexts();
		} finally {
			// Put the real Worker back even when disposal rejects; otherwise a fake
			// installed by one test would stay in place for every later test.
			Object.defineProperty(globalThis, "Worker", {
				configurable: true,
				writable: true,
				value: originalWorker,
			});
		}
	});

	it("exits a real worker on graceful close even with ref'ed user handles", async () => {
		using tempDir = TempDir.createSync("@omp-js-worker-real-close-");

		await waitForRealWorkerExitAfterClose(tempDir.path());
	});

	it("waits for the worker to close on reset instead of force-terminating it", async () => {
		using tempDir = TempDir.createSync("@omp-js-worker-close-");
		const stats: FakeWorkerStats = { closeRequests: 0, terminateCalls: 0 };
		// The fake acknowledges the close request and then emits its `close` event.
		installFakeWorker(stats, { exitOnClose: true, settleRuns: true });

		const session = makeSession(tempDir.path());
		const sessionId = `js-close:${crypto.randomUUID()}`;

		const first = await executeJs("globalThis.marker = 1;", { cwd: tempDir.path(), sessionId, session });
		expect(first.exitCode).toBe(0);

		const second = await executeJs("globalThis.marker = 2;", {
			cwd: tempDir.path(),
			sessionId,
			session,
			reset: true,
		});
		expect(second.exitCode).toBe(0);
		expect(stats.closeRequests).toBe(1);
		expect(stats.terminateCalls).toBe(0);
	});

	it("terminates when close is acknowledged but the worker does not exit", async () => {
		using tempDir = TempDir.createSync("@omp-js-worker-close-hung-");
		const stats: FakeWorkerStats = { closeRequests: 0, terminateCalls: 0 };
		// The fake sends the `closed` ack but never emits its `close` event.
		installFakeWorker(stats, { exitOnClose: false, settleRuns: true });

		const session = makeSession(tempDir.path());
		const sessionId = `js-close-hung:${crypto.randomUUID()}`;

		const first = await executeJs("globalThis.marker = 1;", { cwd: tempDir.path(), sessionId, session });
		expect(first.exitCode).toBe(0);

		const second = await executeJs("globalThis.marker = 2;", {
			cwd: tempDir.path(),
			sessionId,
			session,
			reset: true,
		});
		expect(second.exitCode).toBe(0);
		expect(stats.closeRequests).toBe(1);
		expect(stats.terminateCalls).toBe(1);
	});

	it("force-terminates instead of closing when an in-flight run is aborted", async () => {
		using tempDir = TempDir.createSync("@omp-js-worker-abort-");
		const stats: FakeWorkerStats = { closeRequests: 0, terminateCalls: 0 };
		// With settleRuns off the fake never answers the run, so it stays in flight.
		installFakeWorker(stats, { exitOnClose: true, settleRuns: false });

		const session = makeSession(tempDir.path());
		const sessionId = `js-abort:${crypto.randomUUID()}`;
		const controller = new AbortController();
		const resultPromise = executeJs("globalThis.neverFinishes = true;", {
			cwd: tempDir.path(),
			sessionId,
			session,
			signal: controller.signal,
		});
		// Abort from a later macrotask, after the run has been started.
		setTimeout(() => controller.abort(new DOMException("Execution aborted", "AbortError")), 0);

		const result = await resultPromise;
		expect(result.cancelled).toBe(true);
		expect(stats.closeRequests).toBe(0);
		expect(stats.terminateCalls).toBe(1);
	});
});
|
package/src/eval/agent-bridge.ts
CHANGED
|
@@ -272,7 +272,12 @@ export async function runEvalAgent(args: unknown, options: EvalAgentBridgeOption
|
|
|
272
272
|
persistArtifacts: Boolean(sessionFile),
|
|
273
273
|
artifactsDir,
|
|
274
274
|
contextFile,
|
|
275
|
-
|
|
275
|
+
// Eval `agent()` subagents are short-lived programmatic helpers (data
|
|
276
|
+
// collection, structured output, parallel() fan-out). LSP server
|
|
277
|
+
// cold-start costs tens of seconds and is pure overhead here, so it is
|
|
278
|
+
// forced off regardless of the `task.enableLsp` setting — that knob only
|
|
279
|
+
// governs LSP-aware delegation through the `task` tool.
|
|
280
|
+
enableLsp: false,
|
|
276
281
|
signal: options.signal,
|
|
277
282
|
eventBus: options.session.eventBus,
|
|
278
283
|
onProgress: progress => emitProgressStatus(options.emitStatus, progress),
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Timeout suspension for in-flight host-side eval bridge calls.
|
|
3
3
|
*
|
|
4
4
|
* The eval watchdog caps a cell's `timeout` as a budget on the cell runtime's
|
|
5
|
-
* own work. Host-side `agent()` / `parallel()` / `
|
|
5
|
+
* own work. Host-side `agent()` / `parallel()` / `completion()` bridge calls hand
|
|
6
6
|
* control to the outer TypeScript process, where the Python kernel or JS VM is
|
|
7
7
|
* only waiting for a result. While that delegated work is in flight, the cell
|
|
8
8
|
* timeout must be ignored completely; once the bridge returns and the runtime is
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Host-side handler for the eval `
|
|
2
|
+
* Host-side handler for the eval `completion()` helper.
|
|
3
3
|
*
|
|
4
4
|
* Both eval runtimes (JS worker + Python kernel) route helper→host calls
|
|
5
5
|
* through {@link callSessionTool}. Reserving the synthetic tool name
|
|
6
|
-
* {@link
|
|
6
|
+
* {@link EVAL_COMPLETION_BRIDGE_NAME} lets a single host handler serve both
|
|
7
7
|
* transports without registering an agent-visible tool: cell code calls
|
|
8
|
-
* `
|
|
8
|
+
* `completion(prompt, opts)`, the prelude forwards `{ prompt, model, system?, schema? }`
|
|
9
9
|
* through the bridge, and this module performs one stateless completion.
|
|
10
10
|
*
|
|
11
11
|
* The call is oneshot and toolless from the model's perspective — pure text
|
|
@@ -27,36 +27,36 @@ import { ToolError } from "../tools/tool-errors";
|
|
|
27
27
|
import { withBridgeTimeoutPause } from "./bridge-timeout";
|
|
28
28
|
import type { JsStatusEvent } from "./js/shared/types";
|
|
29
29
|
|
|
30
|
-
/** Synthetic bridge name reserved for the `
|
|
31
|
-
export const
|
|
30
|
+
/** Synthetic bridge name reserved for the `completion()` helper across both runtimes. */
|
|
31
|
+
export const EVAL_COMPLETION_BRIDGE_NAME = "__completion__";
|
|
32
32
|
|
|
33
33
|
/** Synthetic tool the model is forced to call when a `schema` is supplied. */
|
|
34
34
|
const STRUCTURED_TOOL_NAME = "respond";
|
|
35
35
|
|
|
36
|
-
type
|
|
36
|
+
type CompletionTier = "smol" | "default" | "slow";
|
|
37
37
|
|
|
38
|
-
const TIER_TO_PATTERN: Record<
|
|
38
|
+
const TIER_TO_PATTERN: Record<CompletionTier, string> = {
|
|
39
39
|
smol: "pi/smol",
|
|
40
40
|
default: "pi/default",
|
|
41
41
|
slow: "pi/slow",
|
|
42
42
|
};
|
|
43
43
|
|
|
44
|
-
const
|
|
44
|
+
const completionArgsSchema = z.object({
|
|
45
45
|
prompt: z.string().min(1, "prompt must be a non-empty string"),
|
|
46
46
|
model: z.enum(["smol", "default", "slow"]).default("default"),
|
|
47
47
|
system: z.string().optional(),
|
|
48
48
|
schema: z.record(z.string(), z.unknown()).optional(),
|
|
49
49
|
});
|
|
50
50
|
|
|
51
|
-
export interface
|
|
51
|
+
export interface EvalCompletionBridgeOptions {
|
|
52
52
|
session: ToolSession;
|
|
53
53
|
signal?: AbortSignal;
|
|
54
54
|
emitStatus?: (event: JsStatusEvent) => void;
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
-
export interface
|
|
57
|
+
export interface EvalCompletionResult {
|
|
58
58
|
text: string;
|
|
59
|
-
details: { model: string; tier:
|
|
59
|
+
details: { model: string; tier: CompletionTier; structured: boolean };
|
|
60
60
|
}
|
|
61
61
|
|
|
62
62
|
/**
|
|
@@ -64,7 +64,7 @@ export interface EvalLlmResult {
|
|
|
64
64
|
* active model and falls back to the `pi/default` role; `smol`/`slow` resolve
|
|
65
65
|
* their respective role patterns. Returns `undefined` when nothing matches.
|
|
66
66
|
*/
|
|
67
|
-
function resolveTierModel(tier:
|
|
67
|
+
function resolveTierModel(tier: CompletionTier, session: ToolSession): Model<Api> | undefined {
|
|
68
68
|
const modelRegistry = session.modelRegistry;
|
|
69
69
|
if (!modelRegistry) return undefined;
|
|
70
70
|
const available = modelRegistry.getAvailable();
|
|
@@ -90,7 +90,7 @@ function resolveTierModel(tier: LlmTier, session: ToolSession): Model<Api> | und
|
|
|
90
90
|
* throwing downstream on models that cannot reason. Clamps to the highest
|
|
91
91
|
* supported effort so a reasoning model without `high` does not 400.
|
|
92
92
|
*/
|
|
93
|
-
function reasoningForTier(tier:
|
|
93
|
+
function reasoningForTier(tier: CompletionTier, model: Model<Api>): Effort | undefined {
|
|
94
94
|
if (tier !== "slow" || !model.reasoning) return undefined;
|
|
95
95
|
const efforts = getSupportedEfforts(model);
|
|
96
96
|
if (efforts.length === 0) return undefined;
|
|
@@ -98,23 +98,26 @@ function reasoningForTier(tier: LlmTier, model: Model<Api>): Effort | undefined
|
|
|
98
98
|
}
|
|
99
99
|
|
|
100
100
|
/**
|
|
101
|
-
* Run a single stateless completion on behalf of an eval cell's `
|
|
101
|
+
* Run a single stateless completion on behalf of an eval cell's `completion()` call.
|
|
102
102
|
* Returns a `{ text, details }` value shaped like a {@link callSessionTool}
|
|
103
103
|
* result so the existing bridge transport carries it to either runtime.
|
|
104
104
|
*/
|
|
105
|
-
export async function
|
|
106
|
-
|
|
105
|
+
export async function runEvalCompletion(
|
|
106
|
+
args: unknown,
|
|
107
|
+
options: EvalCompletionBridgeOptions,
|
|
108
|
+
): Promise<EvalCompletionResult> {
|
|
109
|
+
const parsed = completionArgsSchema.safeParse(args);
|
|
107
110
|
if (!parsed.success) {
|
|
108
111
|
const issue = parsed.error.issues[0];
|
|
109
112
|
const where = issue?.path.length ? `${issue.path.join(".")}: ` : "";
|
|
110
|
-
throw new ToolError(`
|
|
113
|
+
throw new ToolError(`completion() received invalid arguments: ${where}${issue?.message ?? "bad input"}`);
|
|
111
114
|
}
|
|
112
115
|
const { prompt, model: tier, system, schema } = parsed.data;
|
|
113
116
|
|
|
114
117
|
const model = resolveTierModel(tier, options.session);
|
|
115
118
|
if (!model) {
|
|
116
119
|
throw new ToolError(
|
|
117
|
-
`
|
|
120
|
+
`completion() could not resolve a model for the "${tier}" tier. Configure modelRoles.${tier === "default" ? "default" : tier} or ensure a provider is available.`,
|
|
118
121
|
);
|
|
119
122
|
}
|
|
120
123
|
|
|
@@ -122,7 +125,7 @@ export async function runEvalLlm(args: unknown, options: EvalLlmBridgeOptions):
|
|
|
122
125
|
const apiKey = await registry?.getApiKey(model);
|
|
123
126
|
if (!registry || !apiKey) {
|
|
124
127
|
throw new ToolError(
|
|
125
|
-
`
|
|
128
|
+
`completion() has no API key for ${formatModelString(model)}. Configure credentials for this provider or choose another tier.`,
|
|
126
129
|
);
|
|
127
130
|
}
|
|
128
131
|
|
|
@@ -141,7 +144,7 @@ export async function runEvalLlm(args: unknown, options: EvalLlmBridgeOptions):
|
|
|
141
144
|
|
|
142
145
|
// Some providers (notably openai-codex) require a non-empty `instructions`
|
|
143
146
|
// field on every Responses request and 400 with "Instructions are required"
|
|
144
|
-
// when it is missing. Fall back to a minimal default so `
|
|
147
|
+
// when it is missing. Fall back to a minimal default so `completion(prompt)` works
|
|
145
148
|
// without forcing every caller to pass a `system` prompt.
|
|
146
149
|
const systemPrompt = system ? [system] : ["You are a helpful assistant."];
|
|
147
150
|
|
|
@@ -164,15 +167,15 @@ export async function runEvalLlm(args: unknown, options: EvalLlmBridgeOptions):
|
|
|
164
167
|
reasoning: reasoningForTier(tier, model),
|
|
165
168
|
toolChoice: schema ? { type: "tool", name: STRUCTURED_TOOL_NAME } : undefined,
|
|
166
169
|
},
|
|
167
|
-
{ telemetry, oneshotKind: "
|
|
170
|
+
{ telemetry, oneshotKind: "eval_completion" },
|
|
168
171
|
),
|
|
169
172
|
);
|
|
170
173
|
|
|
171
174
|
if (response.stopReason === "error") {
|
|
172
|
-
throw new ToolError(response.errorMessage ?? "
|
|
175
|
+
throw new ToolError(response.errorMessage ?? "completion() request failed.");
|
|
173
176
|
}
|
|
174
177
|
if (response.stopReason === "aborted") {
|
|
175
|
-
throw new ToolError("
|
|
178
|
+
throw new ToolError("completion() request aborted.");
|
|
176
179
|
}
|
|
177
180
|
|
|
178
181
|
let resultText: string;
|
|
@@ -183,20 +186,20 @@ export async function runEvalLlm(args: unknown, options: EvalLlmBridgeOptions):
|
|
|
183
186
|
value = call.arguments;
|
|
184
187
|
} else {
|
|
185
188
|
const text = extractTextContent(response);
|
|
186
|
-
if (!text) throw new ToolError("
|
|
189
|
+
if (!text) throw new ToolError("completion() returned no structured response.");
|
|
187
190
|
try {
|
|
188
191
|
value = parseJsonPayload(text);
|
|
189
192
|
} catch {
|
|
190
|
-
throw new ToolError("
|
|
193
|
+
throw new ToolError("completion() did not return a structured response matching the schema.");
|
|
191
194
|
}
|
|
192
195
|
}
|
|
193
196
|
resultText = JSON.stringify(value);
|
|
194
197
|
} else {
|
|
195
198
|
resultText = extractTextContent(response);
|
|
196
|
-
if (!resultText) throw new ToolError("
|
|
199
|
+
if (!resultText) throw new ToolError("completion() returned no text output.");
|
|
197
200
|
}
|
|
198
201
|
|
|
199
|
-
options.emitStatus?.({ op: "
|
|
202
|
+
options.emitStatus?.({ op: "completion", model: formatModelString(model), tier, chars: resultText.length });
|
|
200
203
|
|
|
201
204
|
return { text: resultText, details: { model: formatModelString(model), tier, structured: Boolean(schema) } };
|
|
202
205
|
}
|
package/src/eval/idle-timeout.ts
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* A cell's `timeout` bounds time while the Python kernel or JS VM is in control.
|
|
5
5
|
* Host-side bridge calls can {@link pause} the watchdog so delegated
|
|
6
|
-
* `agent()`/`parallel()`/`
|
|
6
|
+
* `agent()`/`parallel()`/`completion()` work is ignored completely, then {@link resume}
|
|
7
7
|
* starts a fresh timeout window once the runtime gets control back.
|
|
8
8
|
*
|
|
9
9
|
* The active timer self-reschedules instead of being torn down on every
|
|
@@ -30,6 +30,7 @@ interface WorkerHandle {
|
|
|
30
30
|
mode: "worker" | "inline";
|
|
31
31
|
send(msg: WorkerInbound): void;
|
|
32
32
|
onMessage(handler: (msg: WorkerOutbound) => void): () => void;
|
|
33
|
+
close(): Promise<boolean>;
|
|
33
34
|
terminate(): Promise<void>;
|
|
34
35
|
}
|
|
35
36
|
|
|
@@ -60,6 +61,7 @@ const resettingSessions = new Map<string, Promise<void>>();
|
|
|
60
61
|
// avoiding `vm.runInContext` (see shared/indirect-eval.ts), here surfacing as a
|
|
61
62
|
// SIGILL/SIGSEGV. Callers that pass a larger per-cell budget still dominate.
|
|
62
63
|
const WORKER_INIT_TIMEOUT_MS = 15_000;
|
|
64
|
+
const WORKER_CLOSE_TIMEOUT_MS = 1_000;
|
|
63
65
|
|
|
64
66
|
export async function executeInVmContext(options: {
|
|
65
67
|
sessionKey: string;
|
|
@@ -108,7 +110,7 @@ export async function resetVmContext(sessionKey: string): Promise<void> {
|
|
|
108
110
|
const session = sessions.get(sessionKey) ?? (await startingSessions.get(sessionKey)?.catch(() => undefined));
|
|
109
111
|
if (!session) return;
|
|
110
112
|
sessions.delete(sessionKey);
|
|
111
|
-
await killSession(session, new ToolError("JS context reset"));
|
|
113
|
+
await killSession(session, new ToolError("JS context reset"), { force: false });
|
|
112
114
|
}
|
|
113
115
|
|
|
114
116
|
export async function disposeAllVmContexts(): Promise<void> {
|
|
@@ -121,7 +123,7 @@ export async function disposeAllVmContexts(): Promise<void> {
|
|
|
121
123
|
if (!all.includes(result.value)) all.push(result.value);
|
|
122
124
|
}
|
|
123
125
|
sessions.clear();
|
|
124
|
-
await Promise.all(all.map(session => killSession(session, new ToolError("JS context disposed"))));
|
|
126
|
+
await Promise.all(all.map(session => killSession(session, new ToolError("JS context disposed"), { force: false })));
|
|
125
127
|
}
|
|
126
128
|
|
|
127
129
|
async function runOnce(
|
|
@@ -154,7 +156,7 @@ async function runOnce(
|
|
|
154
156
|
// Cancel any in-flight tool calls first.
|
|
155
157
|
for (const ctrl of pending.toolCalls.values()) ctrl.abort(abortError);
|
|
156
158
|
// Hard-kill the worker — only way to interrupt synchronous user code.
|
|
157
|
-
void killSessionFor(session, abortError);
|
|
159
|
+
void killSessionFor(session, abortError, { force: true });
|
|
158
160
|
};
|
|
159
161
|
|
|
160
162
|
if (options.runState.signal?.aborted) {
|
|
@@ -294,14 +296,14 @@ function settlePending(session: JsSession, msg: Extract<WorkerOutbound, { type:
|
|
|
294
296
|
pending.reject(errorFromPayload(msg.error));
|
|
295
297
|
}
|
|
296
298
|
|
|
297
|
-
async function killSessionFor(session: JsSession, error: Error): Promise<void> {
|
|
299
|
+
async function killSessionFor(session: JsSession, error: Error, options: { force: boolean }): Promise<void> {
|
|
298
300
|
if (sessions.get(session.sessionKey) === session) {
|
|
299
301
|
sessions.delete(session.sessionKey);
|
|
300
302
|
}
|
|
301
|
-
await killSession(session, error);
|
|
303
|
+
await killSession(session, error, options);
|
|
302
304
|
}
|
|
303
305
|
|
|
304
|
-
async function killSession(session: JsSession, error: Error): Promise<void> {
|
|
306
|
+
async function killSession(session: JsSession, error: Error, options: { force: boolean }): Promise<void> {
|
|
305
307
|
if (session.state === "dead") return;
|
|
306
308
|
session.state = "dead";
|
|
307
309
|
for (const pending of session.pending.values()) {
|
|
@@ -311,6 +313,11 @@ async function killSession(session: JsSession, error: Error): Promise<void> {
|
|
|
311
313
|
pending.reject(error);
|
|
312
314
|
}
|
|
313
315
|
session.pending.clear();
|
|
316
|
+
if (options.force) {
|
|
317
|
+
await session.worker.terminate().catch(() => undefined);
|
|
318
|
+
return;
|
|
319
|
+
}
|
|
320
|
+
if (await session.worker.close().catch(() => false)) return;
|
|
314
321
|
await session.worker.terminate().catch(() => undefined);
|
|
315
322
|
}
|
|
316
323
|
|
|
@@ -398,6 +405,38 @@ function wrapBunWorker(worker: Worker): WorkerHandle {
|
|
|
398
405
|
worker.addEventListener("message", wrap);
|
|
399
406
|
return () => worker.removeEventListener("message", wrap);
|
|
400
407
|
},
|
|
408
|
+
/**
 * Ask the worker to shut down gracefully.
 *
 * Posts `{ type: "close" }` and resolves `true` only once BOTH the worker's
 * `closed` acknowledgement message and the Worker `close` event have been
 * observed. Resolves `false` when that has not happened within
 * `WORKER_CLOSE_TIMEOUT_MS`. The timer and both listeners are released on
 * either outcome.
 */
async close() {
	const outcome = Promise.withResolvers<boolean>();
	let done = false;
	let ackSeen = false;
	let exitSeen = false;
	let timer: NodeJS.Timeout | undefined;
	let stopListening = (): void => {};

	const settle = (ok: boolean): void => {
		if (done) return;
		done = true;
		if (timer) clearTimeout(timer);
		stopListening();
		worker.removeEventListener("close", handleExit);
		outcome.resolve(ok);
	};
	// Success requires both signals; either one alone is not enough.
	const settleWhenFullyClosed = (): void => {
		if (!ackSeen || !exitSeen) return;
		settle(true);
	};
	const handleExit = (): void => {
		exitSeen = true;
		settleWhenFullyClosed();
	};

	stopListening = this.onMessage(msg => {
		if (msg.type === "closed") {
			ackSeen = true;
			settleWhenFullyClosed();
		}
	});
	worker.addEventListener("close", handleExit);
	timer = setTimeout(() => settle(false), WORKER_CLOSE_TIMEOUT_MS);
	worker.postMessage({ type: "close" } satisfies WorkerInbound);
	return await outcome.promise;
},
|
|
401
440
|
async terminate() {
|
|
402
441
|
worker.terminate();
|
|
403
442
|
},
|
|
@@ -434,6 +473,27 @@ function spawnInlineWorker(): WorkerHandle {
|
|
|
434
473
|
hostListeners.add(handler);
|
|
435
474
|
return () => hostListeners.delete(handler);
|
|
436
475
|
},
|
|
476
|
+
/**
 * Ask the inline worker to shut down gracefully.
 *
 * Sends `{ type: "close" }` and resolves `true` when a `closed` message is
 * received, or `false` when none arrives within `WORKER_CLOSE_TIMEOUT_MS`.
 * Either outcome clears the timer, unsubscribes, and empties both listener
 * sets.
 */
async close() {
	const { promise: closed, resolve } = Promise.withResolvers<boolean>();
	let settled = false;
	let timeout: NodeJS.Timeout | undefined;
	let unsubscribe = (): void => {};
	const finish = (value: boolean): void => {
		if (settled) return;
		settled = true;
		if (timeout) clearTimeout(timeout);
		unsubscribe();
		hostListeners.clear();
		workerListeners.clear();
		resolve(value);
	};
	unsubscribe = this.onMessage(msg => {
		if (msg.type === "closed") finish(true);
	});
	// Arm the deadline BEFORE sending. If the `closed` ack is delivered while
	// `send` is still running, `finish` must already see the timer to clear it;
	// arming afterwards would leave a stray timer pending for the full timeout.
	// NOTE(review): whether inline `send` dispatches synchronously is not visible
	// here; this ordering is safe either way.
	timeout = setTimeout(() => finish(false), WORKER_CLOSE_TIMEOUT_MS);
	this.send({ type: "close" });
	return await closed;
},
|
|
437
497
|
async terminate() {
|
|
438
498
|
hostListeners.clear();
|
|
439
499
|
workerListeners.clear();
|
|
@@ -57,9 +57,9 @@ if (!globalThis.__omp_js_prelude_loaded__) {
|
|
|
57
57
|
|
|
58
58
|
const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key);
|
|
59
59
|
|
|
60
|
-
const
|
|
61
|
-
const o = optionsArg("
|
|
62
|
-
const res = await globalThis.__omp_call_tool__("
|
|
60
|
+
/**
 * Oneshot completion helper exposed to eval cells.
 *
 * Normalises the options through `optionsArg`, forwards `{ prompt, ...options }`
 * to the host via the `__completion__` bridge, and returns the reply text, or
 * the parsed JSON value when a schema was supplied.
 */
const completion = async (prompt, opts, ...rest) => {
	const o = optionsArg("completion", opts, rest, "{ model, system, schema }");
	const res = await globalThis.__omp_call_tool__("__completion__", { prompt, ...o });
	const text = res && typeof res === "object" ? res.text : res;
	// Decode only when a schema value is really present. A bare `schema: undefined`
	// key must not count as a structured request, or a plain-text reply would be
	// fed to JSON.parse and throw.
	const structured = hasOwn(o, "schema") && o.schema !== undefined && o.schema !== null;
	return structured ? JSON.parse(text) : text;
};
|
|
@@ -164,7 +164,7 @@ if (!globalThis.__omp_js_prelude_loaded__) {
|
|
|
164
164
|
globalThis.print = consoleBridge.log;
|
|
165
165
|
globalThis.display = display;
|
|
166
166
|
globalThis.tool = tool;
|
|
167
|
-
globalThis.
|
|
167
|
+
globalThis.completion = completion;
|
|
168
168
|
globalThis.output = output;
|
|
169
169
|
globalThis.agent = agent;
|
|
170
170
|
globalThis.parallel = parallel;
|
|
@@ -3,8 +3,8 @@ import type { ToolSession } from "../../tools";
|
|
|
3
3
|
import { ToolError } from "../../tools/tool-errors";
|
|
4
4
|
import { EVAL_AGENT_BRIDGE_NAME, runEvalAgent } from "../agent-bridge";
|
|
5
5
|
import { EVAL_BUDGET_BRIDGE_NAME, type EvalBudgetResult, runEvalBudget } from "../budget-bridge";
|
|
6
|
+
import { EVAL_COMPLETION_BRIDGE_NAME, runEvalCompletion } from "../completion-bridge";
|
|
6
7
|
import { EVAL_CONCURRENCY_BRIDGE_NAME, type EvalConcurrencyResult, runEvalConcurrency } from "../concurrency-bridge";
|
|
7
|
-
import { EVAL_LLM_BRIDGE_NAME, runEvalLlm } from "../llm-bridge";
|
|
8
8
|
import type { JsStatusEvent } from "./shared/types";
|
|
9
9
|
|
|
10
10
|
export type { JsStatusEvent } from "./shared/types";
|
|
@@ -107,8 +107,8 @@ function summarizeToolResult(
|
|
|
107
107
|
}
|
|
108
108
|
|
|
109
109
|
export async function callSessionTool(name: string, args: unknown, options: ToolBridgeOptions): Promise<ToolValue> {
|
|
110
|
-
if (name ===
|
|
111
|
-
return await
|
|
110
|
+
if (name === EVAL_COMPLETION_BRIDGE_NAME) {
|
|
111
|
+
return await runEvalCompletion(args, options);
|
|
112
112
|
}
|
|
113
113
|
if (name === EVAL_AGENT_BRIDGE_NAME) {
|
|
114
114
|
return await runEvalAgent(args, options);
|
|
@@ -18,6 +18,12 @@ const transport: Transport = {
|
|
|
18
18
|
} catch {
|
|
19
19
|
// Already closed.
|
|
20
20
|
}
|
|
21
|
+
|
|
22
|
+
// `parentPort.close()` only disconnects the channel in Bun; it does not
|
|
23
|
+
// make the Worker emit `close` or reap ref'ed user handles. Exit from
|
|
24
|
+
// inside the worker after `WorkerCore` has sent the `closed` ack so the
|
|
25
|
+
// host can observe real worker exit without calling `Worker.terminate()`.
|
|
26
|
+
setTimeout(() => process.exit(0), 0);
|
|
21
27
|
},
|
|
22
28
|
};
|
|
23
29
|
|
package/src/eval/py/prelude.py
CHANGED
|
@@ -463,8 +463,8 @@ if "__omp_prelude_loaded__" not in globals():
|
|
|
463
463
|
|
|
464
464
|
tool = _ToolProxy()
|
|
465
465
|
|
|
466
|
-
def
|
|
467
|
-
"""Oneshot, stateless
|
|
466
|
+
def completion(prompt, *, model="default", system=None, schema=None):
|
|
467
|
+
"""Oneshot, stateless completion against a model tier.
|
|
468
468
|
|
|
469
469
|
`model` selects a tier: "smol", "default" (the session's active model),
|
|
470
470
|
or "slow". Pass `system` for a system prompt. Pass a JSON-Schema dict
|
|
@@ -476,7 +476,7 @@ if "__omp_prelude_loaded__" not in globals():
|
|
|
476
476
|
args["system"] = system
|
|
477
477
|
if schema is not None:
|
|
478
478
|
args["schema"] = schema
|
|
479
|
-
res = _bridge_call("
|
|
479
|
+
res = _bridge_call("__completion__", args)
|
|
480
480
|
text = res.get("text") if isinstance(res, dict) else res
|
|
481
481
|
return json.loads(text) if schema is not None else text
|
|
482
482
|
|