@oh-my-pi/pi-coding-agent 15.10.3 → 15.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/dist/types/eval/__tests__/js-context-manager.test.d.ts +1 -0
- package/dist/types/eval/bridge-timeout.d.ts +1 -1
- package/dist/types/eval/{llm-bridge.d.ts → completion-bridge.d.ts} +8 -8
- package/dist/types/eval/idle-timeout.d.ts +1 -1
- package/package.json +9 -9
- package/src/eval/__tests__/agent-bridge.test.ts +13 -0
- package/src/eval/__tests__/{llm-bridge.test.ts → completion-bridge.test.ts} +60 -54
- package/src/eval/__tests__/js-context-manager.test.ts +241 -0
- package/src/eval/agent-bridge.ts +6 -1
- package/src/eval/bridge-timeout.ts +1 -1
- package/src/eval/{llm-bridge.ts → completion-bridge.ts} +30 -27
- package/src/eval/idle-timeout.ts +1 -1
- package/src/eval/js/context-manager.ts +66 -6
- package/src/eval/js/shared/prelude.txt +4 -4
- package/src/eval/js/tool-bridge.ts +3 -3
- package/src/eval/js/worker-entry.ts +6 -0
- package/src/eval/py/prelude.py +3 -3
- package/src/internal-urls/docs-index.generated.ts +4 -3
- package/src/modes/components/tips.txt +1 -1
- package/src/prompts/system/tiny-title-system.md +1 -1
- package/src/prompts/system/title-system.md +16 -3
- package/src/prompts/system/workflow-notice.md +1 -1
- package/src/prompts/tools/eval.md +3 -3
- package/src/tools/eval-render.ts +2 -2
- package/src/tools/eval.ts +1 -1
- package/src/utils/title-generator.ts +2 -2
- /package/dist/types/eval/__tests__/{llm-bridge.test.d.ts → completion-bridge.test.d.ts} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,26 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## [15.10.4] - 2026-06-08
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
- macOS release binaries are now signed with a Developer ID Application identity (hardened runtime + secure timestamp + JIT/library-validation entitlements) and notarized in CI when the `APPLE_*` signing secrets are configured; releases auto-fall back to ad-hoc signing until then. This makes the shipped binaries Gatekeeper-acceptable, unblocking an official Homebrew submission ([#776](https://github.com/can1357/oh-my-pi/issues/776)). See `docs/macos-signing-notarization.md`.
|
|
10
|
+
- Added a Homebrew install path: `brew install can1357/tap/omp`. The [can1357/homebrew-tap](https://github.com/can1357/homebrew-tap) formula installs the prebuilt release binary, and a `release_brew` CI job regenerates it (version + per-asset sha256) from each published release via `scripts/ci-update-brew-formula.ts` ([#776](https://github.com/can1357/oh-my-pi/issues/776)).
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
- Adjusted `completion()` model resolution so the `default` tier now prefers the session’s active model and falls back to the configured default role when needed.
|
|
15
|
+
- Rewrote the session auto-title prompt (`prompts/system/title-system.md`) and the `set_title` tool description to ask for a concise, sentence-case title (3-7 words) that captures the session's topic/goal, with good/bad examples and explicit guidance to treat the first message as data (no following embedded links/instructions, no refusals, describe URL/reference asks). The local on-device title prompt (`tiny-title-system.md`) was aligned to the same 3-7 word, sentence-case convention. The deterministic greeting/low-signal filter and the `none` deferral sentinel are unchanged.
|
|
16
|
+
- Renamed the eval oneshot helper from `llm()` to `completion()` in both JavaScript and Python preludes, including status events, prompt docs, and runtime tests.
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- Fixed `completion()` to always send a non-empty default system prompt when `system` is omitted so providers that require instructions no longer reject requests.
|
|
21
|
+
- Fixed structured `completion()` mode to return parsed JSON from plain text output when the model skips the forced `respond` tool call.
|
|
22
|
+
- Fixed slow-tier `completion()` reasoning requests to avoid unsupported effort settings by only enabling reasoning on reasoning-capable models and capping effort to supported levels.
|
|
23
|
+
- Fixed JS eval worker reset/dispose to close workers gracefully before forced termination, avoiding Bun 1.3.14 N-API teardown crashes with native modules such as `canvas`.
|
|
24
|
+
|
|
5
25
|
## [15.10.3] - 2026-06-08
|
|
6
26
|
|
|
7
27
|
### Added
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Timeout suspension for in-flight host-side eval bridge calls.
|
|
3
3
|
*
|
|
4
4
|
* The eval watchdog caps a cell's `timeout` as a budget on the cell runtime's
|
|
5
|
-
* own work. Host-side `agent()` / `parallel()` / `llm()` bridge calls hand
|
|
5
|
+
* own work. Host-side `agent()` / `parallel()` / `completion()` bridge calls hand
|
|
6
6
|
* control to the outer TypeScript process, where the Python kernel or JS VM is
|
|
7
7
|
* only waiting for a result. While that delegated work is in flight, the cell
|
|
8
8
|
* timeout must be ignored completely; once the bridge returns and the runtime is
|
|
@@ -1,25 +1,25 @@
|
|
|
1
1
|
import type { ToolSession } from "../tools";
|
|
2
2
|
import type { JsStatusEvent } from "./js/shared/types";
|
|
3
|
-
/** Synthetic bridge name reserved for the `
|
|
4
|
-
export declare const
|
|
5
|
-
type
|
|
6
|
-
export interface
|
|
3
|
+
/** Synthetic bridge name reserved for the `completion()` helper across both runtimes. */
|
|
4
|
+
export declare const EVAL_COMPLETION_BRIDGE_NAME = "__completion__";
|
|
5
|
+
type CompletionTier = "smol" | "default" | "slow";
|
|
6
|
+
export interface EvalCompletionBridgeOptions {
|
|
7
7
|
session: ToolSession;
|
|
8
8
|
signal?: AbortSignal;
|
|
9
9
|
emitStatus?: (event: JsStatusEvent) => void;
|
|
10
10
|
}
|
|
11
|
-
export interface
|
|
11
|
+
export interface EvalCompletionResult {
|
|
12
12
|
text: string;
|
|
13
13
|
details: {
|
|
14
14
|
model: string;
|
|
15
|
-
tier:
|
|
15
|
+
tier: CompletionTier;
|
|
16
16
|
structured: boolean;
|
|
17
17
|
};
|
|
18
18
|
}
|
|
19
19
|
/**
|
|
20
|
-
* Run a single stateless completion on behalf of an eval cell's `
|
|
20
|
+
* Run a single stateless completion on behalf of an eval cell's `completion()` call.
|
|
21
21
|
* Returns a `{ text, details }` value shaped like a {@link callSessionTool}
|
|
22
22
|
* result so the existing bridge transport carries it to either runtime.
|
|
23
23
|
*/
|
|
24
|
-
export declare function
|
|
24
|
+
export declare function runEvalCompletion(args: unknown, options: EvalCompletionBridgeOptions): Promise<EvalCompletionResult>;
|
|
25
25
|
export {};
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* A cell's `timeout` bounds time while the Python kernel or JS VM is in control.
|
|
5
5
|
* Host-side bridge calls can {@link pause} the watchdog so delegated
|
|
6
|
-
* `agent()`/`parallel()`/`llm()` work is ignored completely, then {@link resume}
|
|
6
|
+
* `agent()`/`parallel()`/`completion()` work is ignored completely, then {@link resume}
|
|
7
7
|
* starts a fresh timeout window once the runtime gets control back.
|
|
8
8
|
*
|
|
9
9
|
* The active timer self-reschedules instead of being torn down on every
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "module",
|
|
3
3
|
"name": "@oh-my-pi/pi-coding-agent",
|
|
4
|
-
"version": "15.10.
|
|
4
|
+
"version": "15.10.4",
|
|
5
5
|
"description": "Coding agent CLI with read, bash, edit, write tools and session management",
|
|
6
6
|
"homepage": "https://omp.sh",
|
|
7
7
|
"author": "Can Boluk",
|
|
@@ -47,14 +47,14 @@
|
|
|
47
47
|
"@agentclientprotocol/sdk": "0.22.1",
|
|
48
48
|
"@babel/parser": "^7.29.7",
|
|
49
49
|
"@mozilla/readability": "^0.6.0",
|
|
50
|
-
"@oh-my-pi/hashline": "15.10.
|
|
51
|
-
"@oh-my-pi/omp-stats": "15.10.
|
|
52
|
-
"@oh-my-pi/pi-agent-core": "15.10.
|
|
53
|
-
"@oh-my-pi/pi-ai": "15.10.
|
|
54
|
-
"@oh-my-pi/pi-mnemopi": "15.10.
|
|
55
|
-
"@oh-my-pi/pi-natives": "15.10.
|
|
56
|
-
"@oh-my-pi/pi-tui": "15.10.
|
|
57
|
-
"@oh-my-pi/pi-utils": "15.10.
|
|
50
|
+
"@oh-my-pi/hashline": "15.10.4",
|
|
51
|
+
"@oh-my-pi/omp-stats": "15.10.4",
|
|
52
|
+
"@oh-my-pi/pi-agent-core": "15.10.4",
|
|
53
|
+
"@oh-my-pi/pi-ai": "15.10.4",
|
|
54
|
+
"@oh-my-pi/pi-mnemopi": "15.10.4",
|
|
55
|
+
"@oh-my-pi/pi-natives": "15.10.4",
|
|
56
|
+
"@oh-my-pi/pi-tui": "15.10.4",
|
|
57
|
+
"@oh-my-pi/pi-utils": "15.10.4",
|
|
58
58
|
"@opentelemetry/api": "^1.9.1",
|
|
59
59
|
"@opentelemetry/context-async-hooks": "^2.7.1",
|
|
60
60
|
"@opentelemetry/exporter-trace-otlp-proto": "^0.218.0",
|
|
@@ -205,6 +205,19 @@ describe("runEvalAgent", () => {
|
|
|
205
205
|
expect(secondOptions.outputSchema).toBeUndefined();
|
|
206
206
|
});
|
|
207
207
|
|
|
208
|
+
it("forces LSP off for bridge subagents even when task.enableLsp is on", async () => {
|
|
209
|
+
mockAgents();
|
|
210
|
+
const runSpy = vi.spyOn(taskExecutor, "runSubprocess").mockImplementation(async options => singleResult(options));
|
|
211
|
+
// makeSession() defaults to enableLsp: true and task.enableLsp: true.
|
|
212
|
+
const session = makeSession();
|
|
213
|
+
|
|
214
|
+
await runEvalAgent({ prompt: "hello" }, { session });
|
|
215
|
+
|
|
216
|
+
const options = runSpy.mock.calls[0]?.[0];
|
|
217
|
+
if (!options) throw new Error("runSubprocess was not called");
|
|
218
|
+
expect(options.enableLsp).toBe(false);
|
|
219
|
+
});
|
|
220
|
+
|
|
208
221
|
it("maps successful and failed subagent results", async () => {
|
|
209
222
|
mockAgents();
|
|
210
223
|
const runSpy = vi.spyOn(taskExecutor, "runSubprocess");
|
|
@@ -10,10 +10,10 @@ import { Settings } from "../../config/settings";
|
|
|
10
10
|
import type { ToolSession } from "../../tools";
|
|
11
11
|
import { ToolError } from "../../tools/tool-errors";
|
|
12
12
|
import { EVAL_TIMEOUT_PAUSE_OP, EVAL_TIMEOUT_RESUME_OP } from "../bridge-timeout";
|
|
13
|
+
import { runEvalCompletion } from "../completion-bridge";
|
|
13
14
|
import { IdleTimeout } from "../idle-timeout";
|
|
14
15
|
import { disposeAllVmContexts } from "../js/context-manager";
|
|
15
16
|
import { executeJs } from "../js/executor";
|
|
16
|
-
import { runEvalLlm } from "../llm-bridge";
|
|
17
17
|
import { disposeAllKernelSessions, type PythonResult } from "../py/executor";
|
|
18
18
|
|
|
19
19
|
function makeModel(provider: string, id: string, extra: Partial<Model<Api>> = {}): Model<Api> {
|
|
@@ -98,16 +98,19 @@ function assistant(opts: {
|
|
|
98
98
|
};
|
|
99
99
|
}
|
|
100
100
|
|
|
101
|
-
async function
|
|
101
|
+
async function runPythonCompletionInSubprocess(options: {
|
|
102
|
+
structured: boolean;
|
|
103
|
+
tempDir: TempDir;
|
|
104
|
+
}): Promise<PythonResult> {
|
|
102
105
|
const repoRoot = path.resolve(import.meta.dir, "../../../..");
|
|
103
|
-
const scriptPath = path.join(options.tempDir.path(), "run-python-
|
|
104
|
-
const resultPath = path.join(options.tempDir.path(), "python-
|
|
106
|
+
const scriptPath = path.join(options.tempDir.path(), "run-python-completion.ts");
|
|
107
|
+
const resultPath = path.join(options.tempDir.path(), "python-completion-result.json");
|
|
105
108
|
const aiPath = path.resolve(import.meta.dir, "../../../../ai/src/index.ts");
|
|
106
109
|
const executorPath = path.resolve(import.meta.dir, "../py/executor.ts");
|
|
107
110
|
const settingsPath = path.resolve(import.meta.dir, "../../config/settings.ts");
|
|
108
111
|
const code = options.structured
|
|
109
|
-
? 'import json\nprint(json.dumps(
|
|
110
|
-
: 'print(
|
|
112
|
+
? 'import json\nprint(json.dumps(completion("hi", schema={"type": "object"})))'
|
|
113
|
+
: 'print(completion("hi", model="smol"))';
|
|
111
114
|
const responseContent = options.structured
|
|
112
115
|
? '[{ type: "toolCall", id: "tc-1", name: "respond", arguments: { ok: true } }]'
|
|
113
116
|
: '[{ type: "text", text: "hello from python" }]';
|
|
@@ -153,7 +156,7 @@ vi.spyOn(ai, "completeSimple").mockResolvedValue({
|
|
|
153
156
|
});
|
|
154
157
|
const result = await executePython(${JSON.stringify(code)}, {
|
|
155
158
|
cwd: ${JSON.stringify(options.tempDir.path())},
|
|
156
|
-
sessionId: ${JSON.stringify(`py-
|
|
159
|
+
sessionId: ${JSON.stringify(`py-completion:${options.structured ? "struct" : "plain"}`)},
|
|
157
160
|
sessionFile: ${JSON.stringify(path.join(options.tempDir.path(), "session.jsonl"))},
|
|
158
161
|
toolSession: session,
|
|
159
162
|
kernelMode: "per-call",
|
|
@@ -165,11 +168,12 @@ process.exit(0);
|
|
|
165
168
|
const child = await $`bun ${scriptPath}`.cwd(repoRoot).quiet().nothrow();
|
|
166
169
|
const stdout = child.stdout.toString();
|
|
167
170
|
const stderr = child.stderr.toString();
|
|
168
|
-
if (child.exitCode !== 0)
|
|
171
|
+
if (child.exitCode !== 0)
|
|
172
|
+
throw new Error(stderr || stdout || `Python completion subprocess exited with ${child.exitCode}`);
|
|
169
173
|
return (await Bun.file(resultPath).json()) as PythonResult;
|
|
170
174
|
}
|
|
171
175
|
|
|
172
|
-
describe("runEvalLlm", () => {
|
|
176
|
+
describe("runEvalCompletion", () => {
|
|
173
177
|
afterEach(() => {
|
|
174
178
|
vi.restoreAllMocks();
|
|
175
179
|
});
|
|
@@ -178,9 +182,9 @@ describe("runEvalLlm", () => {
|
|
|
178
182
|
const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
|
|
179
183
|
const session = makeSession();
|
|
180
184
|
|
|
181
|
-
await
|
|
182
|
-
await
|
|
183
|
-
await
|
|
185
|
+
await runEvalCompletion({ prompt: "q", model: "smol" }, { session });
|
|
186
|
+
await runEvalCompletion({ prompt: "q", model: "default" }, { session });
|
|
187
|
+
await runEvalCompletion({ prompt: "q", model: "slow" }, { session });
|
|
184
188
|
|
|
185
189
|
const resolved = spy.mock.calls.map(call => {
|
|
186
190
|
const model = call[0] as Model<Api>;
|
|
@@ -193,7 +197,7 @@ describe("runEvalLlm", () => {
|
|
|
193
197
|
const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
|
|
194
198
|
const session = makeSession({ available: [SMOL, DEFAULT, SLOW], activeModel: "p/slow" });
|
|
195
199
|
|
|
196
|
-
await
|
|
200
|
+
await runEvalCompletion({ prompt: "q", model: "default" }, { session });
|
|
197
201
|
|
|
198
202
|
const model = spy.mock.calls[0]?.[0] as Model<Api>;
|
|
199
203
|
expect(`${model.provider}/${model.id}`).toBe("p/slow");
|
|
@@ -201,7 +205,7 @@ describe("runEvalLlm", () => {
|
|
|
201
205
|
|
|
202
206
|
it("returns the completion text in plain mode", async () => {
|
|
203
207
|
vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "the answer" }));
|
|
204
|
-
const result = await
|
|
208
|
+
const result = await runEvalCompletion({ prompt: "q", model: "smol" }, { session: makeSession() });
|
|
205
209
|
expect(result.text).toBe("the answer");
|
|
206
210
|
expect(result.details).toEqual({ model: "p/smol", tier: "smol", structured: false });
|
|
207
211
|
});
|
|
@@ -209,10 +213,10 @@ describe("runEvalLlm", () => {
|
|
|
209
213
|
it("supplies a non-empty systemPrompt when system is omitted (codex 'Instructions are required' guard)", async () => {
|
|
210
214
|
// The openai-codex Responses transformer drops `instructions` when no
|
|
211
215
|
// system prompt is provided, and the remote endpoint then 400s with
|
|
212
|
-
// "Instructions are required".
|
|
213
|
-
// systemPrompt so `
|
|
216
|
+
// "Instructions are required". runEvalCompletion must always carry a non-empty
|
|
217
|
+
// systemPrompt so `completion("…")` without a `system` argument works.
|
|
214
218
|
const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
|
|
215
|
-
await
|
|
219
|
+
await runEvalCompletion({ prompt: "q", model: "smol" }, { session: makeSession() });
|
|
216
220
|
const ctx = spy.mock.calls[0]?.[1] as { systemPrompt?: string[] };
|
|
217
221
|
expect(ctx.systemPrompt).toBeDefined();
|
|
218
222
|
expect(ctx.systemPrompt?.length).toBeGreaterThan(0);
|
|
@@ -221,7 +225,7 @@ describe("runEvalLlm", () => {
|
|
|
221
225
|
|
|
222
226
|
it("honors an explicit system prompt instead of overriding it", async () => {
|
|
223
227
|
const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
|
|
224
|
-
await
|
|
228
|
+
await runEvalCompletion({ prompt: "q", model: "smol", system: "Be terse." }, { session: makeSession() });
|
|
225
229
|
const ctx = spy.mock.calls[0]?.[1] as { systemPrompt?: string[] };
|
|
226
230
|
expect(ctx.systemPrompt).toEqual(["Be terse."]);
|
|
227
231
|
});
|
|
@@ -230,7 +234,7 @@ describe("runEvalLlm", () => {
|
|
|
230
234
|
const spy = vi
|
|
231
235
|
.spyOn(ai, "completeSimple")
|
|
232
236
|
.mockResolvedValue(assistant({ toolCall: { name: "respond", arguments: { answer: 42 } } }));
|
|
233
|
-
const result = await
|
|
237
|
+
const result = await runEvalCompletion(
|
|
234
238
|
{ prompt: "q", model: "smol", schema: { type: "object", properties: { answer: { type: "number" } } } },
|
|
235
239
|
{ session: makeSession() },
|
|
236
240
|
);
|
|
@@ -246,7 +250,7 @@ describe("runEvalLlm", () => {
|
|
|
246
250
|
|
|
247
251
|
it("falls back to JSON embedded in text when the model skips the respond tool", async () => {
|
|
248
252
|
vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: 'here: {"answer": 7}' }));
|
|
249
|
-
const result = await
|
|
253
|
+
const result = await runEvalCompletion(
|
|
250
254
|
{ prompt: "q", model: "smol", schema: { type: "object" } },
|
|
251
255
|
{ session: makeSession() },
|
|
252
256
|
);
|
|
@@ -257,8 +261,8 @@ describe("runEvalLlm", () => {
|
|
|
257
261
|
const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
|
|
258
262
|
const session = makeSession({ available: [SMOL, DEFAULT, REASONING_SLOW] });
|
|
259
263
|
|
|
260
|
-
await
|
|
261
|
-
await
|
|
264
|
+
await runEvalCompletion({ prompt: "q", model: "smol" }, { session });
|
|
265
|
+
await runEvalCompletion({ prompt: "q", model: "slow" }, { session });
|
|
262
266
|
|
|
263
267
|
const smolOpts = spy.mock.calls[0]?.[2] as { reasoning?: unknown };
|
|
264
268
|
const slowOpts = spy.mock.calls[1]?.[2] as { reasoning?: unknown };
|
|
@@ -269,47 +273,49 @@ describe("runEvalLlm", () => {
|
|
|
269
273
|
it("does not request reasoning for the slow tier on a non-reasoning model", async () => {
|
|
270
274
|
const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
|
|
271
275
|
// SLOW is reasoning:false — must not trip requireSupportedEffort downstream.
|
|
272
|
-
const result = await
|
|
276
|
+
const result = await runEvalCompletion({ prompt: "q", model: "slow" }, { session: makeSession() });
|
|
273
277
|
expect(result.text).toBe("ok");
|
|
274
278
|
const opts = spy.mock.calls[0]?.[2] as { reasoning?: unknown };
|
|
275
279
|
expect(opts.reasoning).toBeUndefined();
|
|
276
280
|
});
|
|
277
281
|
|
|
278
282
|
it("throws ToolError on invalid arguments", async () => {
|
|
279
|
-
await expect(
|
|
280
|
-
await expect(
|
|
281
|
-
|
|
282
|
-
);
|
|
283
|
+
await expect(runEvalCompletion({ prompt: "" }, { session: makeSession() })).rejects.toBeInstanceOf(ToolError);
|
|
284
|
+
await expect(
|
|
285
|
+
runEvalCompletion({ prompt: "q", model: "huge" }, { session: makeSession() }),
|
|
286
|
+
).rejects.toBeInstanceOf(ToolError);
|
|
283
287
|
});
|
|
284
288
|
|
|
285
289
|
it("throws ToolError when no model resolves for the tier", async () => {
|
|
286
290
|
const session = makeSession({ available: [DEFAULT], roles: { smol: "missing/model" } });
|
|
287
|
-
await expect(
|
|
291
|
+
await expect(runEvalCompletion({ prompt: "q", model: "smol" }, { session })).rejects.toBeInstanceOf(ToolError);
|
|
288
292
|
});
|
|
289
293
|
|
|
290
294
|
it("throws ToolError when the resolved model has no API key", async () => {
|
|
291
295
|
const session = makeSession({ apiKey: null });
|
|
292
|
-
await expect(
|
|
296
|
+
await expect(runEvalCompletion({ prompt: "q", model: "smol" }, { session })).rejects.toBeInstanceOf(ToolError);
|
|
293
297
|
});
|
|
294
298
|
|
|
295
299
|
it("maps error and aborted stop reasons to ToolError", async () => {
|
|
296
300
|
vi.spyOn(ai, "completeSimple").mockResolvedValueOnce(assistant({ stopReason: "error", errorMessage: "boom" }));
|
|
297
|
-
await expect(
|
|
301
|
+
await expect(runEvalCompletion({ prompt: "q", model: "smol" }, { session: makeSession() })).rejects.toThrow(
|
|
302
|
+
"boom",
|
|
303
|
+
);
|
|
298
304
|
|
|
299
305
|
vi.spyOn(ai, "completeSimple").mockResolvedValueOnce(assistant({ stopReason: "aborted" }));
|
|
300
|
-
await expect(
|
|
301
|
-
|
|
302
|
-
);
|
|
306
|
+
await expect(
|
|
307
|
+
runEvalCompletion({ prompt: "q", model: "smol" }, { session: makeSession() }),
|
|
308
|
+
).rejects.toBeInstanceOf(ToolError);
|
|
303
309
|
});
|
|
304
310
|
|
|
305
311
|
it("throws ToolError when plain mode produces no text", async () => {
|
|
306
312
|
vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "" }));
|
|
307
|
-
await expect(
|
|
308
|
-
|
|
309
|
-
);
|
|
313
|
+
await expect(
|
|
314
|
+
runEvalCompletion({ prompt: "q", model: "smol" }, { session: makeSession() }),
|
|
315
|
+
).rejects.toBeInstanceOf(ToolError);
|
|
310
316
|
});
|
|
311
317
|
|
|
312
|
-
it("pauses the idle watchdog while a slow
|
|
318
|
+
it("pauses the idle watchdog while a slow completion() request is in flight", async () => {
|
|
313
319
|
// A oneshot completion emits no status until it returns; delegated model
|
|
314
320
|
// time must be invisible to the eval timeout budget.
|
|
315
321
|
vi.spyOn(ai, "completeSimple").mockImplementation(async () => {
|
|
@@ -319,7 +325,7 @@ describe("runEvalLlm", () => {
|
|
|
319
325
|
|
|
320
326
|
const ops: string[] = [];
|
|
321
327
|
using idle = new IdleTimeout(60);
|
|
322
|
-
const result = await
|
|
328
|
+
const result = await runEvalCompletion(
|
|
323
329
|
{ prompt: "q", model: "smol" },
|
|
324
330
|
{
|
|
325
331
|
session: makeSession(),
|
|
@@ -333,12 +339,12 @@ describe("runEvalLlm", () => {
|
|
|
333
339
|
);
|
|
334
340
|
|
|
335
341
|
expect(result.text).toBe("the answer");
|
|
336
|
-
expect(ops).toEqual([EVAL_TIMEOUT_PAUSE_OP, EVAL_TIMEOUT_RESUME_OP, "
|
|
342
|
+
expect(ops).toEqual([EVAL_TIMEOUT_PAUSE_OP, EVAL_TIMEOUT_RESUME_OP, "completion"]);
|
|
337
343
|
expect(idle.signal.aborted).toBe(false);
|
|
338
344
|
});
|
|
339
345
|
});
|
|
340
346
|
|
|
341
|
-
describe("llm() through eval runtimes", () => {
|
|
347
|
+
describe("completion() through eval runtimes", () => {
|
|
342
348
|
afterEach(() => {
|
|
343
349
|
vi.restoreAllMocks();
|
|
344
350
|
});
|
|
@@ -348,13 +354,13 @@ describe("llm() through eval runtimes", () => {
|
|
|
348
354
|
await disposeAllKernelSessions();
|
|
349
355
|
});
|
|
350
356
|
|
|
351
|
-
it("exposes
|
|
352
|
-
using tempDir = TempDir.createSync("@omp-eval-
|
|
357
|
+
it("exposes completion() in the JavaScript runtime", async () => {
|
|
358
|
+
using tempDir = TempDir.createSync("@omp-eval-completion-js-");
|
|
353
359
|
const sessionFile = path.join(tempDir.path(), "session.jsonl");
|
|
354
|
-
const sessionId = `js-
|
|
360
|
+
const sessionId = `js-completion:${crypto.randomUUID()}`;
|
|
355
361
|
vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "hello from smol" }));
|
|
356
362
|
|
|
357
|
-
const result = await executeJs('return await
|
|
363
|
+
const result = await executeJs('return await completion("hi", { model: "smol" });', {
|
|
358
364
|
cwd: tempDir.path(),
|
|
359
365
|
sessionId,
|
|
360
366
|
session: makeSession(),
|
|
@@ -365,16 +371,16 @@ describe("llm() through eval runtimes", () => {
|
|
|
365
371
|
expect(result.output.trim()).toBe("hello from smol");
|
|
366
372
|
});
|
|
367
373
|
|
|
368
|
-
it("parses structured
|
|
369
|
-
using tempDir = TempDir.createSync("@omp-eval-
|
|
374
|
+
it("parses structured completion() output in the JavaScript runtime", async () => {
|
|
375
|
+
using tempDir = TempDir.createSync("@omp-eval-completion-js-struct-");
|
|
370
376
|
const sessionFile = path.join(tempDir.path(), "session.jsonl");
|
|
371
|
-
const sessionId = `js-
|
|
377
|
+
const sessionId = `js-completion-struct:${crypto.randomUUID()}`;
|
|
372
378
|
vi.spyOn(ai, "completeSimple").mockResolvedValue(
|
|
373
379
|
assistant({ toolCall: { name: "respond", arguments: { ok: true, n: 3 } } }),
|
|
374
380
|
);
|
|
375
381
|
|
|
376
382
|
const result = await executeJs(
|
|
377
|
-
'const r = await
|
|
383
|
+
'const r = await completion("hi", { schema: { type: "object" } }); return JSON.stringify(r);',
|
|
378
384
|
{ cwd: tempDir.path(), sessionId, session: makeSession(), sessionFile },
|
|
379
385
|
);
|
|
380
386
|
|
|
@@ -382,10 +388,10 @@ describe("llm() through eval runtimes", () => {
|
|
|
382
388
|
expect(JSON.parse(result.output.trim())).toEqual({ ok: true, n: 3 });
|
|
383
389
|
});
|
|
384
390
|
|
|
385
|
-
it("exposes
|
|
386
|
-
const tempDir = TempDir.createSync("@omp-eval-
|
|
391
|
+
it("exposes completion() in the Python runtime", async () => {
|
|
392
|
+
const tempDir = TempDir.createSync("@omp-eval-completion-py-");
|
|
387
393
|
try {
|
|
388
|
-
const result = await
|
|
394
|
+
const result = await runPythonCompletionInSubprocess({ structured: false, tempDir });
|
|
389
395
|
expect(result.exitCode).toBe(0);
|
|
390
396
|
expect(result.output.trim()).toBe("hello from python");
|
|
391
397
|
} finally {
|
|
@@ -393,10 +399,10 @@ describe("llm() through eval runtimes", () => {
|
|
|
393
399
|
}
|
|
394
400
|
});
|
|
395
401
|
|
|
396
|
-
it("parses structured
|
|
397
|
-
const tempDir = TempDir.createSync("@omp-eval-
|
|
402
|
+
it("parses structured completion() output in the Python runtime", async () => {
|
|
403
|
+
const tempDir = TempDir.createSync("@omp-eval-completion-py-struct-");
|
|
398
404
|
try {
|
|
399
|
-
const result = await
|
|
405
|
+
const result = await runPythonCompletionInSubprocess({ structured: true, tempDir });
|
|
400
406
|
expect(result.exitCode).toBe(0);
|
|
401
407
|
expect(JSON.parse(result.output.trim())).toEqual({ ok: true });
|
|
402
408
|
} finally {
|