@oh-my-pi/pi-coding-agent 15.5.13 → 15.5.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +31 -0
  2. package/dist/types/config/model-registry.d.ts +1 -1
  3. package/dist/types/config/models-config-schema.d.ts +2 -0
  4. package/dist/types/config/settings-schema.d.ts +1 -10
  5. package/dist/types/eval/__tests__/llm-bridge.test.d.ts +1 -0
  6. package/dist/types/eval/llm-bridge.d.ts +25 -0
  7. package/dist/types/export/html/template.generated.d.ts +1 -1
  8. package/dist/types/extensibility/plugins/legacy-pi-compat.d.ts +15 -0
  9. package/dist/types/modes/theme/theme.d.ts +2 -1
  10. package/dist/types/session/agent-session.d.ts +2 -0
  11. package/dist/types/tools/index.d.ts +0 -1
  12. package/package.json +8 -8
  13. package/src/config/model-registry.ts +89 -5
  14. package/src/config/models-config-schema.ts +1 -1
  15. package/src/config/settings-schema.ts +1 -10
  16. package/src/eval/__tests__/llm-bridge.test.ts +297 -0
  17. package/src/eval/js/shared/prelude.txt +8 -0
  18. package/src/eval/js/tool-bridge.ts +4 -0
  19. package/src/eval/llm-bridge.ts +181 -0
  20. package/src/eval/py/prelude.py +52 -31
  21. package/src/export/html/template.generated.ts +1 -1
  22. package/src/export/html/template.js +0 -13
  23. package/src/extensibility/plugins/legacy-pi-compat.ts +60 -23
  24. package/src/internal-urls/docs-index.generated.ts +3 -4
  25. package/src/main.ts +4 -0
  26. package/src/modes/components/model-selector.ts +119 -22
  27. package/src/modes/components/status-line/presets.ts +1 -0
  28. package/src/modes/components/status-line/segments.ts +23 -0
  29. package/src/modes/interactive-mode.ts +22 -87
  30. package/src/modes/theme/theme.ts +7 -0
  31. package/src/prompts/tools/eval.md +2 -0
  32. package/src/session/agent-session.ts +19 -0
  33. package/src/session/session-manager.ts +47 -0
  34. package/src/tools/eval.ts +24 -48
  35. package/src/tools/index.ts +0 -4
  36. package/src/tools/renderers.ts +0 -2
  37. package/dist/types/tools/calculator.d.ts +0 -77
  38. package/src/prompts/tools/calculator.md +0 -10
  39. package/src/tools/calculator.ts +0 -541
package/dist/types/extensibility/plugins/legacy-pi-compat.d.ts CHANGED
@@ -1,3 +1,18 @@
1
+ import * as path from "node:path";
2
+ /**
3
+ * Compute the bunfs package root from the compiled binary's `import.meta.dir`
4
+ * (or any stand-in supplied by tests). Bun 1.3 reports the bunfs mount root
5
+ * (`/$bunfs/root` or `<drive>:\~BUN\root`) for imported modules as well as the
6
+ * entrypoint, so the normal path is `<root>/packages`.
7
+ *
8
+ * The suffix branch preserves correctness if a future Bun release switches to
9
+ * module-specific `import.meta.dir` values inside compiled binaries, matching
10
+ * the source layout:
11
+ * `<bunfs>/packages/coding-agent/src/extensibility/plugins`.
12
+ *
13
+ * Exported for tests; production callers use `BUNFS_PACKAGE_ROOT` below.
14
+ */
15
+ export declare function __computeBunfsPackageRoot(metaDir: string, pathImpl?: typeof path): string;
1
16
  export declare function loadLegacyPiModule(resolvedPath: string): Promise<unknown>;
2
17
  export declare function installLegacyPiSpecifierShim(): void;
3
18
  /** Test seam: clears the memoized canonical specifier resolutions. */
package/dist/types/modes/theme/theme.d.ts CHANGED
@@ -6,7 +6,7 @@ export type SymbolPreset = "unicode" | "nerd" | "ascii";
6
6
  /**
7
7
  * All available symbol keys organized by category.
8
8
  */
9
- export type SymbolKey = "status.success" | "status.error" | "status.warning" | "status.info" | "status.pending" | "status.disabled" | "status.enabled" | "status.running" | "status.shadowed" | "status.aborted" | "nav.cursor" | "nav.selected" | "nav.expand" | "nav.collapse" | "nav.back" | "tree.branch" | "tree.last" | "tree.vertical" | "tree.horizontal" | "tree.hook" | "boxRound.topLeft" | "boxRound.topRight" | "boxRound.bottomLeft" | "boxRound.bottomRight" | "boxRound.horizontal" | "boxRound.vertical" | "boxSharp.topLeft" | "boxSharp.topRight" | "boxSharp.bottomLeft" | "boxSharp.bottomRight" | "boxSharp.horizontal" | "boxSharp.vertical" | "boxSharp.cross" | "boxSharp.teeDown" | "boxSharp.teeUp" | "boxSharp.teeRight" | "boxSharp.teeLeft" | "sep.powerline" | "sep.powerlineThin" | "sep.powerlineLeft" | "sep.powerlineRight" | "sep.powerlineThinLeft" | "sep.powerlineThinRight" | "sep.block" | "sep.space" | "sep.asciiLeft" | "sep.asciiRight" | "sep.dot" | "sep.slash" | "sep.pipe" | "icon.model" | "icon.plan" | "icon.goal" | "icon.pause" | "icon.loop" | "icon.folder" | "icon.scratchFolder" | "icon.file" | "icon.git" | "icon.branch" | "icon.pr" | "icon.tokens" | "icon.context" | "icon.cost" | "icon.time" | "icon.pi" | "icon.agents" | "icon.cache" | "icon.input" | "icon.output" | "icon.host" | "icon.session" | "icon.package" | "icon.warning" | "icon.rewind" | "icon.auto" | "icon.fast" | "icon.extensionSkill" | "icon.extensionTool" | "icon.extensionSlashCommand" | "icon.extensionMcp" | "icon.extensionRule" | "icon.extensionHook" | "icon.extensionPrompt" | "icon.extensionContextFile" | "icon.extensionInstruction" | "icon.mic" | "thinking.minimal" | "thinking.low" | "thinking.medium" | "thinking.high" | "thinking.xhigh" | "checkbox.checked" | "checkbox.unchecked" | "format.bullet" | "format.dash" | "format.bracketLeft" | "format.bracketRight" | "md.quoteBorder" | "md.hrChar" | "md.bullet" | "lang.default" | "lang.typescript" | "lang.javascript" | "lang.python" | "lang.rust" | 
"lang.go" | "lang.java" | "lang.c" | "lang.cpp" | "lang.csharp" | "lang.ruby" | "lang.php" | "lang.swift" | "lang.kotlin" | "lang.shell" | "lang.html" | "lang.css" | "lang.json" | "lang.yaml" | "lang.markdown" | "lang.sql" | "lang.docker" | "lang.lua" | "lang.text" | "lang.env" | "lang.toml" | "lang.xml" | "lang.ini" | "lang.conf" | "lang.log" | "lang.csv" | "lang.tsv" | "lang.image" | "lang.pdf" | "lang.archive" | "lang.binary" | "tab.appearance" | "tab.model" | "tab.interaction" | "tab.context" | "tab.editing" | "tab.tools" | "tab.memory" | "tab.tasks" | "tab.providers";
9
+ export type SymbolKey = "status.success" | "status.error" | "status.warning" | "status.info" | "status.pending" | "status.disabled" | "status.enabled" | "status.running" | "status.shadowed" | "status.aborted" | "nav.cursor" | "nav.selected" | "nav.expand" | "nav.collapse" | "nav.back" | "tree.branch" | "tree.last" | "tree.vertical" | "tree.horizontal" | "tree.hook" | "boxRound.topLeft" | "boxRound.topRight" | "boxRound.bottomLeft" | "boxRound.bottomRight" | "boxRound.horizontal" | "boxRound.vertical" | "boxSharp.topLeft" | "boxSharp.topRight" | "boxSharp.bottomLeft" | "boxSharp.bottomRight" | "boxSharp.horizontal" | "boxSharp.vertical" | "boxSharp.cross" | "boxSharp.teeDown" | "boxSharp.teeUp" | "boxSharp.teeRight" | "boxSharp.teeLeft" | "sep.powerline" | "sep.powerlineThin" | "sep.powerlineLeft" | "sep.powerlineRight" | "sep.powerlineThinLeft" | "sep.powerlineThinRight" | "sep.block" | "sep.space" | "sep.asciiLeft" | "sep.asciiRight" | "sep.dot" | "sep.slash" | "sep.pipe" | "icon.model" | "icon.plan" | "icon.goal" | "icon.pause" | "icon.loop" | "icon.folder" | "icon.scratchFolder" | "icon.file" | "icon.git" | "icon.branch" | "icon.pr" | "icon.tokens" | "icon.context" | "icon.cost" | "icon.time" | "icon.pi" | "icon.agents" | "icon.cache" | "icon.input" | "icon.output" | "icon.host" | "icon.session" | "icon.package" | "icon.warning" | "icon.rewind" | "icon.auto" | "icon.fast" | "icon.extensionSkill" | "icon.extensionTool" | "icon.extensionSlashCommand" | "icon.extensionMcp" | "icon.extensionRule" | "icon.extensionHook" | "icon.extensionPrompt" | "icon.extensionContextFile" | "icon.extensionInstruction" | "icon.mic" | "thinking.minimal" | "thinking.low" | "thinking.medium" | "thinking.high" | "thinking.xhigh" | "checkbox.checked" | "checkbox.unchecked" | "format.bullet" | "format.dash" | "format.bracketLeft" | "format.bracketRight" | "md.quoteBorder" | "md.hrChar" | "md.bullet" | "md.colorSwatch" | "lang.default" | "lang.typescript" | "lang.javascript" | 
"lang.python" | "lang.rust" | "lang.go" | "lang.java" | "lang.c" | "lang.cpp" | "lang.csharp" | "lang.ruby" | "lang.php" | "lang.swift" | "lang.kotlin" | "lang.shell" | "lang.html" | "lang.css" | "lang.json" | "lang.yaml" | "lang.markdown" | "lang.sql" | "lang.docker" | "lang.lua" | "lang.text" | "lang.env" | "lang.toml" | "lang.xml" | "lang.ini" | "lang.conf" | "lang.log" | "lang.csv" | "lang.tsv" | "lang.image" | "lang.pdf" | "lang.archive" | "lang.binary" | "tab.appearance" | "tab.model" | "tab.interaction" | "tab.context" | "tab.editing" | "tab.tools" | "tab.memory" | "tab.tasks" | "tab.providers";
10
10
  export type SpinnerType = "status" | "activity";
11
11
  export type ThemeColor = "accent" | "border" | "borderAccent" | "borderMuted" | "success" | "error" | "warning" | "muted" | "dim" | "text" | "thinkingText" | "userMessageText" | "customMessageText" | "customMessageLabel" | "toolTitle" | "toolOutput" | "mdHeading" | "mdLink" | "mdLinkUrl" | "mdCode" | "mdCodeBlock" | "mdCodeBlockBorder" | "mdQuote" | "mdQuoteBorder" | "mdHr" | "mdListBullet" | "toolDiffAdded" | "toolDiffRemoved" | "toolDiffContext" | "syntaxComment" | "syntaxKeyword" | "syntaxFunction" | "syntaxVariable" | "syntaxString" | "syntaxNumber" | "syntaxType" | "syntaxOperator" | "syntaxPunctuation" | "thinkingOff" | "thinkingMinimal" | "thinkingLow" | "thinkingMedium" | "thinkingHigh" | "thinkingXhigh" | "bashMode" | "pythonMode" | "statusLineSep" | "statusLineModel" | "statusLinePath" | "statusLineGitClean" | "statusLineGitDirty" | "statusLineContext" | "statusLineSpend" | "statusLineStaged" | "statusLineDirty" | "statusLineUntracked" | "statusLineOutput" | "statusLineCost" | "statusLineSubagents";
12
12
  /** Check if a string is a valid ThemeColor value */
@@ -165,6 +165,7 @@ export declare class Theme {
165
165
  quoteBorder: string;
166
166
  hrChar: string;
167
167
  bullet: string;
168
+ colorSwatch: string;
168
169
  };
169
170
  /**
170
171
  * Default spinner frames (status spinner).
package/dist/types/session/agent-session.d.ts CHANGED
@@ -262,6 +262,8 @@ export interface SessionStats {
262
262
  premiumRequests: number;
263
263
  cost: number;
264
264
  }
265
+ export declare const ANTHROPIC_TOOL_CALL_BATCH_CAP = 4;
266
+ export declare function resolveToolCallBatchCapForModel(model: Model | undefined): number | undefined;
265
267
  export declare class AgentSession {
266
268
  #private;
267
269
  readonly agent: Agent;
package/dist/types/tools/index.d.ts CHANGED
@@ -31,7 +31,6 @@ export * from "./ast-edit";
31
31
  export * from "./ast-grep";
32
32
  export * from "./bash";
33
33
  export * from "./browser";
34
- export * from "./calculator";
35
34
  export * from "./checkpoint";
36
35
  export * from "./debug";
37
36
  export * from "./eval";
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-coding-agent",
4
- "version": "15.5.13",
4
+ "version": "15.5.15",
5
5
  "description": "Coding agent CLI with read, bash, edit, write tools and session management",
6
6
  "homepage": "https://omp.sh",
7
7
  "author": "Can Boluk",
@@ -47,13 +47,13 @@
47
47
  "@agentclientprotocol/sdk": "0.21.0",
48
48
  "@babel/parser": "^7.29.3",
49
49
  "@mozilla/readability": "^0.6.0",
50
- "@oh-my-pi/hashline": "15.5.13",
51
- "@oh-my-pi/omp-stats": "15.5.13",
52
- "@oh-my-pi/pi-agent-core": "15.5.13",
53
- "@oh-my-pi/pi-ai": "15.5.13",
54
- "@oh-my-pi/pi-natives": "15.5.13",
55
- "@oh-my-pi/pi-tui": "15.5.13",
56
- "@oh-my-pi/pi-utils": "15.5.13",
50
+ "@oh-my-pi/hashline": "15.5.15",
51
+ "@oh-my-pi/omp-stats": "15.5.15",
52
+ "@oh-my-pi/pi-agent-core": "15.5.15",
53
+ "@oh-my-pi/pi-ai": "15.5.15",
54
+ "@oh-my-pi/pi-natives": "15.5.15",
55
+ "@oh-my-pi/pi-tui": "15.5.15",
56
+ "@oh-my-pi/pi-utils": "15.5.15",
57
57
  "@puppeteer/browsers": "^2.13.0",
58
58
  "@types/turndown": "5.0.6",
59
59
  "@xterm/headless": "^6.0.0",
package/src/config/model-registry.ts CHANGED
@@ -192,7 +192,7 @@ function validateProviderConfiguration(
192
192
  }
193
193
  }
194
194
 
195
- if (mode === "models-config" && config.discovery && !config.api) {
195
+ if (mode === "models-config" && config.discovery && !config.api && config.discovery.type !== "proxy") {
196
196
  throw new Error(`Provider ${providerName}: "api" is required when discovery is enabled at provider level.`);
197
197
  }
198
198
 
@@ -1209,13 +1209,17 @@ export class ModelRegistry {
1209
1209
  keylessProviders.add(providerName);
1210
1210
  }
1211
1211
 
1212
- if (providerConfig.discovery && providerConfig.api) {
1212
+ if (providerConfig.discovery && (providerConfig.api || providerConfig.discovery.type === "proxy")) {
1213
+ const disableStrictCompat = providerConfig.disableStrictTools ? { disableStrictTools: true } : undefined;
1213
1214
  discoverableProviders.push({
1214
1215
  provider: providerName,
1215
- api: providerConfig.api as Api,
1216
+ // Proxy discovery derives per-model api from /v1/models's
1217
+ // supported_endpoint_types; the provider-level api is only a
1218
+ // fallback for entries that don't advertise one.
1219
+ api: (providerConfig.api ?? "openai-completions") as Api,
1216
1220
  baseUrl: providerConfig.baseUrl,
1217
1221
  headers: providerConfig.headers,
1218
- compat: providerConfig.compat,
1222
+ compat: mergeCompat(providerConfig.compat, disableStrictCompat),
1219
1223
  discovery: providerConfig.discovery,
1220
1224
  optional: false,
1221
1225
  });
@@ -1385,6 +1389,8 @@ export class ModelRegistry {
1385
1389
  case "lm-studio":
1386
1390
  case "openai-models-list":
1387
1391
  return this.#discoverOpenAIModelsList(providerConfig);
1392
+ case "proxy":
1393
+ return this.#discoverProxyModels(providerConfig);
1388
1394
  }
1389
1395
  }
1390
1396
 
@@ -1711,7 +1717,7 @@ export class ModelRegistry {
1711
1717
 
1712
1718
  const response = await fetch(modelsUrl, {
1713
1719
  headers,
1714
- signal: AbortSignal.timeout(250),
1720
+ signal: AbortSignal.timeout(10_000),
1715
1721
  });
1716
1722
  if (!response.ok) {
1717
1723
  throw new Error(`HTTP ${response.status} from ${modelsUrl}`);
@@ -1746,6 +1752,84 @@ export class ModelRegistry {
1746
1752
  return this.#applyProviderModelOverrides(providerConfig.provider, discovered);
1747
1753
  }
1748
1754
 
1755
+ /**
1756
+ * Discover models from an Anthropic+OpenAI-compatible reseller proxy that
1757
+ * exposes both `/v1/messages` and `/v1/chat/completions`, advertising each
1758
+ * model's wire capabilities through `supported_endpoint_types` on
1759
+ * `GET /v1/models` (new-api / one-api-style proxies).
1760
+ *
1761
+ * Routing per model:
1762
+ * supported_endpoint_types: ["anthropic", ...] -> api: "anthropic-messages"
1763
+ * supported_endpoint_types: ["openai"] -> api: "openai-completions"
1764
+ * missing / neither -> provider-level api fallback
1765
+ *
1766
+ * Anthropic models share the same baseUrl; the Anthropic SDK strips a
1767
+ * trailing `/v1` itself before appending `/v1/messages`, so the discovery
1768
+ * URL (which ends in `/v1`) round-trips correctly.
1769
+ */
1770
+ async #discoverProxyModels(providerConfig: DiscoveryProviderConfig): Promise<Model<Api>[]> {
1771
+ const baseUrl = this.#normalizeOpenAIModelsListBaseUrl(providerConfig.baseUrl);
1772
+ const modelsUrl = `${baseUrl}/models`;
1773
+
1774
+ const headers: Record<string, string> = { ...(providerConfig.headers ?? {}) };
1775
+ const apiKey = await this.authStorage.getApiKey(providerConfig.provider);
1776
+ if (apiKey && apiKey !== DEFAULT_LOCAL_TOKEN && apiKey !== kNoAuth) {
1777
+ headers.Authorization = `Bearer ${apiKey}`;
1778
+ }
1779
+
1780
+ const response = await fetch(modelsUrl, {
1781
+ headers,
1782
+ signal: AbortSignal.timeout(10_000),
1783
+ });
1784
+ if (!response.ok) {
1785
+ throw new Error(`HTTP ${response.status} from ${modelsUrl}`);
1786
+ }
1787
+ const payload = (await response.json()) as {
1788
+ data?: Array<{ id?: string; supported_endpoint_types?: string[] }>;
1789
+ };
1790
+ const items = payload.data ?? [];
1791
+ const discovered: Model<Api>[] = [];
1792
+ for (const item of items) {
1793
+ const id = item.id;
1794
+ if (!id) continue;
1795
+ const endpoints = item.supported_endpoint_types ?? [];
1796
+ const api: Api | undefined = endpoints.includes("anthropic")
1797
+ ? "anthropic-messages"
1798
+ : endpoints.includes("openai")
1799
+ ? "openai-completions"
1800
+ : providerConfig.api;
1801
+ if (!api) continue;
1802
+ const isAnthropic = api === "anthropic-messages";
1803
+ discovered.push(
1804
+ enrichModelThinking({
1805
+ id,
1806
+ name: id,
1807
+ api,
1808
+ provider: providerConfig.provider,
1809
+ baseUrl,
1810
+ reasoning: false,
1811
+ input: ["text"],
1812
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
1813
+ contextWindow: 128000,
1814
+ maxTokens: 8192,
1815
+ headers,
1816
+ // OpenAI-compat fields are no-ops on anthropic models; the
1817
+ // Anthropic SDK ignores them. Provider-level disableStrictTools
1818
+ // flows in via #applyProviderCompat for the third-party-Anthropic
1819
+ // path.
1820
+ compat: isAnthropic
1821
+ ? undefined
1822
+ : {
1823
+ supportsStore: false,
1824
+ supportsDeveloperRole: false,
1825
+ supportsReasoningEffort: false,
1826
+ },
1827
+ }),
1828
+ );
1829
+ }
1830
+ return this.#applyProviderModelOverrides(providerConfig.provider, discovered);
1831
+ }
1832
+
1749
1833
  #normalizeLlamaCppBaseUrl(baseUrl?: string): string {
1750
1834
  const defaultBaseUrl = "http://127.0.0.1:8080";
1751
1835
  const raw = baseUrl || defaultBaseUrl;
package/src/config/models-config-schema.ts CHANGED
@@ -121,7 +121,7 @@ export const ModelOverrideSchema = z.object({
121
121
  export type ModelOverride = z.infer<typeof ModelOverrideSchema>;
122
122
 
123
123
  export const ProviderDiscoverySchema = z.object({
124
- type: z.enum(["ollama", "llama.cpp", "lm-studio", "openai-models-list"]),
124
+ type: z.enum(["ollama", "llama.cpp", "lm-studio", "openai-models-list", "proxy"]),
125
125
  });
126
126
 
127
127
  export const ProviderAuthSchema = z.enum(["apiKey", "none", "oauth"]);
package/src/config/settings-schema.ts CHANGED
@@ -81,6 +81,7 @@ export type StatusLineSegmentId =
81
81
  | "hostname"
82
82
  | "cache_read"
83
83
  | "cache_write"
84
+ | "cache_hit"
84
85
  | "session_name"
85
86
  | "usage";
86
87
 
@@ -2016,16 +2017,6 @@ export const SETTINGS_SCHEMA = {
2016
2017
  },
2017
2018
  },
2018
2019
 
2019
- "calc.enabled": {
2020
- type: "boolean",
2021
- default: false,
2022
- ui: {
2023
- tab: "tools",
2024
- label: "Calculator",
2025
- description: "Enable the calculator tool for basic calculations",
2026
- },
2027
- },
2028
-
2029
2020
  "tts.enabled": {
2030
2021
  type: "boolean",
2031
2022
  default: false,
package/src/eval/__tests__/llm-bridge.test.ts ADDED
@@ -0,0 +1,297 @@
1
+ import { afterAll, afterEach, describe, expect, it, vi } from "bun:test";
2
+ import * as path from "node:path";
3
+ import type { Api, AssistantMessage, Model } from "@oh-my-pi/pi-ai";
4
+ import * as ai from "@oh-my-pi/pi-ai";
5
+ import { Effort } from "@oh-my-pi/pi-ai";
6
+ import { TempDir } from "@oh-my-pi/pi-utils";
7
+ import type { ModelRegistry } from "../../config/model-registry";
8
+ import { Settings } from "../../config/settings";
9
+ import type { ToolSession } from "../../tools";
10
+ import { ToolError } from "../../tools/tool-errors";
11
+ import { disposeAllVmContexts } from "../js/context-manager";
12
+ import { executeJs } from "../js/executor";
13
+ import { runEvalLlm } from "../llm-bridge";
14
+ import { disposeAllKernelSessions, executePython } from "../py/executor";
15
+
16
+ function makeModel(provider: string, id: string, extra: Partial<Model<Api>> = {}): Model<Api> {
17
+ return {
18
+ id,
19
+ name: id,
20
+ api: "openai-responses",
21
+ provider,
22
+ baseUrl: "https://example.test/v1",
23
+ reasoning: false,
24
+ input: ["text"],
25
+ cost: { input: 1, output: 1, cacheRead: 0, cacheWrite: 1 },
26
+ contextWindow: 128000,
27
+ maxTokens: 4096,
28
+ ...extra,
29
+ } as Model<Api>;
30
+ }
31
+
32
+ const SMOL = makeModel("p", "smol");
33
+ const DEFAULT = makeModel("p", "default");
34
+ const SLOW = makeModel("p", "slow");
35
+ const REASONING_SLOW = makeModel("p", "slow", {
36
+ api: "anthropic-messages",
37
+ reasoning: true,
38
+ thinking: { minLevel: Effort.Low, maxLevel: Effort.High, mode: "anthropic-adaptive" },
39
+ });
40
+
41
+ interface SessionOptions {
42
+ available?: Model<Api>[];
43
+ apiKey?: string | null;
44
+ activeModel?: string;
45
+ roles?: Partial<Record<"smol" | "default" | "slow", string>>;
46
+ }
47
+
48
+ function makeSession(opts: SessionOptions = {}): ToolSession {
49
+ const settings = Settings.isolated({ "async.enabled": false, "task.isolation.mode": "none" });
50
+ const roles = opts.roles ?? { smol: "p/smol", slow: "p/slow" };
51
+ for (const role in roles) {
52
+ const value = roles[role as keyof typeof roles];
53
+ if (value) settings.setModelRole(role, value);
54
+ }
55
+ const modelRegistry = {
56
+ getAvailable: () => opts.available ?? [SMOL, DEFAULT, SLOW],
57
+ getApiKey: async () => (opts.apiKey === undefined ? "test-key" : opts.apiKey),
58
+ } as unknown as ModelRegistry;
59
+ return {
60
+ settings,
61
+ modelRegistry,
62
+ getActiveModelString: () => opts.activeModel ?? "p/default",
63
+ } as unknown as ToolSession;
64
+ }
65
+
66
+ function assistant(opts: {
67
+ text?: string;
68
+ toolCall?: { name: string; arguments: Record<string, unknown> };
69
+ stopReason?: AssistantMessage["stopReason"];
70
+ errorMessage?: string;
71
+ }): AssistantMessage {
72
+ const content: AssistantMessage["content"] = [];
73
+ if (opts.text) content.push({ type: "text", text: opts.text });
74
+ if (opts.toolCall) {
75
+ content.push({ type: "toolCall", id: "tc-1", name: opts.toolCall.name, arguments: opts.toolCall.arguments });
76
+ }
77
+ return {
78
+ role: "assistant",
79
+ content,
80
+ api: "openai-responses",
81
+ provider: "p",
82
+ model: "default",
83
+ usage: {
84
+ input: 0,
85
+ output: 0,
86
+ cacheRead: 0,
87
+ cacheWrite: 0,
88
+ totalTokens: 0,
89
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
90
+ },
91
+ stopReason: opts.stopReason ?? "stop",
92
+ errorMessage: opts.errorMessage,
93
+ timestamp: Date.now(),
94
+ };
95
+ }
96
+
97
+ describe("runEvalLlm", () => {
98
+ afterEach(() => {
99
+ vi.restoreAllMocks();
100
+ });
101
+
102
+ it("resolves each tier to its expected model", async () => {
103
+ const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
104
+ const session = makeSession();
105
+
106
+ await runEvalLlm({ prompt: "q", model: "smol" }, { session });
107
+ await runEvalLlm({ prompt: "q", model: "default" }, { session });
108
+ await runEvalLlm({ prompt: "q", model: "slow" }, { session });
109
+
110
+ const resolved = spy.mock.calls.map(call => {
111
+ const model = call[0] as Model<Api>;
112
+ return `${model.provider}/${model.id}`;
113
+ });
114
+ expect(resolved).toEqual(["p/smol", "p/default", "p/slow"]);
115
+ });
116
+
117
+ it("prefers the session active model for the default tier, falling back to pi/default", async () => {
118
+ const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
119
+ const session = makeSession({ available: [SMOL, DEFAULT, SLOW], activeModel: "p/slow" });
120
+
121
+ await runEvalLlm({ prompt: "q", model: "default" }, { session });
122
+
123
+ const model = spy.mock.calls[0]?.[0] as Model<Api>;
124
+ expect(`${model.provider}/${model.id}`).toBe("p/slow");
125
+ });
126
+
127
+ it("returns the completion text in plain mode", async () => {
128
+ vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "the answer" }));
129
+ const result = await runEvalLlm({ prompt: "q", model: "smol" }, { session: makeSession() });
130
+ expect(result.text).toBe("the answer");
131
+ expect(result.details).toEqual({ model: "p/smol", tier: "smol", structured: false });
132
+ });
133
+
134
+ it("forces a respond tool call and returns its arguments in structured mode", async () => {
135
+ const spy = vi
136
+ .spyOn(ai, "completeSimple")
137
+ .mockResolvedValue(assistant({ toolCall: { name: "respond", arguments: { answer: 42 } } }));
138
+ const result = await runEvalLlm(
139
+ { prompt: "q", model: "smol", schema: { type: "object", properties: { answer: { type: "number" } } } },
140
+ { session: makeSession() },
141
+ );
142
+
143
+ expect(JSON.parse(result.text)).toEqual({ answer: 42 });
144
+ expect(result.details.structured).toBe(true);
145
+
146
+ const ctx = spy.mock.calls[0]?.[1] as { tools?: Array<{ name: string }> };
147
+ const opts = spy.mock.calls[0]?.[2] as { toolChoice?: unknown };
148
+ expect(ctx.tools?.[0]?.name).toBe("respond");
149
+ expect(opts.toolChoice).toEqual({ type: "tool", name: "respond" });
150
+ });
151
+
152
+ it("falls back to JSON embedded in text when the model skips the respond tool", async () => {
153
+ vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: 'here: {"answer": 7}' }));
154
+ const result = await runEvalLlm(
155
+ { prompt: "q", model: "smol", schema: { type: "object" } },
156
+ { session: makeSession() },
157
+ );
158
+ expect(JSON.parse(result.text)).toEqual({ answer: 7 });
159
+ });
160
+
161
+ it("requests reasoning only for the slow tier on a reasoning-capable model", async () => {
162
+ const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
163
+ const session = makeSession({ available: [SMOL, DEFAULT, REASONING_SLOW] });
164
+
165
+ await runEvalLlm({ prompt: "q", model: "smol" }, { session });
166
+ await runEvalLlm({ prompt: "q", model: "slow" }, { session });
167
+
168
+ const smolOpts = spy.mock.calls[0]?.[2] as { reasoning?: unknown };
169
+ const slowOpts = spy.mock.calls[1]?.[2] as { reasoning?: unknown };
170
+ expect(smolOpts.reasoning).toBeUndefined();
171
+ expect(slowOpts.reasoning).toBe(Effort.High);
172
+ });
173
+
174
+ it("does not request reasoning for the slow tier on a non-reasoning model", async () => {
175
+ const spy = vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "ok" }));
176
+ // SLOW is reasoning:false — must not trip requireSupportedEffort downstream.
177
+ const result = await runEvalLlm({ prompt: "q", model: "slow" }, { session: makeSession() });
178
+ expect(result.text).toBe("ok");
179
+ const opts = spy.mock.calls[0]?.[2] as { reasoning?: unknown };
180
+ expect(opts.reasoning).toBeUndefined();
181
+ });
182
+
183
+ it("throws ToolError on invalid arguments", async () => {
184
+ await expect(runEvalLlm({ prompt: "" }, { session: makeSession() })).rejects.toBeInstanceOf(ToolError);
185
+ await expect(runEvalLlm({ prompt: "q", model: "huge" }, { session: makeSession() })).rejects.toBeInstanceOf(
186
+ ToolError,
187
+ );
188
+ });
189
+
190
+ it("throws ToolError when no model resolves for the tier", async () => {
191
+ const session = makeSession({ available: [DEFAULT], roles: { smol: "missing/model" } });
192
+ await expect(runEvalLlm({ prompt: "q", model: "smol" }, { session })).rejects.toBeInstanceOf(ToolError);
193
+ });
194
+
195
+ it("throws ToolError when the resolved model has no API key", async () => {
196
+ const session = makeSession({ apiKey: null });
197
+ await expect(runEvalLlm({ prompt: "q", model: "smol" }, { session })).rejects.toBeInstanceOf(ToolError);
198
+ });
199
+
200
+ it("maps error and aborted stop reasons to ToolError", async () => {
201
+ vi.spyOn(ai, "completeSimple").mockResolvedValueOnce(assistant({ stopReason: "error", errorMessage: "boom" }));
202
+ await expect(runEvalLlm({ prompt: "q", model: "smol" }, { session: makeSession() })).rejects.toThrow("boom");
203
+
204
+ vi.spyOn(ai, "completeSimple").mockResolvedValueOnce(assistant({ stopReason: "aborted" }));
205
+ await expect(runEvalLlm({ prompt: "q", model: "smol" }, { session: makeSession() })).rejects.toBeInstanceOf(
206
+ ToolError,
207
+ );
208
+ });
209
+
210
+ it("throws ToolError when plain mode produces no text", async () => {
211
+ vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "" }));
212
+ await expect(runEvalLlm({ prompt: "q", model: "smol" }, { session: makeSession() })).rejects.toBeInstanceOf(
213
+ ToolError,
214
+ );
215
+ });
216
+ });
217
+
218
+ describe("llm() through eval runtimes", () => {
219
+ afterEach(() => {
220
+ vi.restoreAllMocks();
221
+ });
222
+
223
+ afterAll(async () => {
224
+ await disposeAllVmContexts();
225
+ await disposeAllKernelSessions();
226
+ });
227
+
228
+ it("exposes llm() in the JavaScript runtime", async () => {
229
+ using tempDir = TempDir.createSync("@omp-eval-llm-js-");
230
+ const sessionFile = path.join(tempDir.path(), "session.jsonl");
231
+ const sessionId = `js-llm:${crypto.randomUUID()}`;
232
+ vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "hello from smol" }));
233
+
234
+ const result = await executeJs('return await llm("hi", { model: "smol" });', {
235
+ cwd: tempDir.path(),
236
+ sessionId,
237
+ session: makeSession(),
238
+ sessionFile,
239
+ });
240
+
241
+ expect(result.exitCode).toBe(0);
242
+ expect(result.output.trim()).toBe("hello from smol");
243
+ });
244
+
245
+ it("parses structured llm() output in the JavaScript runtime", async () => {
246
+ using tempDir = TempDir.createSync("@omp-eval-llm-js-struct-");
247
+ const sessionFile = path.join(tempDir.path(), "session.jsonl");
248
+ const sessionId = `js-llm-struct:${crypto.randomUUID()}`;
249
+ vi.spyOn(ai, "completeSimple").mockResolvedValue(
250
+ assistant({ toolCall: { name: "respond", arguments: { ok: true, n: 3 } } }),
251
+ );
252
+
253
+ const result = await executeJs(
254
+ 'const r = await llm("hi", { schema: { type: "object" } }); return JSON.stringify(r);',
255
+ { cwd: tempDir.path(), sessionId, session: makeSession(), sessionFile },
256
+ );
257
+
258
+ expect(result.exitCode).toBe(0);
259
+ expect(JSON.parse(result.output.trim())).toEqual({ ok: true, n: 3 });
260
+ });
261
+
262
+ it("exposes llm() in the Python runtime", async () => {
263
+ using tempDir = TempDir.createSync("@omp-eval-llm-py-");
264
+ const sessionFile = path.join(tempDir.path(), "session.jsonl");
265
+ const sessionId = `py-llm:${crypto.randomUUID()}`;
266
+ vi.spyOn(ai, "completeSimple").mockResolvedValue(assistant({ text: "hello from python" }));
267
+
268
+ const result = await executePython('print(llm("hi", model="smol"))', {
269
+ cwd: tempDir.path(),
270
+ sessionId,
271
+ sessionFile,
272
+ toolSession: makeSession(),
273
+ });
274
+
275
+ expect(result.exitCode).toBe(0);
276
+ expect(result.output.trim()).toBe("hello from python");
277
+ });
278
+
279
+ it("parses structured llm() output in the Python runtime", async () => {
280
+ using tempDir = TempDir.createSync("@omp-eval-llm-py-struct-");
281
+ const sessionFile = path.join(tempDir.path(), "session.jsonl");
282
+ const sessionId = `py-llm-struct:${crypto.randomUUID()}`;
283
+ vi.spyOn(ai, "completeSimple").mockResolvedValue(
284
+ assistant({ toolCall: { name: "respond", arguments: { ok: true } } }),
285
+ );
286
+
287
+ const result = await executePython('import json\nprint(json.dumps(llm("hi", schema={"type": "object"})))', {
288
+ cwd: tempDir.path(),
289
+ sessionId,
290
+ sessionFile,
291
+ toolSession: makeSession(),
292
+ });
293
+
294
+ expect(result.exitCode).toBe(0);
295
+ expect(JSON.parse(result.output.trim())).toEqual({ ok: true });
296
+ });
297
+ });
package/src/eval/js/shared/prelude.txt CHANGED
@@ -39,6 +39,13 @@ if (!globalThis.__omp_js_prelude_loaded__) {
39
39
  return values.length === 1 ? values[0] : values;
40
40
  };
41
41
 
42
+ const llm = async (prompt, opts = {}) => {
43
+ const o = toOptions(opts);
44
+ const res = await globalThis.__omp_call_tool__("__llm__", { prompt, ...o });
45
+ const text = res && typeof res === "object" ? res.text : res;
46
+ return o.schema ? JSON.parse(text) : text;
47
+ };
48
+
42
49
  const display = value => {
43
50
  globalThis.__omp_display__(value);
44
51
  };
@@ -61,6 +68,7 @@ if (!globalThis.__omp_js_prelude_loaded__) {
61
68
  globalThis.print = consoleBridge.log;
62
69
  globalThis.display = display;
63
70
  globalThis.tool = tool;
71
+ globalThis.llm = llm;
64
72
  globalThis.output = output;
65
73
  globalThis.read = read;
66
74
  globalThis.write = write;
package/src/eval/js/tool-bridge.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import type { AgentTool, AgentToolResult } from "@oh-my-pi/pi-agent-core";
2
2
  import type { ToolSession } from "../../tools";
3
3
  import { ToolError } from "../../tools/tool-errors";
4
+ import { EVAL_LLM_BRIDGE_NAME, runEvalLlm } from "../llm-bridge";
4
5
  import type { JsStatusEvent } from "./shared/types";
5
6
 
6
7
  export type { JsStatusEvent } from "./shared/types";
@@ -101,6 +102,9 @@ function summarizeToolResult(
101
102
  }
102
103
 
103
104
  export async function callSessionTool(name: string, args: unknown, options: ToolBridgeOptions): Promise<ToolValue> {
105
+ if (name === EVAL_LLM_BRIDGE_NAME) {
106
+ return await runEvalLlm(args, options);
107
+ }
104
108
  const tool = getTool(options.session, name);
105
109
  const normalizedArgs = normalizeArgs(args);
106
110
  const toolCallId = `js-${name}-${crypto.randomUUID()}`;