@oh-my-pi/pi-coding-agent 15.11.6 → 15.11.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -1
- package/dist/cli.js +114 -71
- package/dist/types/cli/bench-cli.d.ts +78 -0
- package/dist/types/commands/bench.d.ts +29 -0
- package/dist/types/config/model-resolver.d.ts +3 -2
- package/dist/types/config/settings-schema.d.ts +72 -0
- package/dist/types/edit/renderer.d.ts +1 -0
- package/dist/types/modes/components/oauth-selector.d.ts +10 -1
- package/dist/types/modes/components/settings-selector.d.ts +8 -1
- package/dist/types/modes/components/snapcompact-shape-preview.d.ts +31 -0
- package/dist/types/modes/components/tool-execution.d.ts +13 -9
- package/dist/types/modes/setup-wizard/scenes/sign-in.d.ts +3 -0
- package/dist/types/modes/setup-wizard/scenes/types.d.ts +10 -1
- package/dist/types/modes/setup-wizard/scenes/web-search.d.ts +3 -0
- package/dist/types/session/snapcompact-inline.d.ts +2 -0
- package/dist/types/tools/bash.d.ts +2 -0
- package/dist/types/tools/eval-render.d.ts +1 -0
- package/dist/types/tools/renderers.d.ts +13 -0
- package/dist/types/tools/ssh.d.ts +1 -0
- package/package.json +11 -11
- package/src/cli/bench-cli.ts +437 -0
- package/src/cli-commands.ts +1 -0
- package/src/commands/bench.ts +42 -0
- package/src/config/model-registry.ts +52 -5
- package/src/config/model-resolver.ts +36 -5
- package/src/config/settings-schema.ts +92 -0
- package/src/edit/renderer.ts +5 -0
- package/src/hindsight/client.ts +26 -1
- package/src/hindsight/state.ts +6 -2
- package/src/internal-urls/docs-index.generated.ts +1 -1
- package/src/mcp/transports/stdio.ts +81 -7
- package/src/modes/components/oauth-selector.ts +67 -7
- package/src/modes/components/settings-selector.ts +27 -0
- package/src/modes/components/snapcompact-shape-preview-doc.md +11 -0
- package/src/modes/components/snapcompact-shape-preview.ts +192 -0
- package/src/modes/components/tool-execution.ts +18 -10
- package/src/modes/controllers/input-controller.ts +8 -6
- package/src/modes/controllers/selector-controller.ts +4 -2
- package/src/modes/interactive-mode.ts +24 -0
- package/src/modes/setup-wizard/index.ts +1 -0
- package/src/modes/setup-wizard/scenes/glyph.ts +24 -6
- package/src/modes/setup-wizard/scenes/providers.ts +36 -2
- package/src/modes/setup-wizard/scenes/sign-in.ts +10 -1
- package/src/modes/setup-wizard/scenes/theme.ts +28 -1
- package/src/modes/setup-wizard/scenes/types.ts +10 -1
- package/src/modes/setup-wizard/scenes/web-search.ts +22 -6
- package/src/modes/setup-wizard/wizard-overlay.ts +38 -1
- package/src/modes/utils/context-usage.ts +1 -1
- package/src/prompts/bench.md +7 -0
- package/src/sdk.ts +1 -0
- package/src/session/agent-session.ts +5 -0
- package/src/session/snapcompact-inline.ts +11 -19
- package/src/tools/bash.ts +3 -0
- package/src/tools/eval-render.ts +4 -0
- package/src/tools/renderers.ts +13 -0
- package/src/tools/ssh.ts +3 -0
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
import type { ResolvedThinkingLevel } from "@oh-my-pi/pi-agent-core";
|
|
2
|
+
import type {
|
|
3
|
+
Api,
|
|
4
|
+
ApiKeyResolver,
|
|
5
|
+
AssistantMessage,
|
|
6
|
+
AssistantMessageEvent,
|
|
7
|
+
AssistantMessageEventStream,
|
|
8
|
+
Context,
|
|
9
|
+
Effort,
|
|
10
|
+
Model,
|
|
11
|
+
SimpleStreamOptions,
|
|
12
|
+
} from "@oh-my-pi/pi-ai";
|
|
13
|
+
import { streamSimple } from "@oh-my-pi/pi-ai";
|
|
14
|
+
import type { CanonicalModelVariant } from "@oh-my-pi/pi-catalog/identity";
|
|
15
|
+
import { replaceTabs, truncateToWidth } from "@oh-my-pi/pi-tui";
|
|
16
|
+
import { formatDuration, getProjectDir } from "@oh-my-pi/pi-utils";
|
|
17
|
+
import chalk from "chalk";
|
|
18
|
+
import type { ApiKeyResolverModel } from "../config/api-key-resolver";
|
|
19
|
+
import { type CanonicalModelQueryOptions, ModelRegistry } from "../config/model-registry";
|
|
20
|
+
import { formatModelString, getModelMatchPreferences, resolveCliModel } from "../config/model-resolver";
|
|
21
|
+
import { Settings } from "../config/settings";
|
|
22
|
+
import benchPrompt from "../prompts/bench.md" with { type: "text" };
|
|
23
|
+
import { discoverAuthStorage } from "../sdk";
|
|
24
|
+
import { resolveThinkingLevelForModel, shouldDisableReasoning, toReasoningEffort } from "../thinking";
|
|
25
|
+
|
|
26
|
+
/** Requests issued per model when `--runs` is not supplied. */
const DEFAULT_RUNS = 1;
/** Output-token cap per request when `--max-tokens` is not supplied. */
const DEFAULT_MAX_TOKENS = 512;
/** Width that a failed run's error text is truncated to on its result line. */
const ERROR_WIDTH = 110;
/** Bundled benchmark prompt with surrounding whitespace removed. */
const BENCH_PROMPT = benchPrompt.trim();
|
|
30
|
+
|
|
31
|
+
/** Parsed command-line input handed to `runBenchCommand`. */
export interface BenchCommandArgs {
	/** Model selectors exactly as the user typed them. */
	models: string[];
	/** Optional flags; unset values fall back to the command defaults. */
	flags: {
		/** Requests per model; must be a positive integer when set. */
		runs?: number;
		/** Output-token cap per request; must be a positive integer when set. */
		maxTokens?: number;
		/** Custom prompt text; a blank value falls back to the bundled prompt. */
		prompt?: string;
		/** Emit the summary as JSON instead of human-readable lines. */
		json?: boolean;
	};
}
|
|
40
|
+
|
|
41
|
+
/**
 * Registry surface the bench command depends on. The command itself calls
 * `getApiKey` and `resolver`; the whole object is also passed to selector
 * resolution.
 */
export interface BenchModelRegistry {
	/** All models known to the registry. */
	getAll(): Model<Api>[];
	/** Resolves to the credential for `model`, or `undefined` when none is available. */
	getApiKey(model: Model<Api>, sessionId?: string): Promise<string | undefined>;
	/** Builds the key resolver handed to the streaming call. */
	resolver(model: ApiKeyResolverModel, sessionId?: string): ApiKeyResolver;
	// Optional canonical-id helpers; a registry may omit them.
	resolveCanonicalModel?(canonicalId: string, options?: CanonicalModelQueryOptions): Model<Api> | undefined;
	getCanonicalVariants?(canonicalId: string, options?: CanonicalModelQueryOptions): CanonicalModelVariant[];
	getCanonicalId?(model: Model<Api>): string | undefined;
}
|
|
49
|
+
|
|
50
|
+
/** Resources a bench invocation runs against. */
export interface BenchRuntime {
	/** Source of models and credentials. */
	modelRegistry: BenchModelRegistry;
	/** Settings used to derive model-match preferences; optional. */
	settings?: Settings;
	/** Cleanup hook; `runBenchCommand` calls it in a `finally` block when present. */
	close?: () => void;
}
|
|
55
|
+
|
|
56
|
+
/** Measurements from a single request that completed without error. */
export interface BenchRunSuccess {
	ok: true;
	/** Time to first token, in milliseconds. */
	ttftMs: number;
	/** Total request duration, in milliseconds. */
	durationMs: number;
	/** Output tokens reported in the final message's usage. */
	outputTokens: number;
	/** Generation throughput measured over the post-first-token window. */
	tokensPerSecond: number;
}
|
|
64
|
+
|
|
65
|
+
/** Outcome of a single request that did not complete. */
export interface BenchRunFailure {
	ok: false;
	/** Human-readable reason the run failed. */
	error: string;
}
|
|
69
|
+
|
|
70
|
+
/** Result of one bench request; discriminate on `ok`. */
export type BenchRunResult = BenchRunSuccess | BenchRunFailure;
|
|
71
|
+
|
|
72
|
+
/** Arithmetic means of each metric across a model's successful runs. */
export interface BenchAverages {
	/** Mean time to first token, in milliseconds. */
	ttftMs: number;
	/** Mean total duration, in milliseconds. */
	durationMs: number;
	/** Mean output-token count (may be fractional). */
	outputTokens: number;
	/** Mean of the per-run tokens-per-second values. */
	tokensPerSecond: number;
}
|
|
78
|
+
|
|
79
|
+
/** Every run for one benchmarked model, plus its averages. */
export interface BenchModelReport {
	/** Selector as the user typed it (e.g. "opus" or "gemini-3.5:low"). */
	selector: string;
	/** Resolved `provider/id`. */
	model: string;
	/** Explicit thinking level from a `:level` selector suffix; undefined = provider default. */
	thinking?: ResolvedThinkingLevel;
	/** Per-run outcomes in execution order. */
	results: BenchRunResult[];
	/** Averages over successful runs; null when every run failed. */
	average: BenchAverages | null;
}
|
|
90
|
+
|
|
91
|
+
/** Complete result of a bench invocation; this is the object printed by `--json`. */
export interface BenchSummary {
	/** Requests attempted per model (after defaulting). */
	runs: number;
	/** Output-token cap that was requested (after defaulting). */
	maxTokens: number;
	/** One report per resolved selector, in the order the selectors were given. */
	models: BenchModelReport[];
	/** Total number of failed runs across all models. */
	failures: number;
}
|
|
97
|
+
|
|
98
|
+
/** Signature of the streaming function the bench calls; injectable through `BenchDependencies`. */
type BenchStreamSimple = (
	model: Model<Api>,
	context: Context,
	options?: SimpleStreamOptions,
) => AssistantMessageEventStream;
|
|
103
|
+
|
|
104
|
+
/**
 * Injectable collaborators for `runBenchCommand`. Every member is optional;
 * an omitted member falls back to the real implementation.
 */
export interface BenchDependencies {
	/** Builds the runtime (registry, settings, cleanup hook). */
	createRuntime?: () => Promise<BenchRuntime>;
	/** Produces the session id used for one run. */
	randomSessionId?: () => string;
	/** Sink for normal output. */
	writeStdout?: (text: string) => void;
	/** Sink for warnings. */
	writeStderr?: (text: string) => void;
	/** Receives exit code 1 when any run failed. */
	setExitCode?: (code: number) => void;
	/** Streaming implementation to benchmark through. */
	streamSimple?: BenchStreamSimple;
	/** Clock used for timing, in milliseconds. */
	now?: () => number;
	/** Overrides TTY detection, which controls the transient progress line. */
	stdoutIsTTY?: boolean;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Convert an arbitrary thrown value into a printable, informative message.
 *
 * Resolution order:
 * 1. An `Error` with a non-empty message returns that message.
 * 2. Any object carrying a non-empty string `message` property returns it.
 * 3. Objects whose string form is the uninformative "[object Object]" are
 *    JSON-serialized so the failure reason stays visible.
 * 4. Everything else uses `String(error)`.
 *
 * @param error - The caught value; may be anything, since JavaScript allows throwing non-Errors.
 * @returns A string describing the failure. Never throws.
 */
function getErrorMessage(error: unknown): string {
	if (error instanceof Error && error.message) return error.message;
	if (typeof error !== "object" || error === null) return String(error);

	const message = (error as { message?: unknown }).message;
	if (typeof message === "string" && message.length > 0) return message;

	let text: string;
	try {
		text = String(error);
	} catch {
		// Objects without a usable toString (e.g. null-prototype) cannot be coerced.
		text = Object.prototype.toString.call(error);
	}
	if (text !== "[object Object]") return text;

	try {
		// JSON.stringify yields undefined when toJSON returns undefined.
		return JSON.stringify(error) ?? text;
	} catch {
		// Circular structures cannot be serialized; keep the generic form.
		return text;
	}
}
|
|
119
|
+
|
|
120
|
+
/**
 * Validate an optional numeric flag.
 *
 * @param name - Flag name without the leading dashes, used in the error text.
 * @param value - The parsed flag value, or `undefined` when the flag was omitted.
 * @param fallback - Returned as-is when `value` is `undefined`.
 * @returns `value` when it is an integer greater than zero, otherwise `fallback` for an omitted flag.
 * @throws Error when `value` is set but is not a positive integer.
 */
function normalizePositiveInteger(name: string, value: number | undefined, fallback: number): number {
	if (value === undefined) {
		return fallback;
	}
	const isPositiveInteger = Number.isInteger(value) && value > 0;
	if (isPositiveInteger) {
		return value;
	}
	throw new Error(`Expected --${name} to be a positive integer, got ${value}`);
}
|
|
127
|
+
|
|
128
|
+
/**
 * Report whether a stream event carries actual generated content, which is
 * what marks the first token for TTFT purposes.
 *
 * Delta events count when their `delta` is non-empty; `text_end` and
 * `thinking_end` count when their `content` is non-empty. Every other event
 * type returns false.
 */
function isFirstTokenEvent(event: AssistantMessageEvent): boolean {
	if (event.type === "text_delta" || event.type === "thinking_delta" || event.type === "toolcall_delta") {
		return event.delta.length > 0;
	}
	if (event.type === "text_end" || event.type === "thinking_end") {
		return event.content.length > 0;
	}
	return false;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Throughput in tokens per second, measured over the generation window
 * (total duration minus time-to-first-token) so that queueing and prefill
 * latency do not dilute the figure.
 *
 * When that window is not positive (the response arrived as one chunk, so
 * TTFT is roughly the whole duration) the total duration is used instead.
 * Returns 0 when no positive window exists at all.
 */
function computeTokensPerSecond(outputTokens: number, durationMs: number, ttftMs: number): number {
	const generationMs = durationMs - ttftMs;
	let windowMs = durationMs;
	if (generationMs > 0) {
		windowMs = generationMs;
	}
	// Written as a positive test so a NaN window also yields 0.
	if (windowMs > 0) {
		return (outputTokens * 1000) / windowMs;
	}
	return 0;
}
|
|
152
|
+
|
|
153
|
+
/** Per-request inputs for `runBenchRequest`. */
interface BenchRequestOptions {
	/** Key resolver forwarded to the streaming call. */
	apiKey: ApiKeyResolver;
	/** Session id forwarded to the streaming call. */
	sessionId: string;
	/** User-message text sent to the model. */
	prompt: string;
	/** Requested output-token cap; lowered to the model's own limit when that is smaller. */
	maxTokens: number;
	/** Explicit effort from a `:level` selector suffix; absent = provider default. */
	reasoning?: Effort;
	/** Only set for an explicit `:off` suffix — some endpoints reject disablement. */
	disableReasoning?: boolean;
}
|
|
163
|
+
|
|
164
|
+
/**
 * Send one streaming request to `model` and measure it.
 *
 * Never rejects: stream `error` events, a final message flagged as an error,
 * and thrown exceptions are all converted into a `BenchRunFailure`.
 *
 * Timing prefers the values reported on the final message (`duration`,
 * `ttft`) and falls back to wall-clock measurements taken with `now`.
 *
 * @param model - Resolved model to query.
 * @param options - Credentials, prompt and limits for the request.
 * @param streamFn - Streaming implementation to call.
 * @param now - Millisecond clock used for the fallback measurements.
 */
async function runBenchRequest(
	model: Model<Api>,
	options: BenchRequestOptions,
	streamFn: BenchStreamSimple,
	now: () => number,
): Promise<BenchRunResult> {
	const fail = (error: string): BenchRunResult => ({ ok: false, error });
	// Non-finite and non-positive measurements are reported as 0.
	const clampToPositive = (value: number): number => (Number.isFinite(value) && value > 0 ? value : 0);

	const startedAt = now();
	let firstTokenAt: number | undefined;
	try {
		// Never ask for more output than the model itself advertises.
		const modelLimitUsable = Number.isFinite(model.maxTokens) && model.maxTokens > 0;
		const requestMaxTokens = modelLimitUsable ? Math.min(options.maxTokens, model.maxTokens) : options.maxTokens;
		// pi-ai opts every OpenRouter request into response caching (1h TTL).
		// Bench repeats a byte-identical request, so inside the TTL OpenRouter
		// would replay the cached generation with zeroed usage and the run
		// would read "tokens 0, TPS 0.0". Opt out so each run is a fresh
		// generation.
		const headers = model.provider === "openrouter" ? { "X-OpenRouter-Cache": "false" } : undefined;
		const context: Context = {
			// Codex's Responses endpoint 400s with "Instructions are required" when no
			// system prompt is present — same guard as eval's completion bridge.
			systemPrompt: ["You are a helpful assistant."],
			messages: [{ role: "user", content: options.prompt, timestamp: Date.now(), attribution: "user" }],
		};
		const stream = streamFn(model, context, {
			apiKey: options.apiKey,
			sessionId: options.sessionId,
			maxTokens: requestMaxTokens,
			reasoning: options.reasoning,
			disableReasoning: options.disableReasoning,
			headers,
		});

		let finalMessage: AssistantMessage | undefined;
		for await (const event of stream) {
			if (firstTokenAt === undefined && isFirstTokenEvent(event)) {
				firstTokenAt = now();
			}
			switch (event.type) {
				case "error":
					return fail(event.error.errorMessage ?? "request failed");
				case "done":
					finalMessage = event.message;
					break;
				default:
					break;
			}
		}
		// The stream may finish without a "done" event; ask it for the result.
		finalMessage ??= await stream.result();
		if (finalMessage.stopReason === "error" || finalMessage.errorMessage) {
			return fail(finalMessage.errorMessage ?? "request failed");
		}

		const durationMs = clampToPositive(finalMessage.duration ?? now() - startedAt);
		// With no content event observed, treat the whole duration as TTFT.
		const measuredTtft = firstTokenAt === undefined ? durationMs : firstTokenAt - startedAt;
		const ttftMs = clampToPositive(finalMessage.ttft ?? measuredTtft);
		const outputTokens = clampToPositive(finalMessage.usage.output);
		return {
			ok: true,
			ttftMs,
			durationMs,
			outputTokens,
			tokensPerSecond: computeTokensPerSecond(outputTokens, durationMs, ttftMs),
		};
	} catch (error) {
		return fail(getErrorMessage(error));
	}
}
|
|
227
|
+
|
|
228
|
+
/**
 * Assemble the report for one model: its runs plus the mean of each metric
 * over the successful ones.
 *
 * @param selector - Selector as typed by the user.
 * @param model - Resolved model; rendered with `formatModelString`.
 * @param thinking - Thinking level for the target, if any.
 * @param results - All run outcomes for this model.
 * @returns The report; `average` is null when no run succeeded.
 */
function buildModelReport(
	selector: string,
	model: Model<Api>,
	thinking: ResolvedThinkingLevel | undefined,
	results: BenchRunResult[],
): BenchModelReport {
	const totals = { ttftMs: 0, durationMs: 0, outputTokens: 0, tokensPerSecond: 0 };
	let successCount = 0;
	for (const run of results) {
		if (!run.ok) continue;
		successCount += 1;
		totals.ttftMs += run.ttftMs;
		totals.durationMs += run.durationMs;
		totals.outputTokens += run.outputTokens;
		totals.tokensPerSecond += run.tokensPerSecond;
	}

	let average: BenchAverages | null = null;
	if (successCount > 0) {
		average = {
			ttftMs: totals.ttftMs / successCount,
			durationMs: totals.durationMs / successCount,
			outputTokens: totals.outputTokens / successCount,
			tokensPerSecond: totals.tokensPerSecond / successCount,
		};
	}
	return { selector, model: formatModelString(model), thinking, results, average };
}
|
|
246
|
+
|
|
247
|
+
/** Render a millisecond value via `formatDuration`, after rounding to a whole number and flooring negatives at 0. */
function formatMs(ms: number): string {
	const wholeMs = Math.round(ms);
	const nonNegativeMs = Math.max(0, wholeMs);
	return formatDuration(nonNegativeMs);
}
|
|
250
|
+
|
|
251
|
+
/**
 * Build the one-line summary for a single run.
 *
 * @param result - Outcome of the run.
 * @param index - Zero-based run index; displayed one-based.
 * @param total - Total runs planned for the model.
 * @returns A coloured line without a trailing newline. Failures show the
 *   error flattened to one line and truncated to `ERROR_WIDTH`.
 */
function formatRunLine(result: BenchRunResult, index: number, total: number): string {
	const prefix = chalk.dim(`run ${index + 1}/${total}`);
	if (!result.ok) {
		// Tabs and line breaks would break the single-line layout.
		const singleLine = replaceTabs(result.error).replace(/\r?\n/g, " ");
		const clipped = truncateToWidth(singleLine, ERROR_WIDTH);
		return ` ${chalk.red("✗")} ${prefix} ${chalk.red(clipped)}`;
	}
	const ttft = formatMs(result.ttftMs);
	const tps = result.tokensPerSecond.toFixed(1);
	const total_ = formatMs(result.durationMs);
	return ` ${chalk.green("✓")} ${prefix} ${chalk.dim("TTFT")} ${ttft} ${chalk.dim("TPS")} ${tps}/s ${chalk.dim("tokens")} ${result.outputTokens} ${chalk.dim("total")} ${total_}`;
}
|
|
258
|
+
|
|
259
|
+
/**
 * Render the comparison table printed after all runs finish.
 *
 * Models are ordered by average tokens/s, fastest first; models with no
 * successful run go last and show "-" in every metric column. Columns are
 * padded to their widest cell, the header row is dimmed, and rows with
 * failed runs get a red "(N failed)" suffix.
 *
 * @param summary - Completed bench summary.
 * @returns The table text, terminated by a newline.
 */
export function formatBenchTable(summary: BenchSummary): string {
	const columns = ["model", "ttft", "tps", "tokens", "total"] as const;
	type Column = (typeof columns)[number];
	const headers = { model: "model", ttft: "TTFT", tps: "TPS", tokens: "tokens", total: "total" } as const;

	const ranked = [...summary.models].sort((a, b) => {
		if (a.average === null && b.average === null) return 0;
		if (a.average === null) return 1;
		if (b.average === null) return -1;
		return b.average.tokensPerSecond - a.average.tokensPerSecond;
	});

	const rows = ranked.map(report => {
		const { average } = report;
		const failed = report.results.filter(result => !result.ok).length;
		if (!average) {
			return { model: report.model, ttft: "-", tps: "-", tokens: "-", total: "-", failed };
		}
		return {
			model: report.model,
			ttft: formatMs(average.ttftMs),
			tps: `${average.tokensPerSecond.toFixed(1)}/s`,
			tokens: String(Math.round(average.outputTokens)),
			total: formatMs(average.durationMs),
			failed,
		};
	});

	// Each column is as wide as its longest cell, header included.
	const widthOf = (key: Column): number => Math.max(headers[key].length, ...rows.map(row => row[key].length));
	const widths: Record<Column, number> = {
		model: widthOf("model"),
		ttft: widthOf("ttft"),
		tps: widthOf("tps"),
		tokens: widthOf("tokens"),
		total: widthOf("total"),
	};
	const renderCells = (cells: Record<Column, string>): string =>
		columns
			.map(key => cells[key].padEnd(widths[key]))
			.join(" ")
			.trimEnd();

	const lines = [chalk.dim(renderCells(headers))];
	for (const row of rows) {
		const failedSuffix = row.failed > 0 ? ` ${chalk.red(`(${row.failed} failed)`)}` : "";
		lines.push(renderCells(row) + failedSuffix);
	}
	return `${lines.join("\n")}\n`;
}
|
|
304
|
+
|
|
305
|
+
/**
 * Build the production runtime: discover auth storage, load settings for the
 * project directory, and construct a model registry on top of the storage.
 *
 * The returned `close` closes the auth storage. If setup fails after the
 * storage was opened, the storage is closed before the error is rethrown.
 */
async function createDefaultRuntime(): Promise<BenchRuntime> {
	const authStorage = await discoverAuthStorage();
	const assemble = async (): Promise<BenchRuntime> => {
		const settings = await Settings.init({ cwd: getProjectDir() });
		const modelRegistry = new ModelRegistry(authStorage);
		return {
			modelRegistry,
			settings,
			close: () => authStorage.close(),
		};
	};
	try {
		return await assemble();
	} catch (error) {
		// Do not leak the storage handle when initialisation fails.
		authStorage.close();
		throw error;
	}
}
|
|
320
|
+
|
|
321
|
+
/** A selector paired with the model and thinking level it resolved to. */
interface BenchTarget {
	/** Selector as typed by the user. */
	selector: string;
	/** Model the selector resolved to. */
	model: Model<Api>;
	/** Thinking level produced by `resolveThinkingLevelForModel`; may be undefined. */
	thinking: ResolvedThinkingLevel | undefined;
}
|
|
326
|
+
|
|
327
|
+
/**
 * Resolve every selector to a concrete model before any request is sent.
 *
 * All selectors are attempted so that one error can list every selector that
 * failed. Resolution warnings are written to `writeStderr` in yellow.
 *
 * @param selectors - Model selectors as typed by the user.
 * @param modelRegistry - Registry handed to `resolveCliModel`.
 * @param settings - Optional settings used to derive match preferences.
 * @param writeStderr - Sink for warnings.
 * @returns One target per selector, in input order.
 * @throws Error listing each unresolved selector when any fails.
 */
function resolveBenchModels(
	selectors: string[],
	modelRegistry: BenchModelRegistry,
	settings: Settings | undefined,
	writeStderr: (text: string) => void,
): BenchTarget[] {
	const preferences = getModelMatchPreferences(settings);
	const targets: BenchTarget[] = [];
	const problems: string[] = [];

	for (const selector of selectors) {
		const outcome = resolveCliModel({ cliModel: selector, modelRegistry, preferences });
		if (outcome.error) {
			problems.push(`${selector}: ${outcome.error}`);
		} else if (!outcome.model) {
			problems.push(`${selector}: model not found`);
		} else {
			if (outcome.warning) {
				writeStderr(`${chalk.yellow(`Warning: ${outcome.warning}`)}\n`);
			}
			const thinking = resolveThinkingLevelForModel(outcome.model, outcome.thinkingLevel);
			targets.push({ selector, model: outcome.model, thinking });
		}
	}

	if (problems.length === 0) {
		return targets;
	}
	throw new Error(`Could not resolve ${problems.length === 1 ? "model" : "models"}:\n${problems.join("\n")}`);
}
|
|
358
|
+
|
|
359
|
+
/**
 * Entry point for `omp bench`: benchmark each selected model with the same
 * prompt and report time-to-first-token and throughput.
 *
 * Flow: validate flags → create the runtime → resolve every selector → for
 * each model issue `runs` sequential requests → print per-run lines (or one
 * JSON document with `--json`) → print a comparison table when more than
 * one model or run is involved → set exit code 1 if any run failed. The
 * runtime is closed in all cases.
 *
 * A model without credentials records one failure and skips its remaining
 * runs, since they would fail the same way.
 *
 * @param command - Parsed selectors and flags.
 * @param deps - Optional overrides for I/O, clock, streaming and runtime creation.
 * @returns The summary that was printed.
 * @throws Error for invalid `--runs` / `--max-tokens`, an empty selector list,
 *   or selectors that cannot be resolved.
 */
export async function runBenchCommand(command: BenchCommandArgs, deps: BenchDependencies = {}): Promise<BenchSummary> {
	const { flags } = command;
	const runs = normalizePositiveInteger("runs", flags.runs, DEFAULT_RUNS);
	const maxTokens = normalizePositiveInteger("max-tokens", flags.maxTokens, DEFAULT_MAX_TOKENS);
	// `||` is intentional: a blank prompt falls back to the bundled one.
	const prompt = flags.prompt?.trim() || BENCH_PROMPT;
	const json = flags.json === true;

	const randomSessionId = deps.randomSessionId ?? (() => Bun.randomUUIDv7());
	const writeStdout = deps.writeStdout ?? ((text: string) => process.stdout.write(text));
	const writeStderr = deps.writeStderr ?? ((text: string) => process.stderr.write(text));
	const setExitCode =
		deps.setExitCode ??
		((code: number) => {
			process.exitCode = code;
		});
	const streamFn = deps.streamSimple ?? streamSimple;
	const now = deps.now ?? (() => performance.now());
	const interactive = deps.stdoutIsTTY ?? (process.stdout.isTTY === true);

	if (command.models.length === 0) {
		throw new Error("Pass at least one model selector, e.g. `omp bench opus gpt-5.2`");
	}

	const printRun = (result: BenchRunResult, index: number): void => {
		writeStdout(`${formatRunLine(result, index, runs)}\n`);
	};

	const createRuntime = deps.createRuntime ?? createDefaultRuntime;
	const runtime = await createRuntime();
	try {
		const targets = resolveBenchModels(command.models, runtime.modelRegistry, runtime.settings, writeStderr);
		const reports: BenchModelReport[] = [];

		for (const { selector, model, thinking } of targets) {
			if (!json) {
				// Show the selector only when it differs from the resolved name.
				const label = formatModelString(model);
				const resolvedNote = selector === label ? "" : chalk.dim(` (${selector})`);
				writeStdout(`${chalk.bold(label)}${resolvedNote}\n`);
			}

			const results: BenchRunResult[] = [];
			for (let attempt = 0; attempt < runs; attempt++) {
				const sessionId = randomSessionId();
				const initialKey = await runtime.modelRegistry.getApiKey(model, sessionId);
				if (!initialKey) {
					const failure: BenchRunFailure = {
						ok: false,
						error: `No credentials for provider "${model.provider}". Run \`omp\` and use /login, or set the provider API key.`,
					};
					results.push(failure);
					if (!json) printRun(failure, attempt);
					break; // remaining runs would fail identically
				}

				const showProgress = !json && interactive;
				if (showProgress) {
					// Transient status; erased once the run completes.
					writeStdout(chalk.dim(` … run ${attempt + 1}/${runs} streaming`));
				}
				const requestOptions: BenchRequestOptions = {
					apiKey: runtime.modelRegistry.resolver(model, sessionId),
					sessionId,
					prompt,
					maxTokens,
					reasoning: toReasoningEffort(thinking),
					disableReasoning: shouldDisableReasoning(thinking) ? true : undefined,
				};
				const result = await runBenchRequest(model, requestOptions, streamFn, now);
				results.push(result);
				if (!json) {
					// Return to column 0 and clear the progress line.
					if (interactive) writeStdout("\r\x1b[2K");
					printRun(result, attempt);
				}
			}
			reports.push(buildModelReport(selector, model, thinking, results));
		}

		let failures = 0;
		for (const report of reports) {
			for (const result of report.results) {
				if (!result.ok) failures += 1;
			}
		}
		const summary: BenchSummary = { runs, maxTokens, models: reports, failures };

		if (json) {
			writeStdout(`${JSON.stringify(summary, null, 2)}\n`);
		} else if (reports.length > 1 || runs > 1) {
			// A lone run of a lone model has nothing to compare against.
			writeStdout(`\n${formatBenchTable(summary)}`);
		}
		if (failures > 0) setExitCode(1);
		return summary;
	} finally {
		runtime.close?.();
	}
}
|
package/src/cli-commands.ts
CHANGED
|
@@ -16,6 +16,7 @@ export const commands: CommandEntry[] = [
|
|
|
16
16
|
{ name: "auth-broker", load: () => import("./commands/auth-broker").then(m => m.default) },
|
|
17
17
|
{ name: "auth-gateway", load: () => import("./commands/auth-gateway").then(m => m.default) },
|
|
18
18
|
{ name: "agents", load: () => import("./commands/agents").then(m => m.default) },
|
|
19
|
+
{ name: "bench", load: () => import("./commands/bench").then(m => m.default) },
|
|
19
20
|
{ name: "commit", load: () => import("./commands/commit").then(m => m.default) },
|
|
20
21
|
{ name: "completions", load: () => import("./commands/completions").then(m => m.default) },
|
|
21
22
|
{ name: "__complete", load: () => import("./commands/complete").then(m => m.default) },
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { Args, Command, Flags } from "@oh-my-pi/pi-utils/cli";
|
|
2
|
+
import { runBenchCommand } from "../cli/bench-cli";
|
|
3
|
+
|
|
4
|
+
/**
 * `bench` CLI command: declares the arguments and flags, then forwards the
 * parsed values to `runBenchCommand`.
 */
export default class Bench extends Command {
	static description =
		"Benchmark models with the same prompt: time-to-first-token and generation throughput (tokens/s)";

	/** One or more model selectors; at least one is required. */
	static args = {
		models: Args.string({
			description: "Model selectors (provider/model or fuzzy id, e.g. opus)",
			required: true,
			multiple: true,
		}),
	};

	static flags = {
		runs: Flags.integer({ description: "Requests per model (results are averaged)", default: 1 }),
		"max-tokens": Flags.integer({ description: "Max output tokens per request", default: 512 }),
		prompt: Flags.string({ description: "Custom prompt text (default: bundled bench prompt)" }),
		json: Flags.boolean({ description: "Output JSON" }),
	};

	static examples = [
		"# Compare two models\n omp bench anthropic/claude-opus-4-5 openai/gpt-5.2",
		"# Fuzzy selectors work\n omp bench opus sonnet",
		"# Average over 3 runs each\n omp bench opus gpt-5.2 --runs 3",
		"# Machine-readable output\n omp bench opus --json",
	];

	/** Parse the invocation and run the benchmark. */
	async run(): Promise<void> {
		const parsed = await this.parse(Bench);
		const { runs, prompt, json } = parsed.flags;
		const maxTokens = parsed.flags["max-tokens"];
		await runBenchCommand({
			models: parsed.args.models ?? [],
			flags: { runs, maxTokens, prompt, json },
		});
	}
}
|
|
@@ -20,6 +20,11 @@ import {
|
|
|
20
20
|
UNK_CONTEXT_WINDOW,
|
|
21
21
|
UNK_MAX_TOKENS,
|
|
22
22
|
} from "@oh-my-pi/pi-catalog/provider-models";
|
|
23
|
+
import {
|
|
24
|
+
collapseBuiltModelVariants,
|
|
25
|
+
getVariantAliasSources,
|
|
26
|
+
resolveVariantAlias,
|
|
27
|
+
} from "@oh-my-pi/pi-catalog/variant-collapse";
|
|
23
28
|
|
|
24
29
|
// Sentinel for local-only OAuth token (LM Studio, vLLM) — declared inline to avoid loading
|
|
25
30
|
// any provider module at startup. Must match `DEFAULT_LOCAL_TOKEN` in oauth/lm-studio.ts.
|
|
@@ -542,7 +547,37 @@ function normalizeSuppressedSelector(selector: string): string {
|
|
|
542
547
|
if (!trimmed) return trimmed;
|
|
543
548
|
const parsed = parseModelString(trimmed);
|
|
544
549
|
if (!parsed) return trimmed;
|
|
545
|
-
|
|
550
|
+
// Retired effort-tier variant ids normalize to their collapsed logical id
|
|
551
|
+
// so persisted suppressions keyed by raw member ids still bind.
|
|
552
|
+
const aliasId = resolveVariantAlias(parsed.provider, parsed.id);
|
|
553
|
+
return `${parsed.provider}/${aliasId ?? parsed.id}`;
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
/**
 * Find the override that applies to `model`.
 *
 * An entry keyed by the model's own id wins. Otherwise entries keyed by
 * retired effort-tier variant ids (models.yml written before the variants
 * were collapsed) are tried in the order `getVariantAliasSources` yields
 * them. A retired key is skipped while a live model still holds that id, so
 * it only re-binds once the id is free.
 *
 * @param overrides - Overrides for the model's provider, keyed by model id.
 * @param model - Model to look up.
 * @param hasLiveModel - Reports whether `provider`/`id` names a model that currently exists.
 * @returns The matching override, or `undefined` when none applies.
 */
function resolveModelOverrideWithAliases(
	overrides: Map<string, ModelOverride>,
	model: Model<Api>,
	hasLiveModel: (provider: string, id: string) => boolean,
): ModelOverride | undefined {
	const exact = overrides.get(model.id);
	if (exact) return exact;

	const { provider, id } = model;
	for (const retiredId of getVariantAliasSources(provider, id)) {
		if (hasLiveModel(provider, retiredId)) continue;
		const inherited = overrides.get(retiredId);
		if (!inherited) continue;
		logger.debug("model override re-keyed through variant alias", {
			provider,
			from: retiredId,
			to: id,
		});
		return inherited;
	}
	return undefined;
}
|
|
547
582
|
|
|
548
583
|
function getDisabledProviderIdsFromSettings(): Set<string> {
|
|
@@ -799,7 +834,9 @@ export class ModelRegistry {
|
|
|
799
834
|
const withConfigModels = this.#mergeCustomModels(resolvedDefaults, this.#customModelOverlays);
|
|
800
835
|
// Merge runtime extension models so they survive refresh() cycles
|
|
801
836
|
const combined = this.#mergeCustomModels(withConfigModels, this.#runtimeModelOverlays);
|
|
802
|
-
|
|
837
|
+
// Custom/config providers bypass the model-manager merge point —
|
|
838
|
+
// collapse effort-tier variants here so X/X-thinking twins fold.
|
|
839
|
+
const withModelOverrides = this.#applyModelOverrides(collapseBuiltModelVariants(combined), this.#modelOverrides);
|
|
803
840
|
this.#models = this.#applyRuntimeProviderOverrides(withModelOverrides);
|
|
804
841
|
this.#rebuildCanonicalIndex();
|
|
805
842
|
this.#lastStaticLoadMtime = this.#modelsConfigFile.getMtimeMs();
|
|
@@ -1152,7 +1189,7 @@ export class ModelRegistry {
|
|
|
1152
1189
|
const withConfigModels = this.#mergeCustomModels(resolved, this.#customModelOverlays);
|
|
1153
1190
|
// Merge runtime extension models so they survive online discovery completion
|
|
1154
1191
|
const combined = this.#mergeCustomModels(withConfigModels, this.#runtimeModelOverlays);
|
|
1155
|
-
const withModelOverrides = this.#applyModelOverrides(combined, this.#modelOverrides);
|
|
1192
|
+
const withModelOverrides = this.#applyModelOverrides(collapseBuiltModelVariants(combined), this.#modelOverrides);
|
|
1156
1193
|
this.#models = this.#applyRuntimeProviderOverrides(withModelOverrides);
|
|
1157
1194
|
this.#rebuildCanonicalIndex();
|
|
1158
1195
|
}
|
|
@@ -1398,8 +1435,13 @@ export class ModelRegistry {
|
|
|
1398
1435
|
#applyProviderModelOverrides(provider: string, models: Model<Api>[]): Model<Api>[] {
|
|
1399
1436
|
const overrides = this.#modelOverrides.get(provider);
|
|
1400
1437
|
if (!overrides || overrides.size === 0) return models;
|
|
1438
|
+
let liveIds: Set<string> | null = null;
|
|
1439
|
+
const hasLiveModel = (_provider: string, id: string) => {
|
|
1440
|
+
liveIds ??= new Set(models.map(m => m.id));
|
|
1441
|
+
return liveIds.has(id);
|
|
1442
|
+
};
|
|
1401
1443
|
return models.map(model => {
|
|
1402
|
-
const override = overrides
|
|
1444
|
+
const override = resolveModelOverrideWithAliases(overrides, model, hasLiveModel);
|
|
1403
1445
|
if (!override) return model;
|
|
1404
1446
|
return applyModelOverride(model, override);
|
|
1405
1447
|
});
|
|
@@ -1443,10 +1485,15 @@ export class ModelRegistry {
|
|
|
1443
1485
|
}
|
|
1444
1486
|
#applyModelOverrides(models: Model<Api>[], overrides: Map<string, Map<string, ModelOverride>>): Model<Api>[] {
|
|
1445
1487
|
if (overrides.size === 0) return models;
|
|
1488
|
+
let liveKeys: Set<string> | null = null;
|
|
1489
|
+
const hasLiveModel = (provider: string, id: string) => {
|
|
1490
|
+
liveKeys ??= new Set(models.map(m => `${m.provider}\u0000${m.id}`));
|
|
1491
|
+
return liveKeys.has(`${provider}\u0000${id}`);
|
|
1492
|
+
};
|
|
1446
1493
|
return models.map(model => {
|
|
1447
1494
|
const providerOverrides = overrides.get(model.provider);
|
|
1448
1495
|
if (!providerOverrides) return model;
|
|
1449
|
-
const override = providerOverrides
|
|
1496
|
+
const override = resolveModelOverrideWithAliases(providerOverrides, model, hasLiveModel);
|
|
1450
1497
|
if (!override) return model;
|
|
1451
1498
|
return applyModelOverride(model, override);
|
|
1452
1499
|
});
|
|
@@ -3,8 +3,9 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Layering:
|
|
5
5
|
* - `matchModel` is the single matching engine. Order: exact `provider/id`
|
|
6
|
-
* reference (with OpenRouter routed/date fallbacks) →
|
|
7
|
-
* exact
|
|
6
|
+
* reference (with variant-alias and OpenRouter routed/date fallbacks) →
|
|
7
|
+
* exact canonical id → exact bare id → retired variant alias →
|
|
8
|
+
* provider-scoped fuzzy → substring with alias-vs-dated pick.
|
|
8
9
|
* - `parseModelPatternWithContext`/`parseModelPattern` layer the selector
|
|
9
10
|
* grammar on top: trailing `:level` thinking suffixes (`splitThinkingSuffix`)
|
|
10
11
|
* and `@upstream` provider routing (`splitUpstreamRouting`).
|
|
@@ -19,9 +20,11 @@ import type { Api, Effort, KnownProvider, Model, ModelSpec } from "@oh-my-pi/pi-
|
|
|
19
20
|
import { buildModel } from "@oh-my-pi/pi-catalog/build";
|
|
20
21
|
import { modelMatchesHost } from "@oh-my-pi/pi-catalog/hosts";
|
|
21
22
|
import { buildModelProviderPriorityRank } from "@oh-my-pi/pi-catalog/identity";
|
|
23
|
+
import { stripThinkingVariantToken } from "@oh-my-pi/pi-catalog/identity/family";
|
|
22
24
|
import { clampThinkingLevelForModel } from "@oh-my-pi/pi-catalog/model-thinking";
|
|
23
25
|
import { modelsAreEqual } from "@oh-my-pi/pi-catalog/models";
|
|
24
26
|
import { DEFAULT_MODEL_PER_PROVIDER } from "@oh-my-pi/pi-catalog/provider-models";
|
|
27
|
+
import { resolveBareVariantAlias, resolveVariantAlias } from "@oh-my-pi/pi-catalog/variant-collapse";
|
|
25
28
|
import { fuzzyMatch } from "@oh-my-pi/pi-tui";
|
|
26
29
|
import { logger } from "@oh-my-pi/pi-utils";
|
|
27
30
|
import chalk from "chalk";
|
|
@@ -228,6 +231,18 @@ export function resolveProviderModelReference(
|
|
|
228
231
|
return exact;
|
|
229
232
|
}
|
|
230
233
|
|
|
234
|
+
// Retired effort-tier variant ids resolve to their collapsed logical
|
|
235
|
+
// model: hand-table aliases first, then the `X-thinking` → `X` grammar
|
|
236
|
+
// for auto-derived pairs. Exact lookup above always wins while raw is live.
|
|
237
|
+
const variantAliasId =
|
|
238
|
+
resolveVariantAlias(normalizedProvider, normalizedModelId) ?? stripThinkingVariantToken(normalizedModelId);
|
|
239
|
+
if (variantAliasId) {
|
|
240
|
+
const aliased = index.get(`${normalizedProvider}\u0000${variantAliasId.toLowerCase()}`);
|
|
241
|
+
if (aliased) {
|
|
242
|
+
return aliased;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
|
|
231
246
|
if (normalizedProvider !== "openrouter") {
|
|
232
247
|
return undefined;
|
|
233
248
|
}
|
|
@@ -407,11 +422,13 @@ function findExactCanonicalModelMatch(
|
|
|
407
422
|
|
|
408
423
|
/**
|
|
409
424
|
* The single model-matching engine. Tries, in order:
|
|
410
|
-
* 1. exact `provider/id` reference (OpenRouter routed/date
|
|
425
|
+
* 1. exact `provider/id` reference (variant-alias and OpenRouter routed/date
|
|
426
|
+
* fallbacks included),
|
|
411
427
|
* 2. exact canonical id (coalesces provider variants),
|
|
412
428
|
* 3. exact bare id (preference-ranked),
|
|
413
|
-
* 4.
|
|
414
|
-
* 5.
|
|
429
|
+
* 4. retired effort-tier variant alias (collapsed catalog entries),
|
|
430
|
+
* 5. provider-scoped fuzzy match,
|
|
431
|
+
* 6. substring match with the alias-vs-dated pick.
|
|
415
432
|
* Returns the matched model or undefined if no match found.
|
|
416
433
|
*/
|
|
417
434
|
function matchModel(
|
|
@@ -440,6 +457,20 @@ function matchModel(
|
|
|
440
457
|
if (exactMatches.length > 0) {
|
|
441
458
|
return pickPreferredModel(exactMatches, context);
|
|
442
459
|
}
|
|
460
|
+
|
|
461
|
+
// Retired effort-tier variant ids (bare, no provider prefix) resolve to
|
|
462
|
+
// their collapsed logical model; models from the providers whose table
|
|
463
|
+
// declared the alias win ties. Auto-derived `X-thinking` pairs resolve
|
|
464
|
+
// through the grammar fallback.
|
|
465
|
+
const bareAlias = resolveBareVariantAlias(modelPattern);
|
|
466
|
+
const bareAliasTargetId = bareAlias?.id ?? stripThinkingVariantToken(modelPattern);
|
|
467
|
+
if (bareAliasTargetId) {
|
|
468
|
+
const aliasMatches = availableModels.filter(m => m.id.toLowerCase() === bareAliasTargetId.toLowerCase());
|
|
469
|
+
if (aliasMatches.length > 0) {
|
|
470
|
+
const preferred = bareAlias ? aliasMatches.filter(m => bareAlias.providers.includes(m.provider)) : [];
|
|
471
|
+
return pickPreferredModel(preferred.length > 0 ? preferred : aliasMatches, context);
|
|
472
|
+
}
|
|
473
|
+
}
|
|
443
474
|
// Check for provider/modelId format — fuzzy match within provider only.
|
|
444
475
|
const slashIndex = modelPattern.indexOf("/");
|
|
445
476
|
if (slashIndex !== -1) {
|