@oh-my-pi/pi-coding-agent 15.11.6 → 15.11.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/CHANGELOG.md +29 -1
  2. package/dist/cli.js +114 -71
  3. package/dist/types/cli/bench-cli.d.ts +78 -0
  4. package/dist/types/commands/bench.d.ts +29 -0
  5. package/dist/types/config/model-resolver.d.ts +3 -2
  6. package/dist/types/config/settings-schema.d.ts +72 -0
  7. package/dist/types/edit/renderer.d.ts +1 -0
  8. package/dist/types/modes/components/oauth-selector.d.ts +10 -1
  9. package/dist/types/modes/components/settings-selector.d.ts +8 -1
  10. package/dist/types/modes/components/snapcompact-shape-preview.d.ts +31 -0
  11. package/dist/types/modes/components/tool-execution.d.ts +13 -9
  12. package/dist/types/modes/setup-wizard/scenes/sign-in.d.ts +3 -0
  13. package/dist/types/modes/setup-wizard/scenes/types.d.ts +10 -1
  14. package/dist/types/modes/setup-wizard/scenes/web-search.d.ts +3 -0
  15. package/dist/types/session/snapcompact-inline.d.ts +2 -0
  16. package/dist/types/tools/bash.d.ts +2 -0
  17. package/dist/types/tools/eval-render.d.ts +1 -0
  18. package/dist/types/tools/renderers.d.ts +13 -0
  19. package/dist/types/tools/ssh.d.ts +1 -0
  20. package/package.json +11 -11
  21. package/src/cli/bench-cli.ts +437 -0
  22. package/src/cli-commands.ts +1 -0
  23. package/src/commands/bench.ts +42 -0
  24. package/src/config/model-registry.ts +52 -5
  25. package/src/config/model-resolver.ts +36 -5
  26. package/src/config/settings-schema.ts +92 -0
  27. package/src/edit/renderer.ts +5 -0
  28. package/src/hindsight/client.ts +26 -1
  29. package/src/hindsight/state.ts +6 -2
  30. package/src/internal-urls/docs-index.generated.ts +1 -1
  31. package/src/mcp/transports/stdio.ts +81 -7
  32. package/src/modes/components/oauth-selector.ts +67 -7
  33. package/src/modes/components/settings-selector.ts +27 -0
  34. package/src/modes/components/snapcompact-shape-preview-doc.md +11 -0
  35. package/src/modes/components/snapcompact-shape-preview.ts +192 -0
  36. package/src/modes/components/tool-execution.ts +18 -10
  37. package/src/modes/controllers/input-controller.ts +8 -6
  38. package/src/modes/controllers/selector-controller.ts +4 -2
  39. package/src/modes/interactive-mode.ts +24 -0
  40. package/src/modes/setup-wizard/index.ts +1 -0
  41. package/src/modes/setup-wizard/scenes/glyph.ts +24 -6
  42. package/src/modes/setup-wizard/scenes/providers.ts +36 -2
  43. package/src/modes/setup-wizard/scenes/sign-in.ts +10 -1
  44. package/src/modes/setup-wizard/scenes/theme.ts +28 -1
  45. package/src/modes/setup-wizard/scenes/types.ts +10 -1
  46. package/src/modes/setup-wizard/scenes/web-search.ts +22 -6
  47. package/src/modes/setup-wizard/wizard-overlay.ts +38 -1
  48. package/src/modes/utils/context-usage.ts +1 -1
  49. package/src/prompts/bench.md +7 -0
  50. package/src/sdk.ts +1 -0
  51. package/src/session/agent-session.ts +5 -0
  52. package/src/session/snapcompact-inline.ts +11 -19
  53. package/src/tools/bash.ts +3 -0
  54. package/src/tools/eval-render.ts +4 -0
  55. package/src/tools/renderers.ts +13 -0
  56. package/src/tools/ssh.ts +3 -0
@@ -0,0 +1,437 @@
1
+ import type { ResolvedThinkingLevel } from "@oh-my-pi/pi-agent-core";
2
+ import type {
3
+ Api,
4
+ ApiKeyResolver,
5
+ AssistantMessage,
6
+ AssistantMessageEvent,
7
+ AssistantMessageEventStream,
8
+ Context,
9
+ Effort,
10
+ Model,
11
+ SimpleStreamOptions,
12
+ } from "@oh-my-pi/pi-ai";
13
+ import { streamSimple } from "@oh-my-pi/pi-ai";
14
+ import type { CanonicalModelVariant } from "@oh-my-pi/pi-catalog/identity";
15
+ import { replaceTabs, truncateToWidth } from "@oh-my-pi/pi-tui";
16
+ import { formatDuration, getProjectDir } from "@oh-my-pi/pi-utils";
17
+ import chalk from "chalk";
18
+ import type { ApiKeyResolverModel } from "../config/api-key-resolver";
19
+ import { type CanonicalModelQueryOptions, ModelRegistry } from "../config/model-registry";
20
+ import { formatModelString, getModelMatchPreferences, resolveCliModel } from "../config/model-resolver";
21
+ import { Settings } from "../config/settings";
22
+ import benchPrompt from "../prompts/bench.md" with { type: "text" };
23
+ import { discoverAuthStorage } from "../sdk";
24
+ import { resolveThinkingLevelForModel, shouldDisableReasoning, toReasoningEffort } from "../thinking";
25
+
26
/** Requests issued per model when the `runs` flag is omitted. */
const DEFAULT_RUNS = 1;
/** Output-token cap used when the `maxTokens` flag is omitted. */
const DEFAULT_MAX_TOKENS = 512;
/** Display width that failure messages are truncated to in per-run output. */
const ERROR_WIDTH = 110;
/** Bundled benchmark prompt with leading and trailing whitespace removed. */
const BENCH_PROMPT = benchPrompt.trim();
30
+
31
/** Parsed input for the bench command. */
export interface BenchCommandArgs {
  /** Model selectors to benchmark; an empty list is rejected by `runBenchCommand`. */
  models: string[];
  /** Optional flags; omitted values fall back to built-in defaults. */
  flags: {
    /** Requests per model; must be a positive integer when provided. */
    runs?: number;
    /** Output-token cap per request; must be a positive integer when provided. */
    maxTokens?: number;
    /** Custom prompt text; a blank value falls back to the bundled prompt. */
    prompt?: string;
    /** When exactly `true`, the summary is written as JSON instead of text. */
    json?: boolean;
  };
}
40
+
41
/**
 * Registry surface required by the bench command. `getApiKey` and `resolver`
 * are called directly by `runBenchCommand`; the whole object is additionally
 * handed to `resolveCliModel` for selector resolution.
 */
export interface BenchModelRegistry {
  getAll(): Model<Api>[];
  /** Resolves to `undefined` when no credential is available for the model. */
  getApiKey(model: Model<Api>, sessionId?: string): Promise<string | undefined>;
  /** Produces the key resolver that is passed to the stream call as `apiKey`. */
  resolver(model: ApiKeyResolverModel, sessionId?: string): ApiKeyResolver;
  resolveCanonicalModel?(canonicalId: string, options?: CanonicalModelQueryOptions): Model<Api> | undefined;
  getCanonicalVariants?(canonicalId: string, options?: CanonicalModelQueryOptions): CanonicalModelVariant[];
  getCanonicalId?(model: Model<Api>): string | undefined;
}
49
+
50
/** Resources a bench invocation works with. */
export interface BenchRuntime {
  /** Registry used to resolve selectors and credentials. */
  modelRegistry: BenchModelRegistry;
  /** Settings consulted for model-match preferences, when available. */
  settings?: Settings;
  /** Optional cleanup hook; `runBenchCommand` invokes it in a `finally` block. */
  close?: () => void;
}
55
+
56
/** Measurements from one request that completed without an error. */
export interface BenchRunSuccess {
  ok: true;
  /** Time to first token, in milliseconds. */
  ttftMs: number;
  /** Total request duration, in milliseconds. */
  durationMs: number;
  /** Output token count taken from the response usage. */
  outputTokens: number;
  /** Generation throughput measured over the post-first-token window. */
  tokensPerSecond: number;
}
64
+
65
/** Outcome of one request that failed. */
export interface BenchRunFailure {
  ok: false;
  /** Failure description shown in the per-run output. */
  error: string;
}
69
+
70
/** Outcome of a single request, discriminated by the `ok` flag. */
export type BenchRunResult = BenchRunSuccess | BenchRunFailure;
71
+
72
/** Arithmetic means of each metric over a model's successful runs. */
export interface BenchAverages {
  ttftMs: number;
  durationMs: number;
  outputTokens: number;
  tokensPerSecond: number;
}
78
+
79
/** Everything recorded for one benchmarked model. */
export interface BenchModelReport {
  /** Selector as the user typed it (e.g. "opus" or "gemini-3.5:low"). */
  selector: string;
  /** Resolved `provider/id`. */
  model: string;
  /** Explicit thinking level from a `:level` selector suffix; undefined = provider default. */
  thinking?: ResolvedThinkingLevel;
  /** One entry per attempted run, in execution order. */
  results: BenchRunResult[];
  /** Averages over successful runs; null when every run failed. */
  average: BenchAverages | null;
}
90
+
91
/** Result of a whole bench invocation; also the shape written in JSON mode. */
export interface BenchSummary {
  /** Requests attempted per model (after defaulting). */
  runs: number;
  /** Output-token cap requested per run (after defaulting). */
  maxTokens: number;
  /** One report per resolved model, in selector order. */
  models: BenchModelReport[];
  /** Total number of failed runs across all models. */
  failures: number;
}
97
+
98
/** Signature of the streaming call used to issue a benchmark request. */
type BenchStreamSimple = (
  model: Model<Api>,
  context: Context,
  options?: SimpleStreamOptions,
) => AssistantMessageEventStream;
103
+
104
/**
 * Injection points for `runBenchCommand`. Every member is optional; an omitted
 * member is replaced by a process-level default inside `runBenchCommand`.
 */
export interface BenchDependencies {
  /** Builds the runtime (registry, settings, cleanup hook). */
  createRuntime?: () => Promise<BenchRuntime>;
  /** Produces the session id used for each run. */
  randomSessionId?: () => string;
  writeStdout?: (text: string) => void;
  writeStderr?: (text: string) => void;
  /** Called with 1 when at least one run failed. */
  setExitCode?: (code: number) => void;
  /** Streaming implementation used to issue requests. */
  streamSimple?: BenchStreamSimple;
  /** Clock used for timing measurements. */
  now?: () => number;
  /** Overrides TTY detection that gates the in-place progress line. */
  stdoutIsTTY?: boolean;
}
114
+
115
/**
 * Turns an arbitrary thrown value into display text.
 *
 * @param error - Any caught value.
 * @returns The `message` of an `Error` when it is non-empty, otherwise the
 *   value's string conversion.
 */
function getErrorMessage(error: unknown): string {
  const hasMessage = error instanceof Error && Boolean(error.message);
  return hasMessage ? (error as Error).message : String(error);
}
119
+
120
/**
 * Validates an optional numeric flag.
 *
 * @param name - Flag name without the leading dashes; used in the error text.
 * @param value - Value supplied by the caller, or `undefined` when omitted.
 * @param fallback - Returned when `value` is `undefined`.
 * @returns `value` when it is an integer greater than zero, else `fallback`
 *   for an omitted value.
 * @throws Error when `value` is present but not a positive integer.
 */
function normalizePositiveInteger(name: string, value: number | undefined, fallback: number): number {
  if (value === undefined) {
    return fallback;
  }
  const isPositiveInteger = Number.isInteger(value) && value > 0;
  if (isPositiveInteger) {
    return value;
  }
  throw new Error(`Expected --${name} to be a positive integer, got ${value}`);
}
127
+
128
/**
 * Reports whether a stream event carries generated content and can therefore
 * mark the time-to-first-token instant.
 *
 * Delta events count when their `delta` is non-empty; text/thinking end events
 * count when their `content` is non-empty. Every other event type returns false.
 */
function isFirstTokenEvent(event: AssistantMessageEvent): boolean {
  if (event.type === "text_delta" || event.type === "thinking_delta" || event.type === "toolcall_delta") {
    return event.delta.length > 0;
  }
  if (event.type === "text_end" || event.type === "thinking_end") {
    return event.content.length > 0;
  }
  return false;
}
141
+
142
/**
 * Tokens per second over the generation window (duration minus TTFT), so that
 * queue/prefill latency does not dilute throughput.
 *
 * When that window is not positive (TTFT is at least the duration), the total
 * duration is used instead. Returns 0 when the chosen window is not positive.
 *
 * @param outputTokens - Number of generated tokens.
 * @param durationMs - Total request duration in milliseconds.
 * @param ttftMs - Time to first token in milliseconds.
 */
function computeTokensPerSecond(outputTokens: number, durationMs: number, ttftMs: number): number {
  let windowMs = durationMs - ttftMs;
  if (!(windowMs > 0)) {
    windowMs = durationMs;
  }
  if (!(windowMs > 0)) {
    return 0;
  }
  return (outputTokens * 1000) / windowMs;
}
152
+
153
/** Per-request inputs consumed by `runBenchRequest`. */
interface BenchRequestOptions {
  /** Key resolver forwarded to the stream call. */
  apiKey: ApiKeyResolver;
  /** Session id forwarded to the stream call. */
  sessionId: string;
  /** User message text sent to the model. */
  prompt: string;
  /** Requested output-token cap; further limited by the model's own cap when it has one. */
  maxTokens: number;
  /** Explicit effort from a `:level` selector suffix; absent = provider default. */
  reasoning?: Effort;
  /** Only set for an explicit `:off` suffix — some endpoints reject disablement. */
  disableReasoning?: boolean;
}
163
+
164
/** Returns `value` when it is finite and greater than zero, otherwise 0. */
function finitePositiveOrZero(value: number): number {
  return Number.isFinite(value) && value > 0 ? value : 0;
}

/**
 * Issues one benchmark request and measures it.
 *
 * Never throws: stream error events, error stop reasons and thrown exceptions
 * are all converted into a failure result.
 *
 * @param model - Model to query.
 * @param options - Prompt, credentials and generation limits for this run.
 * @param streamFn - Streaming implementation used to issue the request.
 * @param now - Clock used for the locally measured timings.
 * @returns Timing and token metrics on success, or the failure message.
 */
async function runBenchRequest(
  model: Model<Api>,
  options: BenchRequestOptions,
  streamFn: BenchStreamSimple,
  now: () => number,
): Promise<BenchRunResult> {
  const startedAt = now();
  let firstTokenAt: number | undefined;
  try {
    const context: Context = {
      // Codex's Responses endpoint 400s with "Instructions are required" when no
      // system prompt is present — same guard as eval's completion bridge.
      systemPrompt: ["You are a helpful assistant."],
      messages: [{ role: "user", content: options.prompt, timestamp: Date.now(), attribution: "user" }],
    };
    // Never ask for more tokens than the model advertises, when it advertises a usable cap.
    const modelHasCap = Number.isFinite(model.maxTokens) && model.maxTokens > 0;
    const cappedMaxTokens = modelHasCap ? Math.min(options.maxTokens, model.maxTokens) : options.maxTokens;
    // pi-ai opts every OpenRouter request into response caching (1h TTL).
    // Bench sends a byte-identical request each run, so within the TTL
    // OpenRouter replays the cached generation with zeroed usage — the run
    // shows "tokens 0, TPS 0.0" at line speed. Opt back out so every run
    // measures a fresh generation.
    const headers = model.provider === "openrouter" ? { "X-OpenRouter-Cache": "false" } : undefined;
    const stream = streamFn(model, context, {
      apiKey: options.apiKey,
      sessionId: options.sessionId,
      maxTokens: cappedMaxTokens,
      reasoning: options.reasoning,
      disableReasoning: options.disableReasoning,
      headers,
    });

    let finalMessage: AssistantMessage | undefined;
    for await (const event of stream) {
      if (firstTokenAt === undefined && isFirstTokenEvent(event)) {
        firstTokenAt = now();
      }
      if (event.type === "error") {
        return { ok: false, error: event.error.errorMessage ?? "request failed" };
      }
      if (event.type === "done") {
        finalMessage = event.message;
      }
    }

    // No "done" event was seen: fall back to the stream's final result.
    const message = finalMessage ?? (await stream.result());
    if (message.stopReason === "error" || message.errorMessage) {
      return { ok: false, error: message.errorMessage ?? "request failed" };
    }

    // Prefer timings reported on the message; otherwise use the local clock.
    const durationMs = finitePositiveOrZero(message.duration ?? now() - startedAt);
    const locallyMeasuredTtft = firstTokenAt === undefined ? durationMs : firstTokenAt - startedAt;
    const ttftMs = finitePositiveOrZero(message.ttft ?? locallyMeasuredTtft);
    const outputTokens = finitePositiveOrZero(message.usage.output);
    return {
      ok: true,
      ttftMs,
      durationMs,
      outputTokens,
      tokensPerSecond: computeTokensPerSecond(outputTokens, durationMs, ttftMs),
    };
  } catch (error) {
    return { ok: false, error: getErrorMessage(error) };
  }
}
227
+
228
/**
 * Assembles the report for one model from its run results.
 *
 * @param selector - Selector as typed by the user.
 * @param model - Resolved model; stored in its formatted string form.
 * @param thinking - Explicit thinking level, if the selector carried one.
 * @param results - Outcomes of every attempted run.
 * @returns The report, with `average` set to null when no run succeeded.
 */
function buildModelReport(
  selector: string,
  model: Model<Api>,
  thinking: ResolvedThinkingLevel | undefined,
  results: BenchRunResult[],
): BenchModelReport {
  const successes = results.filter((result): result is BenchRunSuccess => result.ok);

  // Mean of one metric across the successful runs.
  const mean = (pick: (run: BenchRunSuccess) => number): number => {
    let total = 0;
    for (const run of successes) {
      total += pick(run);
    }
    return total / successes.length;
  };

  let average: BenchAverages | null = null;
  if (successes.length > 0) {
    average = {
      ttftMs: mean(run => run.ttftMs),
      durationMs: mean(run => run.durationMs),
      outputTokens: mean(run => run.outputTokens),
      tokensPerSecond: mean(run => run.tokensPerSecond),
    };
  }
  return { selector, model: formatModelString(model), thinking, results, average };
}
246
+
247
/** Formats a millisecond value for display after rounding it and clamping negatives to zero. */
function formatMs(ms: number): string {
  const wholeMs = Math.round(ms);
  return formatDuration(Math.max(0, wholeMs));
}
250
+
251
/**
 * Renders the one-line summary for a single run.
 *
 * @param result - Outcome of the run.
 * @param index - Zero-based run index; displayed one-based.
 * @param total - Total number of runs for the model.
 * @returns A check-marked metrics line on success, or a cross-marked line with
 *   the error flattened to one line and truncated to `ERROR_WIDTH`.
 */
function formatRunLine(result: BenchRunResult, index: number, total: number): string {
  const prefix = chalk.dim(`run ${index + 1}/${total}`);
  if (!result.ok) {
    const singleLine = replaceTabs(result.error).replace(/\r?\n/g, " ");
    const shown = chalk.red(truncateToWidth(singleLine, ERROR_WIDTH));
    return ` ${chalk.red("✗")} ${prefix} ${shown}`;
  }
  const cells = [
    chalk.green("✓"),
    prefix,
    chalk.dim("TTFT"),
    formatMs(result.ttftMs),
    chalk.dim("TPS"),
    `${result.tokensPerSecond.toFixed(1)}/s`,
    chalk.dim("tokens"),
    `${result.outputTokens}`,
    chalk.dim("total"),
    formatMs(result.durationMs),
  ];
  return ` ${cells.join(" ")}`;
}
258
+
259
/**
 * Renders the comparison table shown after all runs have finished.
 *
 * Models are ranked by average tokens/s, highest first; models with no
 * successful run are placed after those that have one and show "-" in every
 * metric column. Rows with failed runs get a red "(N failed)" suffix. The
 * header line is dimmed and the returned text ends with a newline.
 *
 * Column widths are computed once per column rather than once per cell, so the
 * rows are no longer rescanned for every cell that is padded.
 *
 * @param summary - Bench summary whose `models` are tabulated; not mutated.
 * @returns The formatted table.
 */
export function formatBenchTable(summary: BenchSummary): string {
  // Copy before sorting so the caller's array order is left untouched.
  const ranked = [...summary.models].sort((a, b) => {
    if (a.average === null && b.average === null) return 0;
    if (a.average === null) return 1;
    if (b.average === null) return -1;
    return b.average.tokensPerSecond - a.average.tokensPerSecond;
  });
  const rows = ranked.map(report => ({
    model: report.model,
    ttft: report.average ? formatMs(report.average.ttftMs) : "-",
    tps: report.average ? `${report.average.tokensPerSecond.toFixed(1)}/s` : "-",
    tokens: report.average ? String(Math.round(report.average.outputTokens)) : "-",
    total: report.average ? formatMs(report.average.durationMs) : "-",
    failed: report.results.filter(result => !result.ok).length,
  }));
  const headers = { model: "model", ttft: "TTFT", tps: "TPS", tokens: "tokens", total: "total" } as const;
  type ColumnKey = keyof typeof headers;
  const columnOrder: readonly ColumnKey[] = ["model", "ttft", "tps", "tokens", "total"];

  // One pass per column: the width is the longest of the header and every cell.
  const layout = columnOrder.map(key => ({
    key,
    width: Math.max(headers[key].length, ...rows.map(row => row[key].length)),
  }));
  const renderCells = (cells: Record<ColumnKey, string>): string =>
    layout
      .map(({ key, width }) => cells[key].padEnd(width))
      .join(" ")
      .trimEnd();

  const lines = [chalk.dim(renderCells(headers))];
  for (const row of rows) {
    const failedSuffix = row.failed > 0 ? ` ${chalk.red(`(${row.failed} failed)`)}` : "";
    lines.push(renderCells(row) + failedSuffix);
  }
  return `${lines.join("\n")}\n`;
}
304
+
305
/**
 * Builds the runtime used when the caller injects none: discovered auth
 * storage, settings initialised for the project directory, and a model
 * registry backed by that storage.
 *
 * The auth storage is closed before rethrowing if setup fails after it was
 * opened; on success the returned `close` hook closes it.
 */
async function createDefaultRuntime(): Promise<BenchRuntime> {
  const authStorage = await discoverAuthStorage();
  try {
    const cwd = getProjectDir();
    const settings = await Settings.init({ cwd });
    const modelRegistry = new ModelRegistry(authStorage);
    const runtime: BenchRuntime = {
      modelRegistry,
      settings,
      close: () => authStorage.close(),
    };
    return runtime;
  } catch (error) {
    authStorage.close();
    throw error;
  }
}
320
+
321
/** A selector paired with the model and thinking level it resolved to. */
interface BenchTarget {
  /** Selector as typed by the user. */
  selector: string;
  /** Model the selector resolved to. */
  model: Model<Api>;
  /** Thinking level resolved for the model; may be undefined. */
  thinking: ResolvedThinkingLevel | undefined;
}
326
+
327
/**
 * Resolves every selector to a concrete model before any request is sent.
 *
 * All selectors are attempted so that the thrown error lists every selector
 * that failed rather than only the first. Resolution warnings are written to
 * stderr and do not prevent the model from being used.
 *
 * @param selectors - Selectors as typed by the user.
 * @param modelRegistry - Registry handed to the resolver.
 * @param settings - Settings used to derive match preferences, if any.
 * @param writeStderr - Sink for warning lines.
 * @returns One target per selector, in input order.
 * @throws Error when at least one selector could not be resolved.
 */
function resolveBenchModels(
  selectors: string[],
  modelRegistry: BenchModelRegistry,
  settings: Settings | undefined,
  writeStderr: (text: string) => void,
): BenchTarget[] {
  const preferences = getModelMatchPreferences(settings);
  const targets: BenchTarget[] = [];
  const problems: string[] = [];

  for (const selector of selectors) {
    const outcome = resolveCliModel({ cliModel: selector, modelRegistry, preferences });
    if (outcome.error) {
      problems.push(`${selector}: ${outcome.error}`);
    } else if (!outcome.model) {
      problems.push(`${selector}: model not found`);
    } else {
      if (outcome.warning) {
        writeStderr(`${chalk.yellow(`Warning: ${outcome.warning}`)}\n`);
      }
      const thinking = resolveThinkingLevelForModel(outcome.model, outcome.thinkingLevel);
      targets.push({ selector, model: outcome.model, thinking });
    }
  }

  if (problems.length > 0) {
    const noun = problems.length === 1 ? "model" : "models";
    throw new Error(`Could not resolve ${noun}:\n${problems.join("\n")}`);
  }
  return targets;
}
358
+
359
/**
 * Entry point of the bench command: resolves the selectors, runs the requested
 * number of requests against each model sequentially, prints progress, and
 * returns the collected summary.
 *
 * In text mode a heading and one line per run are written for each model, with
 * a comparison table at the end when more than one model or more than one run
 * was requested. In JSON mode only the summary is written. The exit code is
 * set to 1 when any run failed. The runtime's `close` hook, if present, is
 * always invoked.
 *
 * @param command - Selectors and flags.
 * @param deps - Optional overrides for I/O, clock, runtime and streaming.
 * @returns The summary of all runs.
 * @throws Error when a numeric flag is invalid, no selector was given, or a
 *   selector cannot be resolved.
 */
export async function runBenchCommand(command: BenchCommandArgs, deps: BenchDependencies = {}): Promise<BenchSummary> {
  const runs = normalizePositiveInteger("runs", command.flags.runs, DEFAULT_RUNS);
  const maxTokens = normalizePositiveInteger("max-tokens", command.flags.maxTokens, DEFAULT_MAX_TOKENS);
  // `||` on purpose: a blank prompt falls back to the bundled one.
  const prompt = command.flags.prompt?.trim() || BENCH_PROMPT;
  const json = command.flags.json === true;

  const randomSessionId = deps.randomSessionId ?? (() => Bun.randomUUIDv7());
  const writeStdout = deps.writeStdout ?? ((text: string) => process.stdout.write(text));
  const writeStderr = deps.writeStderr ?? ((text: string) => process.stderr.write(text));
  const setExitCode =
    deps.setExitCode ??
    ((code: number) => {
      process.exitCode = code;
    });
  const streamFn = deps.streamSimple ?? streamSimple;
  const now = deps.now ?? (() => performance.now());
  const interactive = deps.stdoutIsTTY ?? process.stdout.isTTY === true;

  if (command.models.length === 0) {
    throw new Error("Pass at least one model selector, e.g. `omp bench opus gpt-5.2`");
  }

  const createRuntime = deps.createRuntime ?? createDefaultRuntime;
  const runtime = await createRuntime();
  try {
    const registry = runtime.modelRegistry;
    const targets = resolveBenchModels(command.models, registry, runtime.settings, writeStderr);
    const reports: BenchModelReport[] = [];

    for (const target of targets) {
      const { selector, model, thinking } = target;
      if (!json) {
        const label = formatModelString(model);
        // Show the typed selector only when it differs from the resolved id.
        const resolvedNote = selector === label ? "" : chalk.dim(` (${selector})`);
        writeStdout(`${chalk.bold(label)}${resolvedNote}\n`);
      }

      const results: BenchRunResult[] = [];
      for (let runIndex = 0; runIndex < runs; runIndex += 1) {
        const sessionId = randomSessionId();
        const initialKey = await registry.getApiKey(model, sessionId);
        if (!initialKey) {
          const failure: BenchRunFailure = {
            ok: false,
            error: `No credentials for provider "${model.provider}". Run \`omp\` and use /login, or set the provider API key.`,
          };
          results.push(failure);
          if (!json) writeStdout(`${formatRunLine(failure, runIndex, runs)}\n`);
          break; // remaining runs would fail identically
        }

        const showProgress = !json && interactive;
        if (showProgress) {
          writeStdout(chalk.dim(` … run ${runIndex + 1}/${runs} streaming`));
        }
        const requestOptions: BenchRequestOptions = {
          apiKey: registry.resolver(model, sessionId),
          sessionId,
          prompt,
          maxTokens,
          reasoning: toReasoningEffort(thinking),
          disableReasoning: shouldDisableReasoning(thinking) ? true : undefined,
        };
        const result = await runBenchRequest(model, requestOptions, streamFn, now);
        results.push(result);
        if (!json) {
          // Erase the in-place progress line before printing the final run line.
          if (interactive) writeStdout("\r\x1b[2K");
          writeStdout(`${formatRunLine(result, runIndex, runs)}\n`);
        }
      }
      reports.push(buildModelReport(selector, model, thinking, results));
    }

    let failures = 0;
    for (const report of reports) {
      failures += report.results.filter(result => !result.ok).length;
    }
    const summary: BenchSummary = { runs, maxTokens, models: reports, failures };

    if (json) {
      writeStdout(`${JSON.stringify(summary, null, 2)}\n`);
    } else if (reports.length > 1 || runs > 1) {
      writeStdout(`\n${formatBenchTable(summary)}`);
    }
    if (failures > 0) setExitCode(1);
    return summary;
  } finally {
    runtime.close?.();
  }
}
@@ -16,6 +16,7 @@ export const commands: CommandEntry[] = [
16
16
  { name: "auth-broker", load: () => import("./commands/auth-broker").then(m => m.default) },
17
17
  { name: "auth-gateway", load: () => import("./commands/auth-gateway").then(m => m.default) },
18
18
  { name: "agents", load: () => import("./commands/agents").then(m => m.default) },
19
+ { name: "bench", load: () => import("./commands/bench").then(m => m.default) },
19
20
  { name: "commit", load: () => import("./commands/commit").then(m => m.default) },
20
21
  { name: "completions", load: () => import("./commands/completions").then(m => m.default) },
21
22
  { name: "__complete", load: () => import("./commands/complete").then(m => m.default) },
@@ -0,0 +1,42 @@
1
+ import { Args, Command, Flags } from "@oh-my-pi/pi-utils/cli";
2
+ import { runBenchCommand } from "../cli/bench-cli";
3
+
4
/**
 * CLI command wrapper for the benchmark: declares the accepted arguments and
 * flags, then forwards the parsed values to `runBenchCommand`.
 */
export default class Bench extends Command {
  static description =
    "Benchmark models with the same prompt: time-to-first-token and generation throughput (tokens/s)";

  static args = {
    models: Args.string({
      description: "Model selectors (provider/model or fuzzy id, e.g. opus)",
      required: true,
      multiple: true,
    }),
  };

  static flags = {
    runs: Flags.integer({ description: "Requests per model (results are averaged)", default: 1 }),
    "max-tokens": Flags.integer({ description: "Max output tokens per request", default: 512 }),
    prompt: Flags.string({ description: "Custom prompt text (default: bundled bench prompt)" }),
    json: Flags.boolean({ description: "Output JSON" }),
  };

  static examples = [
    "# Compare two models\n omp bench anthropic/claude-opus-4-5 openai/gpt-5.2",
    "# Fuzzy selectors work\n omp bench opus sonnet",
    "# Average over 3 runs each\n omp bench opus gpt-5.2 --runs 3",
    "# Machine-readable output\n omp bench opus --json",
  ];

  /** Parses the command line and runs the benchmark with the parsed values. */
  async run(): Promise<void> {
    const parsed = await this.parse(Bench);
    const { runs, prompt, json } = parsed.flags;
    const maxTokens = parsed.flags["max-tokens"];
    // An absent positional list is passed on as empty; the runner rejects it.
    const models = parsed.args.models ?? [];
    await runBenchCommand({ models, flags: { runs, maxTokens, prompt, json } });
  }
}
@@ -20,6 +20,11 @@ import {
20
20
  UNK_CONTEXT_WINDOW,
21
21
  UNK_MAX_TOKENS,
22
22
  } from "@oh-my-pi/pi-catalog/provider-models";
23
+ import {
24
+ collapseBuiltModelVariants,
25
+ getVariantAliasSources,
26
+ resolveVariantAlias,
27
+ } from "@oh-my-pi/pi-catalog/variant-collapse";
23
28
 
24
29
  // Sentinel for local-only OAuth token (LM Studio, vLLM) — declared inline to avoid loading
25
30
  // any provider module at startup. Must match `DEFAULT_LOCAL_TOKEN` in oauth/lm-studio.ts.
@@ -542,7 +547,37 @@ function normalizeSuppressedSelector(selector: string): string {
542
547
  if (!trimmed) return trimmed;
543
548
  const parsed = parseModelString(trimmed);
544
549
  if (!parsed) return trimmed;
545
- return `${parsed.provider}/${parsed.id}`;
550
+ // Retired effort-tier variant ids normalize to their collapsed logical id
551
+ // so persisted suppressions keyed by raw member ids still bind.
552
+ const aliasId = resolveVariantAlias(parsed.provider, parsed.id);
553
+ return `${parsed.provider}/${aliasId ?? parsed.id}`;
554
+ }
555
+
556
/**
 * Finds the override that applies to a model.
 *
 * An entry keyed by the model's own id always wins. Otherwise the ids returned
 * by `getVariantAliasSources` for this model (retired effort-tier variant ids,
 * e.g. from a models.yml authored before collapsing) are tried in order. A
 * retired id is skipped while a live model still holds it, so a raw key only
 * re-binds when nothing else claims that id.
 *
 * @param overrides - Overrides for the model's provider, keyed by model id.
 * @param model - Model whose override is being looked up.
 * @param hasLiveModel - Reports whether a live model holds the given provider/id.
 * @returns The matching override, or undefined when none applies.
 */
function resolveModelOverrideWithAliases(
  overrides: Map<string, ModelOverride>,
  model: Model<Api>,
  hasLiveModel: (provider: string, id: string) => boolean,
): ModelOverride | undefined {
  const exact = overrides.get(model.id);
  if (exact) {
    return exact;
  }
  for (const retiredId of getVariantAliasSources(model.provider, model.id)) {
    const candidate = hasLiveModel(model.provider, retiredId) ? undefined : overrides.get(retiredId);
    if (!candidate) {
      continue;
    }
    logger.debug("model override re-keyed through variant alias", {
      provider: model.provider,
      from: retiredId,
      to: model.id,
    });
    return candidate;
  }
  return undefined;
}
547
582
 
548
583
  function getDisabledProviderIdsFromSettings(): Set<string> {
@@ -799,7 +834,9 @@ export class ModelRegistry {
799
834
  const withConfigModels = this.#mergeCustomModels(resolvedDefaults, this.#customModelOverlays);
800
835
  // Merge runtime extension models so they survive refresh() cycles
801
836
  const combined = this.#mergeCustomModels(withConfigModels, this.#runtimeModelOverlays);
802
- const withModelOverrides = this.#applyModelOverrides(combined, this.#modelOverrides);
837
+ // Custom/config providers bypass the model-manager merge point —
838
+ // collapse effort-tier variants here so X/X-thinking twins fold.
839
+ const withModelOverrides = this.#applyModelOverrides(collapseBuiltModelVariants(combined), this.#modelOverrides);
803
840
  this.#models = this.#applyRuntimeProviderOverrides(withModelOverrides);
804
841
  this.#rebuildCanonicalIndex();
805
842
  this.#lastStaticLoadMtime = this.#modelsConfigFile.getMtimeMs();
@@ -1152,7 +1189,7 @@ export class ModelRegistry {
1152
1189
  const withConfigModels = this.#mergeCustomModels(resolved, this.#customModelOverlays);
1153
1190
  // Merge runtime extension models so they survive online discovery completion
1154
1191
  const combined = this.#mergeCustomModels(withConfigModels, this.#runtimeModelOverlays);
1155
- const withModelOverrides = this.#applyModelOverrides(combined, this.#modelOverrides);
1192
+ const withModelOverrides = this.#applyModelOverrides(collapseBuiltModelVariants(combined), this.#modelOverrides);
1156
1193
  this.#models = this.#applyRuntimeProviderOverrides(withModelOverrides);
1157
1194
  this.#rebuildCanonicalIndex();
1158
1195
  }
@@ -1398,8 +1435,13 @@ export class ModelRegistry {
1398
1435
  #applyProviderModelOverrides(provider: string, models: Model<Api>[]): Model<Api>[] {
1399
1436
  const overrides = this.#modelOverrides.get(provider);
1400
1437
  if (!overrides || overrides.size === 0) return models;
1438
+ let liveIds: Set<string> | null = null;
1439
+ const hasLiveModel = (_provider: string, id: string) => {
1440
+ liveIds ??= new Set(models.map(m => m.id));
1441
+ return liveIds.has(id);
1442
+ };
1401
1443
  return models.map(model => {
1402
- const override = overrides.get(model.id);
1444
+ const override = resolveModelOverrideWithAliases(overrides, model, hasLiveModel);
1403
1445
  if (!override) return model;
1404
1446
  return applyModelOverride(model, override);
1405
1447
  });
@@ -1443,10 +1485,15 @@ export class ModelRegistry {
1443
1485
  }
1444
1486
  #applyModelOverrides(models: Model<Api>[], overrides: Map<string, Map<string, ModelOverride>>): Model<Api>[] {
1445
1487
  if (overrides.size === 0) return models;
1488
+ let liveKeys: Set<string> | null = null;
1489
+ const hasLiveModel = (provider: string, id: string) => {
1490
+ liveKeys ??= new Set(models.map(m => `${m.provider}\u0000${m.id}`));
1491
+ return liveKeys.has(`${provider}\u0000${id}`);
1492
+ };
1446
1493
  return models.map(model => {
1447
1494
  const providerOverrides = overrides.get(model.provider);
1448
1495
  if (!providerOverrides) return model;
1449
- const override = providerOverrides.get(model.id);
1496
+ const override = resolveModelOverrideWithAliases(providerOverrides, model, hasLiveModel);
1450
1497
  if (!override) return model;
1451
1498
  return applyModelOverride(model, override);
1452
1499
  });
@@ -3,8 +3,9 @@
3
3
  *
4
4
  * Layering:
5
5
  * - `matchModel` is the single matching engine. Order: exact `provider/id`
6
- * reference (with OpenRouter routed/date fallbacks) → exact canonical id →
7
- * exact bare id → provider-scoped fuzzy → substring with alias-vs-dated pick.
6
+ * reference (with variant-alias and OpenRouter routed/date fallbacks) →
7
+ * exact canonical id → exact bare id → retired variant alias →
8
+ * provider-scoped fuzzy → substring with alias-vs-dated pick.
8
9
  * - `parseModelPatternWithContext`/`parseModelPattern` layer the selector
9
10
  * grammar on top: trailing `:level` thinking suffixes (`splitThinkingSuffix`)
10
11
  * and `@upstream` provider routing (`splitUpstreamRouting`).
@@ -19,9 +20,11 @@ import type { Api, Effort, KnownProvider, Model, ModelSpec } from "@oh-my-pi/pi-
19
20
  import { buildModel } from "@oh-my-pi/pi-catalog/build";
20
21
  import { modelMatchesHost } from "@oh-my-pi/pi-catalog/hosts";
21
22
  import { buildModelProviderPriorityRank } from "@oh-my-pi/pi-catalog/identity";
23
+ import { stripThinkingVariantToken } from "@oh-my-pi/pi-catalog/identity/family";
22
24
  import { clampThinkingLevelForModel } from "@oh-my-pi/pi-catalog/model-thinking";
23
25
  import { modelsAreEqual } from "@oh-my-pi/pi-catalog/models";
24
26
  import { DEFAULT_MODEL_PER_PROVIDER } from "@oh-my-pi/pi-catalog/provider-models";
27
+ import { resolveBareVariantAlias, resolveVariantAlias } from "@oh-my-pi/pi-catalog/variant-collapse";
25
28
  import { fuzzyMatch } from "@oh-my-pi/pi-tui";
26
29
  import { logger } from "@oh-my-pi/pi-utils";
27
30
  import chalk from "chalk";
@@ -228,6 +231,18 @@ export function resolveProviderModelReference(
228
231
  return exact;
229
232
  }
230
233
 
234
+ // Retired effort-tier variant ids resolve to their collapsed logical
235
+ // model: hand-table aliases first, then the `X-thinking` → `X` grammar
236
+ // for auto-derived pairs. Exact lookup above always wins while raw is live.
237
+ const variantAliasId =
238
+ resolveVariantAlias(normalizedProvider, normalizedModelId) ?? stripThinkingVariantToken(normalizedModelId);
239
+ if (variantAliasId) {
240
+ const aliased = index.get(`${normalizedProvider}\u0000${variantAliasId.toLowerCase()}`);
241
+ if (aliased) {
242
+ return aliased;
243
+ }
244
+ }
245
+
231
246
  if (normalizedProvider !== "openrouter") {
232
247
  return undefined;
233
248
  }
@@ -407,11 +422,13 @@ function findExactCanonicalModelMatch(
407
422
 
408
423
  /**
409
424
  * The single model-matching engine. Tries, in order:
410
- * 1. exact `provider/id` reference (OpenRouter routed/date fallbacks included),
425
+ * 1. exact `provider/id` reference (variant-alias and OpenRouter routed/date
426
+ * fallbacks included),
411
427
  * 2. exact canonical id (coalesces provider variants),
412
428
  * 3. exact bare id (preference-ranked),
413
- * 4. provider-scoped fuzzy match,
414
- * 5. substring match with the alias-vs-dated pick.
429
+ * 4. retired effort-tier variant alias (collapsed catalog entries),
430
+ * 5. provider-scoped fuzzy match,
431
+ * 6. substring match with the alias-vs-dated pick.
415
432
  * Returns the matched model or undefined if no match found.
416
433
  */
417
434
  function matchModel(
@@ -440,6 +457,20 @@ function matchModel(
440
457
  if (exactMatches.length > 0) {
441
458
  return pickPreferredModel(exactMatches, context);
442
459
  }
460
+
461
+ // Retired effort-tier variant ids (bare, no provider prefix) resolve to
462
+ // their collapsed logical model; models from the providers whose table
463
+ // declared the alias win ties. Auto-derived `X-thinking` pairs resolve
464
+ // through the grammar fallback.
465
+ const bareAlias = resolveBareVariantAlias(modelPattern);
466
+ const bareAliasTargetId = bareAlias?.id ?? stripThinkingVariantToken(modelPattern);
467
+ if (bareAliasTargetId) {
468
+ const aliasMatches = availableModels.filter(m => m.id.toLowerCase() === bareAliasTargetId.toLowerCase());
469
+ if (aliasMatches.length > 0) {
470
+ const preferred = bareAlias ? aliasMatches.filter(m => bareAlias.providers.includes(m.provider)) : [];
471
+ return pickPreferredModel(preferred.length > 0 ? preferred : aliasMatches, context);
472
+ }
473
+ }
443
474
  // Check for provider/modelId format — fuzzy match within provider only.
444
475
  const slashIndex = modelPattern.indexOf("/");
445
476
  if (slashIndex !== -1) {