pi-model-auto 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,916 @@
1
+ import type { Api, Context, Model, SimpleStreamOptions, Usage } from "@earendil-works/pi-ai";
2
+ import { CANONICAL_MODELS, findRampModel, type CanonicalMeta, type CanonicalScores, type CostTier, type ModelProfile } from "./canonical-models.ts";
3
+ import { DEFAULT_QUOTA_CONFIG, type QuotaConfig } from "./quota.ts";
4
+
5
/** Coarse routing tier: the cheap end or the strong end of the pool. */
export type Tier = "cheap" | "strong";
/** Class of a routing decision: one of the tiers, or `"model"` when a specific model was forced (see `decide`). */
export type RouteClass = Tier | "model";
/** How much trust to place in a model's resolved capability/cost metadata. */
export type Confidence = "high" | "medium" | "low";
/** Which benchmark drives every model's capability + cost. The two are never merged; selection is wholesale. */
export type CapabilitySource = "aa" | "ramp";
/** Task hardness, ordered. Sets how far up the capability frontier selection climbs (the willingness budget). */
export type Hardness = "trivial" | "normal" | "hard" | "max";
/** Hardness levels in ascending order; `Decision.hardnessBucket` is an index into this array. */
export const HARDNESS_ORDER: Hardness[] = ["trivial", "normal", "hard", "max"];
13
+
14
/**
 * Fallback capability numbers for models with no canonical match and no override.
 * `resolveCanonicalModel` also uses them when a canonical model has no Ramp result under `ramp`.
 */
const FALLBACK_INTELLIGENCE = 25;
/** Fallback cost-axis value paired with `FALLBACK_INTELLIGENCE`. */
const FALLBACK_PRICE = 3;
17
+
18
/** Result of matching a model key against the canonical table for one capability source (see `resolveCanonicalModel`). */
export interface CanonicalResolution {
  /** The matched canonical entry, or `null` when no canonical key is contained in the normalized model key. */
  canonical: CanonicalMeta | null;
  /** Cost bucket; `"unknown"` when there is no canonical match. */
  costTier: CostTier;
  /** Profiles the model is suited for; `["balanced"]` when there is no canonical match. */
  profiles: ModelProfile[];
  /** Whether the model counts as a frontier (strong-pool) candidate. */
  frontier: boolean;
  /** Capability axis value: Ramp resolve-rate under `ramp`, the canonical intelligence index under `aa`, else the fallback. */
  intelligence: number;
  /** Cost axis value: Ramp cost per task under `ramp`, the canonical blended price under `aa`, else the fallback. */
  priceBlended: number;
  /** Per-profile scores, when the source provides them. */
  scores?: CanonicalScores;
  /** Throughput figure from the canonical entry; left undefined under `ramp`. */
  tps?: number;
  /** Whether the active capability source has data for this model. Unsupported models are not auto-routed. */
  supported: boolean;
  /** `"high"` for a source-backed match, `"low"` for fallback numbers. */
  confidence: Confidence;
  /** Human-readable explanation of how the resolution was reached. */
  reason: string;
}
32
+
33
/** A registry model joined with its routing metadata (canonical resolution plus any user override); built by `resolveModel`. */
export interface ResolvedModel {
  /** The underlying registry model. */
  model: Model<Api>;
  /** True when the model's `input` list includes `"image"`. */
  acceptsImage: boolean;
  /** Canonical key the model resolved to, or `null` when nothing matched and no override applies. */
  canonicalKey: string | null;
  /** Cost bucket used to partition the pool. */
  costTier: CostTier;
  /** Profiles the model is suited for. */
  profiles: ModelProfile[];
  /** Whether the model counts as a frontier (strong-pool) candidate. */
  frontier: boolean;
  /** Synthetic intelligence index; capability axis for `balanced`/fallback profiles. */
  intelligence: number;
  /** List price $/1M tokens (blended 3:1). The Pareto cost axis; NOT marginal/subscription cost. */
  priceBlended: number;
  /** Per-profile scores consulted by `axisValue` for the `coder` and `deep` profiles. */
  scores?: CanonicalScores;
  /** Throughput figure; the primary sort key for the `fast` profile. */
  tps?: number;
  /** Whether the active capability source covers this model (or a user override does). Drives auto-pool inclusion. */
  supported: boolean;
  /** Trust level of the metadata above. */
  confidence: Confidence;
  /** Human-readable explanation of how the metadata was resolved. */
  matchReason: string;
  /** Time-of-day shadow-price windows, carried through so the price can be re-evaluated per turn
   * (see `repriceForTimeOfDay`) without rebuilding the pool. `priceBlended` here is time-neutral. */
  costCoefHours?: CostCoefWindow[];
}
54
+
55
/**
 * The routable models, partitioned for display and selection (see `buildAutoPool`).
 * The partitions are views over `all`; a model can appear in more than one (e.g. a frontier model
 * is in `strongPool` regardless of its cost tier).
 */
export interface Pool {
  /** Models whose cost tier is `"cheap"`. */
  cheapPool: ResolvedModel[];
  /** Models flagged `frontier` or whose cost tier is `"premium"`. */
  strongPool: ResolvedModel[];
  /** Models whose cost tier is `"standard"`. */
  standardPool: ResolvedModel[];
  /** Models whose cost tier is `"unknown"`. */
  unknownPool: ResolvedModel[];
  /** Every routable model; the set that selection actually draws from. */
  all: ResolvedModel[];
}
62
+
63
/**
 * User-supplied metadata for one model. Every field is optional; in `resolveModel` a field that is
 * set replaces the value resolved from the capability source, and the mere presence of an override
 * makes the model routable.
 */
export interface ModelOverride {
  /** Canonical key to report for the model. */
  canonical?: string;
  /** Cost bucket to use instead of the resolved one. */
  costTier?: CostTier;
  /** Profiles to use instead of the resolved ones. */
  profiles?: ModelProfile[];
  /** Frontier flag to use instead of the resolved one. */
  frontier?: boolean;
  /** Capability axis value to use instead of the resolved one. */
  intelligence?: number;
  /** Base cost-axis value to use instead of the resolved one (still multiplied by `costCoef`). */
  priceBlended?: number;
  /** Per-profile scores to use instead of the resolved ones. */
  scores?: CanonicalScores;
  /** Throughput figure to use instead of the resolved one. */
  tps?: number;
  /**
   * Shadow-price coefficient: multiplies the model's base cost-axis price (Ramp cost-per-task under
   * `ramp`). It folds *my* economics into the shared capability frontier without touching the quality
   * axis, and stays dimensionless so it can never put the axis into a foreign unit. <1 = cheaper to me
   * than Ramp measured (an already-paid subscription, a discounted PAYG deal); >1 = pricier. Default 1
   * = pure Ramp. Keep a shared/finite subscription a *positive* shadow price, not ~0: a near-zero coef
   * makes an already-strong model dominate the whole frontier and starves the cheap PAYG floor.
   */
  costCoef?: number;
  /** Time-of-day multipliers stacked on `costCoef` (e.g. GLM burns 3× quota 14:00–18:00). */
  costCoefHours?: CostCoefWindow[];
}
84
+
85
/** One time-of-day price window: while the current hour is inside `hours`, the price is multiplied by `factor`. */
export interface CostCoefWindow {
  /** [start, end) in local 24h hours; wraps when start > end (e.g. [22, 2] = 22:00–02:00). */
  hours: [number, number];
  /** Multiplier applied while the window is active; factors of overlapping windows multiply together. */
  factor: number;
}
90
+
91
/**
 * Substring filter over the auto pool (see `matchesModelFilter`). Patterns are trimmed and compared
 * case-insensitively against the model's provider, id, name and canonical key.
 */
export interface ModelFilter {
  /** Allow-list: when non-empty, a model must contain at least one of these patterns. Empty allows all. */
  include: string[];
  /** Deny-list: a model containing any of these patterns is dropped, even if it is also included. */
  exclude: string[];
}
95
+
96
/** Full router configuration. `DEFAULT_CONFIG` holds the defaults. */
export interface RouterConfig {
  /** Which benchmark drives capability + cost. `ramp` (default) = real SWE-bench outcomes; `aa` = synthetic. Never merged. */
  capabilitySource: CapabilitySource;
  /** NOTE(review): not read anywhere in this part of the file — presumably a legacy cheap/strong score cut-off; confirm. */
  threshold: number;
  /** Weights of the hardness signals combined by `classify`; each signal is scaled to [0, 1] before weighting. */
  weights: {
    /** Weight of the estimated context size. */
    contextTokens: number;
    /** Weight of the last user message's length. */
    lastUserLen: number;
    /** Weight of the keyword score of the last user message. */
    keyword: number;
    /** Weight of the reasoning flag (set for medium/high/xhigh reasoning). */
    reasoning: number;
    /** Weight of the recent tool-result density. */
    toolDensity: number;
  };
  /** NOTE(review): presumably toggles decision logging; the consumer is outside this part of the file. */
  log: boolean;
  /** NOTE(review): presumably pins a specific model per tier; the consumer is outside this part of the file. */
  tierModels: Partial<Record<Tier, string>>;
  /** Restrict the automatically built pool by provider/id/name/canonical substring. Empty include means allow all. */
  modelFilter: ModelFilter;
  /** User-supplied metadata for unknown/private/local models. Keys may be provider/id, model id, or normalized model id. */
  modelOverrides: Record<string, ModelOverride>;
  /** When true, `selectFromPool` treats `high`/`xhigh` reasoning requests as the top hardness level. */
  forceStrongOnHighReasoning: boolean;
  /**
   * Willingness to pay for capability, by task hardness: the max extra list-price ($/1M) spent for
   * one more point of quality on the chosen axis. Selection walks the Pareto frontier from the
   * cheapest point upward, taking each step whose marginal $/quality-point is within budget — so the
   * hardness signal (driven by reasoning level) positions us on the frontier and steep low-value
   * steps (a near-tie flagship at 2× price) are only taken at `max`. The single routing knob, axis-
   * agnostic. Raise a row to climb further for that hardness; `max: Infinity` = "top of frontier".
   */
  willingness: Record<Hardness, number>;
  /**
   * Cross-turn cache stickiness. Once a model has a warm prompt cache (a "lease"), switching to a
   * freshly-picked model pays a cache-write tax; we only switch when the economics win — cheaper warm
   * reads on a downgrade, or enough capability gain on an upgrade. Layered on top of the Pareto pick.
   */
  cacheAware: {
    /** Master switch; when false `cacheAwareSelect` always returns the fresh pick. */
    enabled: boolean;
    /** Extra USD the downgrade's read savings must beat the switch tax by, before switching down. */
    downgradeMarginUsd: number;
    /** Minimum capability gain (axis points: resolve-rate / intelligence) to switch up. */
    upgradeQualityMargin: number;
    /** USD of switch tax that counts as one required extra quality point when upgrading. */
    upgradeTaxPenaltyScaleUsd: number;
    /** Minimum user turns between model switches. */
    minTurnsBetweenSwitches: number;
  };
  /** Quota settings; the type and its semantics live in `./quota.ts`. */
  quota: QuotaConfig;
}
141
+
142
/** Outcome of `decide`: how hard the turn looks and which class of model it should get. */
export interface Decision {
  /** `"model"` for a forced model, otherwise the forced or inferred tier. */
  cls: RouteClass;
  /** Hardness score in [0, 1] from `classify`; 1 or 0 when the route was forced. */
  score: number;
  /** The forced model name for a forced-model decision; `decide` leaves it empty otherwise. */
  chosen: string;
  /** Task hardness index into HARDNESS_ORDER; sets how far selection climbs the capability frontier. */
  hardnessBucket: number;
  /** Profile inferred from the conversation; not set for a forced model. */
  requestedProfile?: ModelProfile;
  /** Why the decision was forced, when it was. */
  reason?: string;
}
151
+
152
/** A concrete model pick together with the explanation shown to the user. */
export interface Selection {
  /** The model to route this turn to. */
  selected: ResolvedModel;
  /** The profile the pick was made for. */
  profile: ModelProfile;
  /** Human-readable explanation of the pick. */
  reason: string;
  /** `provider/id` keys of the other candidates that were considered. */
  alternatives: string[];
}
158
+
159
/** A warm prompt-cache hold on a model: switching away from it pays a fresh cache-write tax. */
export interface CacheLease {
  /** `provider/id` key of the leased model; compared against `modelKey(...)` of pool entries. */
  modelKey: string;
  /** Provider of the leased model. */
  provider: string;
  /** Raw registry cost fields for the leased model (per-token or per-1M; normalized at use). */
  cost: { input: number; cacheRead: number; cacheWrite: number };
  /** NOTE(review): presumably the number of tokens currently held warm in the cache; the writer is outside this part of the file. */
  warmTokens: number;
  /** NOTE(review): presumably the user-turn index at which the lease was first taken; confirm against the writer. */
  establishedAtTurn: number;
  /** NOTE(review): presumably the user-turn index of the most recent use of the lease; confirm against the writer. */
  lastUsedTurn: number;
}
169
+
170
/** Per-session routing memory for cache-aware stickiness. */
export interface RoutingState {
  /** The current warm-cache lease, if any. */
  lease?: CacheLease;
  /** User-turn index of the last model switch; starts at negative infinity so the first switch is never on cooldown. */
  lastSwitchTurn: number;
  /** NOTE(review): presumably the observed share of prompt tokens served from cache; starts at 0. */
  observedCacheReadRatio: number;
  /** NOTE(review): presumably the accumulated spend per `provider/id` key; starts empty. */
  realizedCostByModel: Record<string, { usd: number }>;
  /** Usage of the previous turn; its `totalTokens`, when positive, is used as the context size in `cacheAwareSelect`. */
  lastUsage?: Usage;
}
178
+
179
/** Why `cacheAwareSelect` kept the lease or switched to the fresh pick. */
export type CacheReason =
  // Cache awareness is turned off in the config.
  | "disabled"
  // There is no lease, or the leased model is no longer in the pool.
  | "no-lease"
  // The fresh pick is the leased model.
  | "same-model"
  // Too few user turns have passed since the last switch; the lease is kept.
  | "switch-cooldown"
  // Switching down: expected savings cover the switch tax plus the margin.
  | "downgrade-break-even"
  // Switching down would not pay for itself; the lease is kept.
  | "downgrade-not-worth-it"
  // Switching up: the quality gain clears the margin plus the tax penalty.
  | "upgrade-quality"
  // Switching up gains too little quality; the lease is kept.
  | "upgrade-not-worth-it";
188
+
189
/** Result of `cacheAwareSelect`: the final selection plus the economics behind it. */
export interface CacheAwareResult {
  /** The selection to use: either the fresh pick or the warm lease. */
  selection: Selection;
  /** Which rule decided the outcome. */
  cacheReason: CacheReason;
  /** Estimated USD cost of switching away from the lease; only set once a switch was actually evaluated. */
  taxUsd?: number;
  /** Estimated USD saved by switching down; only set for the two downgrade outcomes. */
  expectedSavingsUsd?: number;
}
195
+
196
/**
 * Default willingness per source. The cost axis differs by source — AA is list price ($/1M tokens,
 * ~0.5–20), Ramp is measured cost per task ($, ~0.09–2.7) — so the $/quality-point budgets are on
 * different scales and must not be shared. `loadConfig` picks the table matching `capabilitySource`
 * unless the user sets `willingness` explicitly.
 */
// Budgets for the `aa` source, in list-price $/1M per quality point.
export const AA_WILLINGNESS: Record<Hardness, number> = { trivial: 0.1, normal: 0.4, hard: 1.0, max: Infinity };
// Budgets for the `ramp` source, in cost-per-task $ per quality point.
export const RAMP_WILLINGNESS: Record<Hardness, number> = { trivial: 0.02, normal: 0.06, hard: 0.2, max: Infinity };
204
+
205
/**
 * Built-in router configuration, also the default `cfg` argument of `buildAutoPool` and `resolveModel`.
 * `willingness` and `quota` hold the shared constant objects themselves, not copies.
 */
export const DEFAULT_CONFIG: RouterConfig = {
  capabilitySource: "ramp",
  threshold: 0.45,
  // The five signal weights sum to 1, so `classify` stays in [0, 1] before clamping.
  weights: {
    contextTokens: 0.25,
    lastUserLen: 0.15,
    keyword: 0.35,
    reasoning: 0.15,
    toolDensity: 0.1,
  },
  log: false,
  tierModels: {},
  // Empty include/exclude lists: every supported model is eligible.
  modelFilter: { include: [], exclude: [] },
  modelOverrides: {},
  forceStrongOnHighReasoning: false,
  // Matches the default `ramp` capability source.
  willingness: RAMP_WILLINGNESS,
  cacheAware: {
    enabled: true,
    downgradeMarginUsd: 0.001,
    upgradeQualityMargin: 3,
    upgradeTaxPenaltyScaleUsd: 0.02,
    minTurnsBetweenSwitches: 1,
  },
  quota: DEFAULT_QUOTA_CONFIG,
};
230
+
231
/**
 * Reduce a model key to its comparable form: lower-cased, with everything up to the last "/"
 * dropped, surrounding whitespace trimmed, and a trailing reasoning-effort suffix —
 * "(high)", "(medium)" or "(low)" — removed.
 *
 * @param key Model key such as `provider/id` or a bare model id.
 * @returns The normalized id used for canonical and override lookups.
 */
export function normalizeModelKey(key: string): string {
  const lowered = key.toLowerCase();
  const segments = lowered.split("/");
  // `split` always yields at least one segment, so the fallback only satisfies the type checker.
  const lastSegment = segments[segments.length - 1] ?? lowered;
  const effortSuffix = /\s*\((?:high|medium|low)\)\s*$/i;
  return lastSegment.trim().replace(effortSuffix, "");
}
235
+
236
/**
 * Resolve-rate at/above which a Ramp model is shown as a frontier/strong-pool candidate (display only).
 * Compared against the Ramp resolve-rate, so it is on that percentage scale.
 */
const RAMP_FRONTIER_RESOLVE = 75;
238
+
239
/**
 * Bucket a Ramp cost-per-task figure into a cost tier: below 0.4 is `cheap`, up to and including
 * 1.2 is `standard`, anything else is `premium`.
 */
function rampCostTier(costPerTask: number): CostTier {
  const isCheap = costPerTask < 0.4;
  const isStandard = !isCheap && costPerTask <= 1.2;
  return isCheap ? "cheap" : isStandard ? "standard" : "premium";
}
244
+
245
/**
 * Match a model key against the canonical table and return its capability/cost numbers for the
 * chosen source.
 *
 * The key is normalized first; the canonical entry whose key is the longest substring of the
 * normalized key wins. Under `ramp` the numbers come from the Ramp result for that entry (and the
 * model is unsupported when Ramp has none); under `aa` they come from the canonical entry itself.
 *
 * @param key Model key, typically `provider/id`.
 * @param source Capability source to read numbers from; defaults to `"ramp"`.
 * @returns The resolution, with fallback numbers and `supported: false` when no data exists.
 */
export function resolveCanonicalModel(key: string, source: CapabilitySource = "ramp"): CanonicalResolution {
  const normalized = normalizeModelKey(key);

  // Longest matching canonical key wins; on equal length the earlier table entry is kept.
  let canonical: (typeof CANONICAL_MODELS)[number] | undefined;
  for (const entry of CANONICAL_MODELS) {
    if (!normalized.includes(entry.key)) continue;
    if (canonical === undefined || entry.key.length > canonical.key.length) canonical = entry;
  }

  if (!canonical) {
    return {
      canonical: null,
      costTier: "unknown",
      profiles: ["balanced"],
      frontier: false,
      intelligence: FALLBACK_INTELLIGENCE,
      priceBlended: FALLBACK_PRICE,
      supported: false,
      confidence: "low",
      reason: "no canonical match",
    };
  }

  if (source !== "ramp") {
    // `aa`: every number comes straight from the canonical entry.
    return {
      canonical,
      costTier: canonical.costTier,
      profiles: canonical.profiles,
      frontier: canonical.frontier,
      intelligence: canonical.intelligence,
      priceBlended: canonical.priceBlended,
      scores: canonical.scores,
      tps: canonical.tps,
      supported: true,
      confidence: "high",
      reason: `canonical match: ${canonical.key}`,
    };
  }

  const ramp = findRampModel(canonical.key);
  if (!ramp) {
    // Canonical name is known, but Ramp never measured it — unsupported for auto-routing under `ramp`.
    return {
      canonical,
      costTier: canonical.costTier,
      profiles: canonical.profiles,
      frontier: false,
      intelligence: FALLBACK_INTELLIGENCE,
      priceBlended: FALLBACK_PRICE,
      supported: false,
      confidence: "low",
      reason: `no Ramp result for ${canonical.key}`,
    };
  }

  // One real outcome (resolve-rate) is the axis for every profile; mirror it into the per-profile
  // scores. `agentic` is stored as a fraction because `axisValue` multiplies it by 100.
  const scores: CanonicalScores = { coding: ramp.resolveRate, agentic: ramp.resolveRate / 100 };
  return {
    canonical,
    costTier: rampCostTier(ramp.costPerTask),
    profiles: canonical.profiles,
    frontier: ramp.resolveRate >= RAMP_FRONTIER_RESOLVE,
    intelligence: ramp.resolveRate,
    priceBlended: ramp.costPerTask,
    scores,
    tps: undefined,
    supported: true,
    confidence: "high",
    reason: `Ramp: ${canonical.key} ${ramp.resolveRate}%@$${ramp.costPerTask}`,
  };
}
312
+
313
/** Stable identifier of a registry model: its provider and id joined as `provider/id`. */
export function modelKey(model: Model<Api>): string {
  const { provider, id } = model;
  return `${provider}/${id}`;
}
316
+
317
/**
 * Build the auto-routing pool from the registry's models.
 *
 * A model is kept when it is not the router's own provider (`pi-router`), accepts text input,
 * resolves to a supported model, and passes the configured model filter. The survivors are sorted
 * with `compareResolvedModels` and partitioned by cost tier and frontier flag.
 *
 * @param models Registry models to consider.
 * @param cfg Router configuration; defaults to `DEFAULT_CONFIG`.
 * @returns The pool with its partitions.
 */
export function buildAutoPool(models: Model<Api>[], cfg: RouterConfig = DEFAULT_CONFIG): Pool {
  const all: ResolvedModel[] = [];
  for (const model of models) {
    if (model.provider === "pi-router") continue;
    if (!model.input?.includes("text")) continue;

    const resolved = resolveModel(model, cfg);
    // A model the active source has no data for (and no override) is not auto-routed.
    if (!resolved.supported) continue;
    if (!matchesModelFilter(resolved, cfg.modelFilter)) continue;

    all.push(resolved);
  }
  all.sort(compareResolvedModels);

  const inTier = (tier: CostTier): ResolvedModel[] => all.filter((entry) => entry.costTier === tier);
  return {
    cheapPool: inTier("cheap"),
    standardPool: inTier("standard"),
    strongPool: all.filter((entry) => entry.frontier || entry.costTier === "premium"),
    unknownPool: inTier("unknown"),
    all,
  };
}
335
+
336
/**
 * Join a registry model with its routing metadata: the canonical resolution for the active
 * capability source, overlaid with any matching user override.
 *
 * Base price precedence is the same with or without an override:
 *   1. the override's explicit `priceBlended`, when given;
 *   2. the source's own price, when the source has data for the model;
 *   3. the registry list price, then the fallback price.
 * Keeping step 2 ahead of the registry price matters under `ramp`: the cost axis is cost per task,
 * so an override that only sets `costCoef` must scale the Ramp figure, not a $/1M list price.
 *
 * @param model Registry model to resolve.
 * @param cfg Router configuration; defaults to `DEFAULT_CONFIG`.
 * @returns The resolved model; `supported` is always true when an override applies.
 */
export function resolveModel(model: Model<Api>, cfg: RouterConfig = DEFAULT_CONFIG): ResolvedModel {
  const key = modelKey(model);
  const resolution = resolveCanonicalModel(key, cfg.capabilitySource);
  const override = findModelOverride(cfg, key, resolution.canonical?.key ?? null);
  // The base shadow-price coefficient folds the caller's real economics into the shared frontier: it
  // scales whatever base cost the source/override resolved, staying dimensionless so the axis keeps one
  // unit. Time-of-day windows are NOT folded here — they are re-applied per turn in repriceForTimeOfDay
  // so the clock can cross a window boundary mid-session without a pool rebuild.
  const coef = override?.costCoef ?? 1;
  const acceptsImage = model.input?.includes("image") ?? false;

  // Price in the active source's own unit; the registry list price is used only when the source has no data.
  const sourceBase = resolution.supported
    ? resolution.priceBlended
    : (blendedPriceFromCost(model) ?? resolution.priceBlended);

  if (override) {
    const base = override.priceBlended ?? sourceBase;
    return {
      model,
      acceptsImage,
      canonicalKey: override.canonical ?? resolution.canonical?.key ?? normalizeModelKey(key),
      costTier: override.costTier ?? resolution.costTier,
      profiles: override.profiles ?? resolution.profiles,
      frontier: override.frontier ?? resolution.frontier,
      intelligence: override.intelligence ?? resolution.intelligence,
      priceBlended: base * coef,
      scores: override.scores ?? resolution.scores,
      tps: override.tps ?? resolution.tps,
      // An explicit override always makes the model routable, even when the active source lacks data.
      supported: true,
      confidence: resolution.canonical ? "medium" : "high",
      matchReason: resolution.canonical
        ? `user override + ${resolution.reason}`
        : "user override for unknown model",
      costCoefHours: override.costCoefHours,
    };
  }

  return {
    model,
    acceptsImage,
    canonicalKey: resolution.canonical?.key ?? null,
    costTier: resolution.costTier,
    profiles: resolution.profiles,
    frontier: resolution.frontier,
    intelligence: resolution.intelligence,
    priceBlended: sourceBase * coef,
    scores: resolution.scores,
    tps: resolution.tps,
    supported: resolution.supported,
    confidence: resolution.confidence,
    matchReason: resolution.reason,
  };
}
386
+
387
/**
 * Combined time-of-day price factor at `nowHour`.
 *
 * @param windows The model's price windows, if any.
 * @param nowHour Current hour on the same 24h scale as the windows.
 * @returns The product of the factors of every window containing `nowHour`; 1 when none apply.
 */
export function timeCostMultiplier(windows: CostCoefWindow[] | undefined, nowHour: number): number {
  if (!windows) return 1;
  return windows.reduce(
    (product, window) => (hourInRange(nowHour, window.hours[0], window.hours[1]) ? product * window.factor : product),
    1,
  );
}
396
+
397
/**
 * Produce a pool whose prices reflect the time-of-day windows active at `nowHour`.
 *
 * The input pool holds time-neutral prices and is not modified: a model with an active window is
 * copied with its price multiplied, while a model whose multiplier is exactly 1 is passed through
 * as the same object. Meant to be called once per user turn, where the clock is read, so a window
 * starts and stops applying as time passes without a pool rebuild, and prices stay stable across
 * that turn's tool continuations.
 *
 * @param pool Pool with time-neutral prices.
 * @param nowHour Current hour on the same 24h scale as the windows.
 * @returns A new pool object with every partition repriced.
 */
export function repriceForTimeOfDay(pool: Pool, nowHour: number): Pool {
  function withTimePrice(entry: ResolvedModel): ResolvedModel {
    const factor = timeCostMultiplier(entry.costCoefHours, nowHour);
    if (factor === 1) return entry;
    return { ...entry, priceBlended: entry.priceBlended * factor };
  }
  const repriceAll = (entries: ResolvedModel[]): ResolvedModel[] => entries.map((entry) => withTimePrice(entry));

  return {
    cheapPool: repriceAll(pool.cheapPool),
    standardPool: repriceAll(pool.standardPool),
    strongPool: repriceAll(pool.strongPool),
    unknownPool: repriceAll(pool.unknownPool),
    all: repriceAll(pool.all),
  };
}
417
+
418
/**
 * Whether `hour` lies in the half-open window [start, end). When `start` is greater than `end` the
 * window wraps past midnight; when they are equal the window is empty.
 */
function hourInRange(hour: number, start: number, end: number): boolean {
  const atOrAfterStart = hour >= start;
  const beforeEnd = hour < end;
  if (start <= end) return atOrAfterStart && beforeEnd;
  // Wrapping window: either the late part (from start) or the early part (before end).
  return atOrAfterStart || beforeEnd;
}
422
+
423
/**
 * Best-effort list price ($/1M tokens) from the registry's cost fields, weighting input to output 3:1.
 *
 * @returns The blended price, or `undefined` when neither input nor output cost is positive.
 */
function blendedPriceFromCost(model: Model<Api>): number | undefined {
  const cost = model.cost;
  const inputPrice = cost?.input ?? 0;
  const outputPrice = cost?.output ?? 0;

  const noPrice = inputPrice <= 0 && outputPrice <= 0;
  if (noPrice) return undefined;

  const blended = (inputPrice * 3 + outputPrice) / 4;
  // Registries usually express cost per token; a tiny value is taken to be per-token and scaled
  // to per-1M, while a larger one is assumed to be per-1M already.
  if (blended < 0.001) return blended * 1_000_000;
  return blended;
}
432
+
433
/**
 * Find the user override that applies to a model, if any.
 *
 * Keys are tried from most to least specific, and the first hit wins:
 *   1. the full `provider/id` key, as given and lower-cased;
 *   2. the normalized model id (see `normalizeModelKey`);
 *   3. the raw model id (everything after the provider), as given and lower-cased;
 *   4. the canonical key the model resolved to.
 * Only the config object's own keys count, so a model id such as "constructor" can never pick up
 * a member inherited from `Object.prototype`.
 *
 * @param cfg Router configuration holding `modelOverrides`.
 * @param key Full model key, `provider/id`.
 * @param canonicalKey Canonical key the model resolved to, or `null`.
 * @returns The matching override, or `undefined` when none applies.
 */
export function findModelOverride(
  cfg: RouterConfig,
  key: string,
  canonicalKey: string | null,
): ModelOverride | undefined {
  const overrides = cfg.modelOverrides;
  // The provider is the first path segment; the model id itself may contain further slashes.
  const slash = key.indexOf("/");
  const id = slash >= 0 ? key.slice(slash + 1) : key;

  const candidates = [key, key.toLowerCase(), normalizeModelKey(key), id, id.toLowerCase(), canonicalKey];
  for (const candidate of candidates) {
    if (!candidate) continue;
    if (!Object.prototype.hasOwnProperty.call(overrides, candidate)) continue;
    const override = overrides[candidate];
    if (override) return override;
  }
  return undefined;
}
445
+
446
/**
 * Whether a resolved model passes the include/exclude filter.
 *
 * Patterns are trimmed and lower-cased, and blank ones are ignored. A model matching any exclude
 * pattern is rejected; otherwise it passes when the include list is empty or any include pattern
 * matches. Matching is by substring against the model's identifying text.
 */
export function matchesModelFilter(item: ResolvedModel, filter: ModelFilter): boolean {
  const usable = (patterns: string[]): string[] =>
    patterns.map((pattern) => normalizeFilterPattern(pattern)).filter((pattern) => pattern.length > 0);

  const included = usable(filter.include);
  const excluded = usable(filter.exclude);
  const haystack = modelFilterHaystack(item);
  const matchesAny = (patterns: string[]): boolean => patterns.some((pattern) => haystack.includes(pattern));

  // Exclusion wins over inclusion.
  if (matchesAny(excluded)) return false;
  return included.length === 0 || matchesAny(included);
}
455
+
456
/** Canonical form of a filter pattern: surrounding whitespace removed, lower-cased. */
function normalizeFilterPattern(pattern: string): string {
  const trimmed = pattern.trim();
  return trimmed.toLowerCase();
}
459
+
460
/**
 * Lower-cased text that filter patterns are matched against: the full key, provider, id, display
 * name, canonical key and normalized key, one per line. Missing fields are left out.
 */
function modelFilterHaystack(item: ResolvedModel): string {
  const fullKey = modelKey(item.model);
  const fields = [
    fullKey,
    item.model.provider,
    item.model.id,
    item.model.name,
    item.canonicalKey,
    normalizeModelKey(fullKey),
  ];
  const present = fields.filter((field) => Boolean(field));
  // Newline separators keep a pattern from matching across two adjacent fields.
  return present.join("\n").toLowerCase();
}
473
+
474
/**
 * Decide how hard this turn is and which class of model it should get.
 *
 * A forced model or tier short-circuits classification. Otherwise the hardness score from
 * `classify` is bucketed by `autoHardnessBucket`, and buckets 2 and above count as `strong`.
 *
 * @param context Conversation so far.
 * @param options Stream options; the reasoning level feeds the score and the hardness floor.
 * @param forced Explicit user routing, if any: a tier or a specific model.
 * @param cfg Router configuration (signal weights).
 * @returns The decision; `chosen` is only filled for a forced model.
 */
export function decide(
  context: Context,
  options: SimpleStreamOptions | undefined,
  forced: { tier: Tier } | { model: string } | undefined,
  cfg: RouterConfig,
): Decision {
  if (forced) {
    if ("model" in forced) {
      return { cls: "model", score: 1, chosen: forced.model, hardnessBucket: 3, reason: "forced model" };
    }
    if ("tier" in forced) {
      // @cheap means "cheapest acceptable"; @strong means "the strong end".
      const wantsStrong = forced.tier === "strong";
      return {
        cls: forced.tier,
        score: wantsStrong ? 1 : 0,
        chosen: "",
        hardnessBucket: wantsStrong ? 3 : 0,
        requestedProfile: inferRequestedProfile(context),
        reason: "forced",
      };
    }
  }

  const score = classify(context, options, cfg);
  const hardnessBucket = autoHardnessBucket(score, options);
  const cls: Tier = hardnessBucket >= 2 ? "strong" : "cheap";
  return {
    cls,
    score,
    chosen: "",
    hardnessBucket,
    requestedProfile: inferRequestedProfile(context),
  };
}
503
+
504
/**
 * Hardness bucket (an index into HARDNESS_ORDER) for auto mode.
 *
 * The score is cut at 0.30, 0.52 and 0.74 into buckets 0–3. The requested reasoning level then acts
 * as a floor: it can raise the bucket but never lower it. Using four buckets instead of a binary
 * cheap/strong split keeps the middle of the frontier reachable.
 *
 * @param score Hardness score from `classify`.
 * @param options Stream options carrying the reasoning level.
 * @returns The bucket, 0 to 3.
 */
export function autoHardnessBucket(score: number, options: SimpleStreamOptions | undefined): number {
  const upperBounds = [0.30, 0.52, 0.74];
  const firstAbove = upperBounds.findIndex((bound) => score < bound);
  // A score below none of the bounds lands in the top bucket.
  const scoreBucket = firstAbove === -1 ? upperBounds.length : firstAbove;
  const floor = reasoningFloorBucket(options?.reasoning);
  return Math.max(scoreBucket, floor);
}
514
+
515
/**
 * Minimum hardness bucket guaranteed by a reasoning level: `medium` → 1, `high` → 2, `xhigh` → 3,
 * anything else (off, low, unset) → 0.
 */
function reasoningFloorBucket(reasoning: SimpleStreamOptions["reasoning"] | undefined): number {
  if (reasoning === "xhigh") return 3;
  if (reasoning === "high") return 2;
  if (reasoning === "medium") return 1;
  return 0; // off / low
}
527
+
528
/**
 * Score how hard the current turn looks, from 0 (trivial) to 1 (hardest).
 *
 * Five signals are each scaled to [0, 1] and combined with the configured weights: estimated
 * context size (8k–120k tokens), length of the last user message (120–1200 characters), its keyword
 * score, whether medium-or-higher reasoning was requested, and recent tool-result density
 * (saturating at 8 results). The weighted sum is clamped to [0, 1].
 *
 * @param context Conversation so far.
 * @param options Stream options carrying the reasoning level.
 * @param cfg Router configuration (signal weights).
 * @returns The clamped hardness score.
 */
export function classify(context: Context, options: SimpleStreamOptions | undefined, cfg: RouterConfig): number {
  const { weights } = cfg;
  const text = lastUserText(context).toLowerCase();
  const contextTokens = estimateContextTokens(context);
  const level = options?.reasoning;
  const reasoning = level && ["medium", "high", "xhigh"].includes(level) ? 1 : 0;
  const toolDensity = Math.min(1, countRecentToolResults(context) / 8);

  const contextTerm = normalize(contextTokens, 8_000, 120_000) * weights.contextTokens;
  const lengthTerm = normalize(text.length, 120, 1_200) * weights.lastUserLen;
  const keywordTerm = keywordScore(text) * weights.keyword;
  const reasoningTerm = reasoning * weights.reasoning;
  const toolTerm = toolDensity * weights.toolDensity;

  const raw = contextTerm + lengthTerm + keywordTerm + reasoningTerm + toolTerm;
  return Math.max(0, Math.min(1, raw));
}
543
+
544
/**
 * Guess which model profile the request calls for.
 *
 * An image anywhere in the context means `vision`. Otherwise the last user message is checked
 * against keyword lists in priority order — `deep`, then `fast`, then `coder` — and the first list
 * with a whole-word hit wins. With no hit the profile is `balanced`.
 */
export function inferRequestedProfile(context: Context): ModelProfile {
  if (contextHasImage(context)) return "vision";

  const text = lastUserText(context).toLowerCase();
  // Ordered by priority: the first matching rule decides.
  const rules: Array<[RegExp, ModelProfile]> = [
    [/\b(root cause|debug|architecture|design|plan|race condition|concurrency)\b/, "deep"],
    [/\b(fast|quick|high frequency|low latency|speed)\b/, "fast"],
    [/\b(code|coding|refactor|multi-file|implement|test|typescript|elixir|ruby|go)\b/, "coder"],
  ];
  for (const [pattern, profile] of rules) {
    if (pattern.test(text)) return profile;
  }
  return "balanced";
}
559
+
560
/**
 * Minimum intelligence a `fast`-profile pick must clear before maximizing throughput.
 * Compared against `ResolvedModel.intelligence`; when no eligible model clears it, the gate is dropped.
 */
const FAST_MIN_INTELLIGENCE = 33;
562
+
563
/**
 * Quality of a model on the axis that matters for `profile`.
 *
 * `coder` uses the coding score and `deep` uses the agentic score scaled by 100. When a model lacks
 * the native score, it is approximated from the intelligence index plus a fixed offset (15 and 24
 * respectively) to keep the scales comparable. Every other profile uses the intelligence index.
 */
export function axisValue(item: ResolvedModel, profile: ModelProfile): number {
  switch (profile) {
    case "coder":
      return item.scores?.coding ?? item.intelligence + 15;
    case "deep": {
      const agentic = item.scores?.agentic;
      return agentic != null ? agentic * 100 : item.intelligence + 24;
    }
    default:
      return item.intelligence; // balanced / vision / fallback
  }
}
569
+
570
/**
 * Models that pass the hard constraints (vision + context window) for this request.
 *
 * A model with no declared context window is treated as unlimited. When the estimated context fits
 * no model at all, the request is not refused: the candidates are narrowed to the model(s) with the
 * largest context window — the ones with the best chance of coping — and `overflow` is set so the
 * caller can flag it.
 *
 * @param pool The routable models.
 * @param context Conversation so far.
 * @returns The eligible models and whether the context overflows all of them.
 * @throws Error when the request contains an image and no model accepts images.
 */
function eligibleModels(pool: Pool, context: Context): { eligible: ResolvedModel[]; overflow: boolean } {
  const needsImage = contextHasImage(context);
  const tokens = estimateContextTokens(context);
  const visionOk = pool.all.filter((item) => !needsImage || item.acceptsImage);

  if (needsImage && visionOk.length === 0) {
    throw new Error("Pi Router: no vision-capable authenticated model for an image request.");
  }

  const withinWindow = visionOk.filter((item) => !item.model.contextWindow || tokens <= item.model.contextWindow);
  if (withinWindow.length > 0) return { eligible: withinWindow, overflow: false };

  // Window too tight everywhere: try anyway on the largest window rather than refuse outright.
  // Every candidate here has a declared window (an undeclared one would have passed above).
  const largestWindow = visionOk.reduce((max, item) => Math.max(max, item.model.contextWindow ?? 0), 0);
  const roomiest = visionOk.filter((item) => (item.model.contextWindow ?? 0) === largestWindow);
  return { eligible: roomiest, overflow: true };
}
584
+
585
/**
 * Capability Pareto frontier on (quality, price) for the given profile.
 *
 * A model is dropped when some other model is at least as capable and no more expensive, and
 * strictly better on one of the two. Models tied on both axes are all kept. Input order is preserved.
 *
 * @param items Candidate models.
 * @param profile Profile whose quality axis is used (see `axisValue`).
 * @returns The non-dominated models.
 */
export function paretoFrontier(items: ResolvedModel[], profile: ModelProfile): ResolvedModel[] {
  const scored = items.map((item) => ({ item, quality: axisValue(item, profile), price: item.priceBlended }));

  const frontier: ResolvedModel[] = [];
  for (const candidate of scored) {
    let dominated = false;
    for (const rival of scored) {
      if (rival.item === candidate.item) continue;
      const noWorse = rival.quality >= candidate.quality && rival.price <= candidate.price;
      const strictlyBetter = rival.quality > candidate.quality || rival.price < candidate.price;
      if (noWorse && strictlyBetter) {
        dominated = true;
        break;
      }
    }
    if (!dominated) frontier.push(candidate.item);
  }
  return frontier;
}
600
+
601
/**
 * The Pareto frontier as an ordered chain of operating points, weakest and cheapest first.
 *
 * Points are sorted by quality, then price, then model key. Models sharing the same quality and
 * price collapse to the first one in that order, so the result is deterministic.
 *
 * @param items Candidate models.
 * @param profile Profile whose quality axis is used (see `axisValue`).
 * @returns The chain to climb, one model per distinct (quality, price) point.
 */
export function frontierChain(items: ResolvedModel[], profile: ModelProfile): ResolvedModel[] {
  const quality = (item: ResolvedModel): number => axisValue(item, profile);

  const ordered = [...paretoFrontier(items, profile)];
  ordered.sort((left, right) => {
    const byQuality = quality(left) - quality(right);
    if (byQuality) return byQuality;
    const byPrice = left.priceBlended - right.priceBlended;
    if (byPrice) return byPrice;
    return modelKey(left.model).localeCompare(modelKey(right.model));
  });

  const chain: ResolvedModel[] = [];
  for (const item of ordered) {
    const last = chain.length > 0 ? chain[chain.length - 1] : undefined;
    const samePoint = Boolean(last) && quality(last!) === quality(item) && last!.priceBlended === item.priceBlended;
    if (!samePoint) chain.push(item);
  }
  return chain;
}
620
+
621
/**
 * Walk the frontier chain upward from its first point, moving to the next point as long as the
 * marginal price per quality point, measured from the current pick, stays within `willingness`.
 * The walk stops at the first step that is too steep; later points are not considered.
 *
 * @param chain Frontier chain, weakest and cheapest first.
 * @param profile Profile whose quality axis is used.
 * @param willingness Maximum price to pay per extra quality point.
 * @returns The last point reached (the first point when no step is affordable).
 */
function climbFrontier(chain: ResolvedModel[], profile: ModelProfile, willingness: number): ResolvedModel {
  let current = chain[0];
  for (const next of chain.slice(1)) {
    const qualityGain = axisValue(next, profile) - axisValue(current, profile);
    const extraCost = next.priceBlended - current.priceBlended;
    const tooSteep = qualityGain > 0 && extraCost / qualityGain > willingness;
    if (tooSteep) break;
    current = next;
  }
  return current;
}
632
+
633
/**
 * Pick the model for this turn from the pool.
 *
 * The profile is inferred from the context and the pool is narrowed to the models that meet the hard
 * constraints. For the `fast` profile the pick is the highest-throughput model above a low capability
 * floor. For every other profile the pick is found by climbing the Pareto frontier as far as the
 * willingness budget of the turn's hardness allows.
 *
 * @param decision Routing decision carrying the hardness bucket.
 * @param pool The routable models.
 * @param context Conversation so far.
 * @param options Stream options carrying the reasoning level.
 * @param cfg Router configuration (willingness table, force-strong switch).
 * @returns The selection, or `undefined` when no model is eligible.
 */
export function selectFromPool(
  decision: Decision,
  pool: Pool,
  context: Context,
  options: SimpleStreamOptions | undefined,
  cfg: RouterConfig,
): Selection | undefined {
  const profile = inferRequestedProfile(context);
  const { eligible, overflow } = eligibleModels(pool, context);
  if (eligible.length === 0) return undefined;

  // High reasoning can be configured to always mean the top hardness level.
  const topBucket = HARDNESS_ORDER.length - 1;
  const highReasoning = options?.reasoning === "high" || options?.reasoning === "xhigh";
  const bucket = cfg.forceStrongOnHighReasoning && highReasoning ? topBucket : decision.hardnessBucket;
  const hardness = HARDNESS_ORDER[Math.max(0, Math.min(topBucket, bucket))];

  // `fast` is orthogonal: gate on a low capability floor, then maximize throughput.
  if (profile === "fast") {
    const capable = eligible.filter((item) => item.intelligence >= FAST_MIN_INTELLIGENCE);
    const candidates = capable.length > 0 ? capable : eligible;
    const byThroughput = (a: ResolvedModel, b: ResolvedModel): number =>
      (b.tps ?? 0) - (a.tps ?? 0) ||
      a.priceBlended - b.priceBlended ||
      b.intelligence - a.intelligence ||
      modelKey(a.model).localeCompare(modelKey(b.model));
    const fastest = [...candidates].sort(byThroughput)[0];
    return buildSelection(fastest, eligible, profile, `fast: top throughput${overflowNote(overflow)}`);
  }

  // Climb the capability frontier as far as the hardness budget allows.
  const chain = frontierChain(eligible, profile);
  const willingness = cfg.willingness[hardness];
  const selected = climbFrontier(chain, profile, willingness);

  const budget = willingness === Infinity ? "∞" : willingness.toString();
  const quality = axisValue(selected, profile).toFixed(0);
  const reason = `${hardness}/${profile} w≤$${budget}/pt → ${quality}@$${selected.priceBlended}${overflowNote(overflow)}`;
  return buildSelection(selected, chain, profile, reason);
}
672
+
673
/**
 * Assemble a `Selection`: the pick, its profile and reason, and the keys of the other candidates
 * (everything in `frontier` except the pick itself) as alternatives.
 */
function buildSelection(
  selected: ResolvedModel,
  frontier: ResolvedModel[],
  profile: ModelProfile,
  reason: string,
): Selection {
  const alternatives: string[] = [];
  for (const candidate of frontier) {
    if (candidate !== selected) alternatives.push(modelKey(candidate.model));
  }
  return { selected, profile, reason, alternatives };
}
686
+
687
/** Suffix appended to a selection reason when the context may not fit the chosen model; empty otherwise. */
function overflowNote(overflow: boolean): string {
  if (overflow) return "; context may overflow";
  return "";
}
690
+
691
+ // ── Cross-turn cache-aware stickiness ────────────────────────────────────────
692
+ // Layered on top of the Pareto pick. The Pareto pass says which model best fits this turn's
693
+ // hardness; this pass asks whether a warm cache lease is worth keeping instead of switching to it.
694
+
695
/**
 * Fresh per-session routing memory: no lease, no recorded spend, and a last-switch turn of negative
 * infinity so the first switch is never blocked by the cooldown.
 */
export function createRoutingState(): RoutingState {
  const state: RoutingState = {
    lastSwitchTurn: -Infinity,
    observedCacheReadRatio: 0,
    realizedCostByModel: {},
  };
  return state;
}
698
+
699
/**
 * Number of user messages in the context. It only grows as the conversation does, so it serves as a
 * turn counter without needing harness turn hooks.
 */
export function userTurnIndex(context: Context): number {
  let turns = 0;
  for (const message of context.messages) {
    if (message.role === "user") turns += 1;
  }
  return turns;
}
703
+
704
/**
 * Normalize a registry cost field to USD per token. A value of 0.001 or more is taken to be a
 * per-1M price and divided down; a smaller value is assumed to be per-token already.
 */
function costPerTokenUsd(cost: number): number {
  const looksPerMillion = cost >= 0.001;
  return looksPerMillion ? cost / 1_000_000 : cost;
}
708
+
709
/**
 * Whether the leased model can serve this request at all. It must accept images when the context
 * contains one, and the context must fit its window — unless the fresh pick overflows too, in
 * which case no model fits and the lease is no worse.
 */
function leaseMeetsHardConstraints(leaseItem: ResolvedModel, fresh: Selection, context: Context): boolean {
  if (contextHasImage(context) && !leaseItem.acceptsImage) return false;
  const tokens = estimateContextTokens(context);
  const fits = (item: ResolvedModel): boolean => !item.model.contextWindow || tokens <= item.model.contextWindow;
  return fits(leaseItem) || !fits(fresh.selected);
}

/**
 * Given the fresh Pareto selection, decide whether to keep the warm lease instead.
 *
 * The fresh pick is returned when cache awareness is disabled, when there is no usable lease (none
 * held, the leased model is gone from the pool, or it cannot serve this request), when the fresh
 * pick already is the lease, or when an economic switch wins:
 *   - downgrade (fresh quality ≤ lease quality): expected savings must cover the switch tax plus
 *     the configured margin;
 *   - upgrade: the quality gain must clear the configured margin plus a penalty proportional to
 *     the switch tax.
 * During the switch cooldown the lease is always kept.
 *
 * @param fresh Selection produced by the Pareto pass for this turn.
 * @param state Per-session routing memory (lease, last switch turn, last usage).
 * @param pool The routable models; the lease is looked up here.
 * @param context Conversation so far.
 * @param cfg Router configuration (`cacheAware` settings).
 * @returns The selection to use, the rule that decided it, and the economics when evaluated.
 */
export function cacheAwareSelect(
  fresh: Selection,
  state: RoutingState,
  pool: Pool,
  context: Context,
  cfg: RouterConfig,
): CacheAwareResult {
  if (!cfg.cacheAware.enabled) return { selection: fresh, cacheReason: "disabled" };

  const lease = state.lease;
  const leaseItem = lease ? pool.all.find((item) => modelKey(item.model) === lease.modelKey) : undefined;
  // No warm lease, or the leased model is no longer eligible (deauthed / cooled down) → take the fresh pick.
  if (!lease || !leaseItem) return { selection: fresh, cacheReason: "no-lease" };
  if (modelKey(fresh.selected.model) === lease.modelKey) return { selection: fresh, cacheReason: "same-model" };
  // Stickiness must never override a hard constraint: a lease that cannot take this request's
  // image or context size is treated as no lease at all.
  if (!leaseMeetsHardConstraints(leaseItem, fresh, context)) return { selection: fresh, cacheReason: "no-lease" };

  const profile = fresh.profile;
  const stay = leaseSelection(leaseItem, fresh, profile);

  if (userTurnIndex(context) - state.lastSwitchTurn < cfg.cacheAware.minTurnsBetweenSwitches) {
    return { selection: { ...stay, reason: "cache-stay: switch cooldown" }, cacheReason: "switch-cooldown" };
  }

  // Prefer the real token count of the previous turn over the estimate.
  const contextTokens = state.lastUsage && state.lastUsage.totalTokens > 0 ? state.lastUsage.totalTokens : estimateContextTokens(context);
  const taxUsd = switchTaxUsd(contextTokens, lease, fresh.selected);
  const qLease = axisValue(leaseItem, profile);
  const qFresh = axisValue(fresh.selected, profile);

  if (qFresh <= qLease) {
    // Downgrade (or sideways move): only worth it when the savings pay for the switch.
    const expectedSavingsUsd = expectedDowngradeSavingsUsd(contextTokens, lease, fresh.selected, state);
    if (expectedSavingsUsd >= Math.max(0, taxUsd) + cfg.cacheAware.downgradeMarginUsd) {
      return {
        selection: { ...fresh, reason: `${fresh.reason}; downgrade saves ${formatUsd(expectedSavingsUsd)} > tax ${formatUsd(taxUsd)}` },
        cacheReason: "downgrade-break-even",
        taxUsd,
        expectedSavingsUsd,
      };
    }
    return {
      selection: { ...stay, reason: `cache-stay: downgrade saves ${formatUsd(expectedSavingsUsd)} < tax ${formatUsd(taxUsd)}` },
      cacheReason: "downgrade-not-worth-it",
      taxUsd,
      expectedSavingsUsd,
    };
  }

  // Upgrade: the switch tax is converted into extra quality points the gain must also cover.
  const gain = qFresh - qLease;
  const taxPenalty = Math.max(0, taxUsd) / Math.max(cfg.cacheAware.upgradeTaxPenaltyScaleUsd, 1e-6);
  if (gain >= cfg.cacheAware.upgradeQualityMargin + taxPenalty) {
    return {
      selection: { ...fresh, reason: `${fresh.reason}; upgrade +${gain.toFixed(0)}pt covers tax` },
      cacheReason: "upgrade-quality",
      taxUsd,
    };
  }
  return {
    selection: { ...stay, reason: `cache-stay: upgrade +${gain.toFixed(0)}pt below margin` },
    cacheReason: "upgrade-not-worth-it",
    taxUsd,
  };
}
773
+
774
/**
 * Build the selection that stays on the leased model. The fresh pick's key leads the alternatives,
 * followed by the fresh alternatives with the leased model's own key removed.
 */
function leaseSelection(leaseItem: ResolvedModel, fresh: Selection, profile: ModelProfile): Selection {
  const leasedKey = modelKey(leaseItem.model);
  const freshKey = modelKey(fresh.selected.model);
  const remaining = fresh.alternatives.filter((candidateKey) => candidateKey !== leasedKey);
  return {
    selected: leaseItem,
    profile,
    reason: "warm cache lease",
    alternatives: [freshKey, ...remaining],
  };
}
783
+
784
/**
 * Fold one turn's realized usage into the routing state: update the smoothed cache-read ratio, add
 * the turn's cost to the per-model total, remember the usage, and (re)write the lease for the model
 * that served the turn. `lastSwitchTurn` moves only when a lease existed for a different model.
 */
export function recordRoutingUsage(state: RoutingState, selected: ResolvedModel, usage: Usage, context: Context): void {
  const key = modelKey(selected.model);

  // Prompt-side tokens only; the ratio is the share of them served from cache.
  const promptTokens = usage.input + usage.cacheRead + usage.cacheWrite;
  const turnReadRatio = promptTokens > 0 ? usage.cacheRead / promptTokens : 0;
  state.observedCacheReadRatio = movingAverage(state.observedCacheReadRatio, turnReadRatio, 0.25);

  const spentSoFarUsd = state.realizedCostByModel[key]?.usd ?? 0;
  state.realizedCostByModel[key] = { usd: spentSoFarUsd + usage.cost.total };
  state.lastUsage = usage;

  const turn = userTurnIndex(context);
  const previousLease = state.lease;
  const sameModel = previousLease?.modelKey === key;
  if (previousLease && !sameModel) state.lastSwitchTurn = turn;

  const { provider, cost } = selected.model;
  state.lease = {
    modelKey: key,
    provider,
    cost: { input: cost.input, cacheRead: cost.cacheRead, cacheWrite: cost.cacheWrite },
    warmTokens: promptTokens,
    // A continuing lease keeps its original start turn; a new one starts now.
    establishedAtTurn: previousLease && sameModel ? previousLease.establishedAtTurn : turn,
    lastUsedTurn: turn,
  };
}
804
+
805
/**
 * Cost difference of switching: writing the context into the candidate's cache minus re-reading it
 * from the leased model's cache. The result may be negative.
 */
function switchTaxUsd(contextTokens: number, lease: CacheLease, candidate: ResolvedModel): number {
  const writeOnCandidateUsd = contextTokens * costPerTokenUsd(candidate.model.cost.cacheWrite);
  const readOnLeaseUsd = contextTokens * costPerTokenUsd(lease.cost.cacheRead);
  return writeOnCandidateUsd - readOnLeaseUsd;
}
811
+
812
/**
 * Expected saving from moving to a candidate with cheaper cache reads: the per-token read-price
 * difference (never negative) times the larger of the lease's warm tokens and the context tokens
 * scaled by the observed cache-read ratio.
 */
function expectedDowngradeSavingsUsd(contextTokens: number, lease: CacheLease, candidate: ResolvedModel, state: RoutingState): number {
  const leaseReadUsd = costPerTokenUsd(lease.cost.cacheRead);
  const candidateReadUsd = costPerTokenUsd(candidate.model.cost.cacheRead);
  const perTokenSavingUsd = Math.max(0, leaseReadUsd - candidateReadUsd);
  const tokensReadWarm = Math.max(contextTokens * state.observedCacheReadRatio, lease.warmTokens);
  return tokensReadWarm * perTokenSavingUsd;
}
818
+
819
/**
 * Exponentially weighted blend of `previous` and `next`, with `weight` applied to `next`.
 * A `previous` of exactly 0 is treated as unseeded and `next` is returned as-is.
 */
function movingAverage(previous: number, next: number, weight: number): number {
  if (previous === 0) return next;
  return previous * (1 - weight) + next * weight;
}
822
+
823
/** Render a dollar amount with a leading "$" and six decimal places. */
function formatUsd(value: number): string {
  const fixed = value.toFixed(6);
  return "$" + fixed;
}
826
+
827
/** True when any message with array content contains a part of type "image". */
export function contextHasImage(context: Context): boolean {
  for (const message of context.messages) {
    const { content } = message;
    if (!Array.isArray(content)) continue;
    for (const part of content) {
      if (part.type === "image") return true;
    }
  }
  return false;
}
832
+
833
/** Text of the most recent user message, or "" when the context has no user message. */
export function lastUserText(context: Context): string {
  const { messages } = context;
  let index = messages.length;
  // Walk backwards so the first user message found is the latest one.
  while (index-- > 0) {
    const candidate = messages[index];
    if (candidate.role === "user") return userMessageText(candidate);
  }
  return "";
}
841
+
842
/**
 * Key identifying the current routing turn: "<number of user messages>:<hash of the last user
 * message's text>". With no user messages the hash is taken over the empty string.
 */
export function routingTurnKey(context: Context): string {
  const summary = context.messages.reduce(
    (acc, message) => (message.role === "user" ? { turns: acc.turns + 1, text: userMessageText(message) } : acc),
    { turns: 0, text: "" },
  );
  return `${summary.turns}:${stableHash(summary.text)}`;
}
854
+
855
/** True when the final message in the context is a tool result. */
export function shouldReuseTurnSelection(context: Context): boolean {
  const { messages } = context;
  const tail = messages.length > 0 ? messages[messages.length - 1] : undefined;
  return tail?.role === "toolResult";
}
858
+
859
/**
 * Plain text of a user message: string content is returned as-is; array content yields its "text"
 * parts joined with newlines, ignoring every other part type.
 */
function userMessageText(message: Extract<Context["messages"][number], { role: "user" }>): string {
  const { content } = message;
  if (typeof content === "string") return content;

  const pieces: string[] = [];
  for (const part of content) {
    if (part.type === "text") pieces.push(part.text);
  }
  return pieces.join("\n");
}
866
+
867
/**
 * Deterministic 32-bit string hash (seed 5381; each UTF-16 code unit is mixed in as
 * `hash * 33 XOR unit`), returned as an unsigned base-36 string.
 */
function stableHash(text: string): string {
  let hash = 5381;
  let index = 0;
  while (index < text.length) {
    // Math.imul(hash, 33) is (hash << 5) + hash reduced to 32 bits, which is all the XOR keeps.
    hash = Math.imul(hash, 33) ^ text.charCodeAt(index);
    index += 1;
  }
  return (hash >>> 0).toString(36);
}
872
+
873
/**
 * Rough token estimate for a context: total characters divided by four, rounded up.
 *
 * Characters counted: the system prompt, string message content, text and thinking parts, and for
 * tool calls the JSON-serialized arguments plus the tool name. Any other part type counts as a flat
 * 1024 characters.
 */
export function estimateContextTokens(context: Context): number {
  let chars = context.systemPrompt?.length ?? 0;

  for (const message of context.messages) {
    if (typeof message.content === "string") {
      chars += message.content.length;
      continue;
    }

    for (const part of message.content) {
      switch (part.type) {
        case "text":
          chars += part.text.length;
          break;
        case "thinking":
          chars += part.thinking.length;
          break;
        case "toolCall":
          chars += JSON.stringify(part.arguments).length + part.name.length;
          break;
        default:
          // Parts without countable text get a fixed allowance.
          chars += 1024;
      }
    }
  }

  return Math.ceil(chars / 4);
}
888
+
889
/** Numeric rank of a cost tier: "cheap" 0, "standard" 1, "premium" 2, anything else 3. */
function resolveCostRank(tier: CostTier): number {
  switch (tier) {
    case "cheap":
      return 0;
    case "standard":
      return 1;
    case "premium":
      return 2;
    default:
      return 3;
  }
}
895
+
896
/** Comparator: lower cost rank first; ties broken by locale comparison of the model keys. */
function compareResolvedModels(a: ResolvedModel, b: ResolvedModel): number {
  const byCost = resolveCostRank(a.costTier) - resolveCostRank(b.costTier);
  if (byCost !== 0) return byCost;
  return modelKey(a.model).localeCompare(modelKey(b.model));
}
899
+
900
/** Number of tool-result messages among the last 12 messages of the context. */
function countRecentToolResults(context: Context): number {
  const recent = context.messages.slice(-12);
  return recent.reduce((total, message) => (message.role === "toolResult" ? total + 1 : total), 0);
}
903
+
904
/**
 * Keyword-based difficulty signal: 1 when the text contains a "strong" keyword, 0 when it contains
 * only a "cheap" keyword, 0.45 when neither matches. Matching is case-sensitive and whole-word.
 */
function keywordScore(text: string): number {
  // Strong keywords take precedence over cheap ones when both appear.
  if (/\b(architecture|design|debug|root cause|race condition|refactor|multi-file|security|performance|concurrency|plan)\b/.test(text)) {
    return 1;
  }
  if (/\b(format|lint|typo|rename|docs?|readme|translate|summari[sz]e|grep|search)\b/.test(text)) {
    return 0;
  }
  return 0.45;
}
911
+
912
/** Map `value` linearly from [low, high] onto [0, 1], clamping at both ends. */
function normalize(value: number, low: number, high: number): number {
  const atOrBelowFloor = value <= low;
  if (atOrBelowFloor) return 0;
  return value >= high ? 1 : (value - low) / (high - low);
}