@routerlab/core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,338 @@
1
+ // quality_predictor.ts — calibrated quality predictor for routerlab.
2
+ //
3
+ // --------------------------------------------------------------------------
4
+ // WHY THIS MODULE EXISTS
5
+ // --------------------------------------------------------------------------
6
+ // Phase 2 shipped a hardcoded per-(taskClass, model) quality prior in
7
+ // `quality_prior.ts`. That prior was a deliberate placeholder: routing
8
+ // decisions were "shaped right" but uncalibrated. Phase 3 replaces it with
9
+ // a real, data-driven predictor whose mean and 95% confidence interval are
10
+ // computed from the eval-harness measurements written to
11
+ // `eval/results/quality_table.json` by the `router-frontier` agent.
12
+ //
13
+ // Differentiation versus prior art:
14
+ // a calibrated pre-call quality estimator is not novel on its own —
15
+ // RouteLLM's matrix factorization, BEST-Route's difficulty heads, and
16
+ // cross-attention routers all do this. Our differentiation is reporting
17
+ // **explicit confidence intervals** (Wilson score, n-aware) so that a
18
+ // caller can route on a *lower-bound* quality estimate at high confidence
19
+ // rather than a point estimate, and pair that with atlas-grounded cost
20
+ // (see `cost.ts`) for a fully accountable routing trace.
21
+ //
22
+ // --------------------------------------------------------------------------
23
+ // DATA SOURCE PRECEDENCE
24
+ // --------------------------------------------------------------------------
25
+ // 1. `eval/results/quality_table.json` on disk if present (read once,
26
+ // memoized). Each cell carries `{ trials, successes }` from the eval
27
+ // harness. Mean is `successes/trials`; CI is Wilson score 95%.
28
+ // `n` is `trials`.
29
+ // 2. Otherwise the seeded prior table is used. The prior is
30
+ // treated as if it were a measurement with `PRIOR_N` synthetic
31
+ // trials so that the CI still has a defined shape. Callers can
32
+ // detect this from the `n` field — it equals `PRIOR_N` when the
33
+ // data is the fallback rather than a real measurement.
34
+ //
35
+ // --------------------------------------------------------------------------
36
+ // PURITY & DETERMINISM
37
+ // --------------------------------------------------------------------------
38
+ // All math is pure and deterministic. The only side effect is the one-time
39
+ // disk read on the first lookup (lazy, not at import time), memoized in `qualityTableCache`. Tests can
40
+ // reset that cache via `__resetQualityCacheForTest`.
41
+ import { readFileSync } from "node:fs";
42
+ import { dirname, resolve as resolvePath } from "node:path";
43
+ import { fileURLToPath } from "node:url";
44
+ import { __QUALITY_PRIOR_TABLE } from "./quality_prior.js";
45
+ // ---------------------------------------------------------------------------
46
+ // Constants
47
+ // ---------------------------------------------------------------------------
48
+ /**
49
+ * Z-score for a 95% confidence interval. Two-sided, normal approximation
+ * (i.e. the 97.5th percentile of the standard normal distribution).
50
+ * Hardcoded to avoid pulling in a stats dependency; the Wilson interval
51
+ * uses this value verbatim.
52
+ */
53
+ const Z_95 = 1.959963984540054;
54
+ /**
55
+ * Default quality when neither the measured table nor the prior carries
56
+ * an entry for (taskClass, model). Matches `quality_prior.ts`'s default
57
+ * so callers see consistent behaviour across the two paths.
58
+ */
59
+ const DEFAULT_QUALITY = 0.5;
60
+ /**
61
+ * Synthetic trial count assigned to a fallback-prior cell so that the CI
62
+ * is well-defined. Chosen at 10 trials, which yields a wide CI (~+/-0.26
63
+ * for p=0.5) — that's correct behaviour: the prior should be treated as
64
+ * weak evidence, and a caller routing on `lo95` will be appropriately
65
+ * conservative until real measurements land.
66
+ */
67
+ export const PRIOR_N = 10;
68
+ /**
69
+ * The default location of the eval-harness quality table on disk.
70
+ * Resolved repo-relative from this module's URL so the predictor works on
71
+ * any machine without hardcoded absolute paths. This file lives at
72
+ * `packages/core/src/quality_predictor.ts`; the quality table is at
73
+ * `eval/results/quality_table.json` (4 levels up to repo root, then down).
74
+ * Tests override via the env var below.
+ *
+ * NOTE(review): `dirname()` already strips the file name, so starting from
+ * `packages/core/src` three `..` segments reach the directory that holds
+ * `packages/`; the four segments used below resolve one level above that.
+ * This is only correct if `packages/` sits one directory below the root
+ * that contains `eval/` — confirm against the real repository layout. A
+ * wrong default is silent, because an unreadable path falls back to the
+ * prior table.
75
+ */
76
+ const DEFAULT_QUALITY_TABLE_PATH = resolvePath(dirname(fileURLToPath(import.meta.url)), "..", "..", "..", "..", "eval", "results", "quality_table.json");
77
+ /**
78
+ * Env var that overrides `DEFAULT_QUALITY_TABLE_PATH`. Tests set this to
79
+ * a tmp file; production callers can use it to ship custom calibrations.
+ * It is read by `resolveQualityTablePath` on each table load; an unset or
+ * empty value selects the default path.
80
+ */
81
+ const QUALITY_TABLE_PATH_ENV_VAR = "ROUTERLAB_QUALITY_TABLE_PATH";
82
+ // ---------------------------------------------------------------------------
83
+ // Wilson score interval (pure math)
84
+ // ---------------------------------------------------------------------------
85
/**
 * Compute the 95% Wilson score confidence interval of a binomial proportion.
 *
 * Reference: Wilson, E. B. (1927). "Probable inference, the law of
 * succession, and statistical inference." JASA 22(158): 209-212.
 *
 * Wilson is preferred over the Wald interval (p̂ ± z*sqrt(p̂(1-p̂)/n)) because
 * Wald degenerates to a zero-width interval at p̂ = 0 or p̂ = 1 and
 * under-covers at small n, while Wilson stays well-defined at the
 * boundaries and has better small-sample coverage (see Agresti & Coull,
 * 1998).
 *
 * @param successes Number of successful trials, expected in [0, trials].
 * @param trials Number of trials, expected to be positive.
 * @returns `{ lo, hi }`, both clamped to [0, 1].
 * @throws Error when either argument is not a finite number, when
 *   `trials` is not positive, or when `successes` lies outside
 *   [0, trials].
 */
export function wilsonScore95(successes, trials) {
    const bothFinite = Number.isFinite(successes) && Number.isFinite(trials);
    if (!bothFinite) {
        throw new Error("wilsonScore95: successes and trials must be finite numbers");
    }
    if (!(trials > 0)) {
        throw new Error("wilsonScore95: trials must be a positive integer");
    }
    if (!(successes >= 0 && successes <= trials)) {
        throw new Error("wilsonScore95: successes must be in [0, trials]");
    }
    const zSquared = Z_95 * Z_95;
    const observed = successes / trials;
    const scale = 1 + zSquared / trials;
    const midpoint = (observed + zSquared / (2 * trials)) / scale;
    const halfWidth = (Z_95 * Math.sqrt((observed * (1 - observed)) / trials + zSquared / (4 * trials * trials))) / scale;
    // Analytically the bounds touch 0 (no successes) and 1 (all successes)
    // exactly, but floating-point evaluation can miss by a few ulps. Pin
    // those two cases so callers can rely on exact 0/1 values.
    const lo = successes === 0 ? 0 : Math.max(0, midpoint - halfWidth);
    const hi = successes === trials ? 1 : Math.min(1, midpoint + halfWidth);
    return { lo, hi };
}
134
/**
 * The closed set of task-class keys the predictor understands. Iteration
 * order is the insertion order below.
 */
const TASK_CLASSES = new Set(["qa", "codegen", "summarization", "classification", "reasoning"]);
/**
 * Membership test for `TASK_CLASSES`.
 *
 * @param s Candidate key (any value; non-members simply yield `false`).
 * @returns `true` when `s` is one of the recognised task-class keys.
 */
function isTaskClass(s) {
    return TASK_CLASSES.has(s);
}
142
/**
 * Parse the raw text of a quality-table file into a quality table.
 *
 * Expected shape: `{ generated_at?: string, cells: { [modelId]: {
 * [taskClass]: { successes, trials } } } }`. Parsing is tolerant at the cell
 * level: cells with an unknown task class, a non-object value, or invalid
 * counts are dropped rather than failing the whole file. Counts are
 * truncated to whole numbers.
 *
 * @param path Location the text was read from; used in error messages and
 *   recorded as `loadedFrom` on a measured table.
 * @param raw The file contents.
 * @returns A table with `source: "measured"` when at least one usable cell
 *   was found, otherwise the result of `buildPriorTable()`.
 * @throws Error when `raw` is not valid JSON or its root is not an object.
 */
const parseQualityFile = (path, raw) => {
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (cause) {
        throw new Error(`quality_table.json at "${path}" is not valid JSON: ${cause instanceof Error ? cause.message : String(cause)}`);
    }
    if (parsed === null || typeof parsed !== "object") {
        throw new Error(`quality_table.json at "${path}" root must be an object`);
    }
    const file = parsed;
    const cellsIn = file.cells;
    if (cellsIn === undefined || typeof cellsIn !== "object" || cellsIn === null) {
        // Schema-compatible but empty: treat as "no measurements present"
        // and serve the prior instead.
        return buildPriorTable();
    }
    const cellsOut = {};
    for (const [modelId, perTask] of Object.entries(cellsIn)) {
        if (typeof perTask !== "object" || perTask === null)
            continue;
        const inner = {};
        for (const [taskKey, cell] of Object.entries(perTask)) {
            if (!isTaskClass(taskKey))
                continue;
            if (typeof cell !== "object" || cell === null)
                continue;
            const { successes: rawSuccesses, trials: rawTrials } = cell;
            if (typeof rawSuccesses !== "number" ||
                !Number.isFinite(rawSuccesses) ||
                rawSuccesses < 0) {
                continue;
            }
            if (typeof rawTrials !== "number" || !Number.isFinite(rawTrials) || rawTrials <= 0) {
                continue;
            }
            if (rawSuccesses > rawTrials)
                continue;
            const successes = Math.floor(rawSuccesses);
            const trials = Math.floor(rawTrials);
            // A fractional trial count in (0, 1) truncates to 0. Storing
            // such a cell would make every later lookup divide by zero and
            // throw inside the Wilson interval, so drop it here.
            if (trials < 1)
                continue;
            inner[taskKey] = { successes, trials };
        }
        if (Object.keys(inner).length > 0) {
            cellsOut[modelId] = inner;
        }
    }
    if (Object.keys(cellsOut).length === 0) {
        // File parsed but produced no usable cells — same fallback as missing.
        return buildPriorTable();
    }
    return {
        source: "measured",
        cells: cellsOut,
        loadedFrom: path,
        ...(typeof file.generated_at === "string" ? { generatedAt: file.generated_at } : {}),
    };
};
203
/**
 * Build a quality table from the hardcoded prior (`__QUALITY_PRIOR_TABLE`).
 *
 * Each prior value is read as a probability and converted into synthetic
 * `(successes, trials)` counts with `trials = PRIOR_N`, so that a Wilson
 * interval can be computed for it.
 *
 * The synthesized successes are clamped to `[1, PRIOR_N - 1]`: the prior is
 * weak evidence, never a certainty. A "0.95 quality" entry must not turn
 * into "perfect on 10/10 synthetic trials", which would let a strict caller
 * (`qualityBar = 1.0`) route to a model with no real measurements behind it.
 *
 * A prior entry that is missing or not a finite number is replaced by
 * `DEFAULT_QUALITY` instead of producing `NaN` counts, which would make
 * every later lookup of that cell throw.
 *
 * @returns `{ source: "prior", cells }` with one cell per
 *   (model in the prior table, task class in `TASK_CLASSES`).
 */
const buildPriorTable = () => {
    const cells = {};
    for (const [modelId, row] of Object.entries(__QUALITY_PRIOR_TABLE)) {
        const inner = {};
        for (const taskKey of TASK_CLASSES) {
            const rawPrior = row !== null && row !== undefined ? row[taskKey] : undefined;
            const p = typeof rawPrior === "number" && Number.isFinite(rawPrior) ? rawPrior : DEFAULT_QUALITY;
            // Round to the nearest whole count, then clamp into the open
            // interval (0, PRIOR_N) so the prior never claims 0% or 100%.
            const rounded = Math.round(p * PRIOR_N);
            const successes = Math.min(PRIOR_N - 1, Math.max(1, rounded));
            inner[taskKey] = { successes, trials: PRIOR_N };
        }
        cells[modelId] = inner;
    }
    return { source: "prior", cells };
};
232
+ // ---------------------------------------------------------------------------
233
+ // Memoization
234
+ // ---------------------------------------------------------------------------
235
/** Memoized quality table; stays `undefined` until `getQualityTable` fills it. */
let qualityTableCache;
/**
 * Decide which file the quality table should be read from.
 *
 * @returns The value of the `QUALITY_TABLE_PATH_ENV_VAR` environment
 *   variable when it is set to a non-empty string, otherwise
 *   `DEFAULT_QUALITY_TABLE_PATH`.
 */
const resolveQualityTablePath = () => {
    const override = process.env[QUALITY_TABLE_PATH_ENV_VAR];
    const hasOverride = override !== undefined && override.length > 0;
    return hasOverride ? override : DEFAULT_QUALITY_TABLE_PATH;
};
242
/**
 * Load the quality table from disk, bypassing the memoization cache.
 *
 * Any failure to read the file (the expected state before the eval harness
 * has produced one) results in the seeded prior so routing stays usable.
 * Errors raised while parsing a file that WAS read are not caught here and
 * propagate to the caller.
 *
 * @returns The parsed table, or the prior table when the file is unreadable.
 */
const loadQualityTable = () => {
    const tablePath = resolveQualityTablePath();
    const readTextOrUndefined = () => {
        try {
            return readFileSync(tablePath, "utf8");
        }
        catch {
            return undefined;
        }
    };
    const contents = readTextOrUndefined();
    if (contents === undefined) {
        return buildPriorTable();
    }
    return parseQualityFile(tablePath, contents);
};
255
/**
 * Return the quality table, loading it on first use and serving the
 * memoized copy from `qualityTableCache` afterwards.
 */
const getQualityTable = () => {
    if (qualityTableCache === undefined) {
        qualityTableCache = loadQualityTable();
    }
    return qualityTableCache;
};
261
/**
 * Test-only hook that drops the memoized quality table, so the next lookup
 * loads it again. Mirrors `__resetCalibrationCacheForTest` in `cost.ts`.
 * It is exported under the `__` prefix convention to signal that it is not
 * part of the public API surface.
 */
export const __resetQualityCacheForTest = () => {
    qualityTableCache = void 0;
};
270
+ // ---------------------------------------------------------------------------
271
+ // Public API
272
+ // ---------------------------------------------------------------------------
273
/**
 * Point estimate of the expected quality for `(taskClass, modelId)`.
 *
 * Keeps the same signature as `quality_prior.predictQuality` so existing
 * callers need no change. The value is the `mean` reported by
 * `predictQualityWithCI`; use that function directly when the uncertainty
 * matters.
 *
 * @param taskClass Task class to look up.
 * @param modelId Model id to look up.
 * @returns The mean quality estimate (`DEFAULT_QUALITY = 0.5` for pairs
 *   with no table entry).
 */
export function predictQuality(taskClass, modelId) {
    const { mean } = predictQualityWithCI(taskClass, modelId);
    return mean;
}
287
/**
 * Expected quality for `(taskClass, modelId)` together with a 95%
 * Wilson-score confidence interval.
 *
 * - `mean` is `successes / trials`.
 * - `lo95` / `hi95` is the Wilson score interval at the 95% level.
 * - `n` is the trial count of the cell (`PRIOR_N` for cells synthesized
 *   from the seeded prior and for the fallback below).
 *
 * Pairs without a usable cell return the uniform-prior estimate
 * `{ mean: DEFAULT_QUALITY, lo95, hi95, n: PRIOR_N }`, whose interval is
 * intentionally wide because no data backs it. "Without a usable cell"
 * covers unknown models, unknown task classes, keys that only exist on the
 * object prototype chain (e.g. `"constructor"`, `"toString"`), and cells
 * whose counts are not a valid `0 <= successes <= trials`, `trials > 0`
 * pair.
 *
 * @param taskClass Task class to look up.
 * @param modelId Model id to look up.
 * @returns `{ mean, lo95, hi95, n }`.
 */
export function predictQualityWithCI(taskClass, modelId) {
    const table = getQualityTable();
    // Own-property lookups only: plain-object indexing would also resolve
    // inherited members such as `constructor`, which are not cells.
    const ownValue = (holder, key) => holder !== null &&
        holder !== undefined &&
        Object.prototype.hasOwnProperty.call(holder, key)
        ? holder[key]
        : undefined;
    const cell = ownValue(ownValue(table.cells, modelId), taskClass);
    const usable = cell !== null &&
        typeof cell === "object" &&
        Number.isFinite(cell.successes) &&
        Number.isFinite(cell.trials) &&
        cell.trials > 0 &&
        cell.successes >= 0 &&
        cell.successes <= cell.trials;
    if (!usable) {
        // No data for this pair: uniform prior with PRIOR_N synthetic
        // trials so the interval is still defined.
        const successes = Math.round(DEFAULT_QUALITY * PRIOR_N);
        const { lo, hi } = wilsonScore95(successes, PRIOR_N);
        return {
            mean: DEFAULT_QUALITY,
            lo95: lo,
            hi95: hi,
            n: PRIOR_N,
        };
    }
    const mean = cell.successes / cell.trials;
    const { lo, hi } = wilsonScore95(cell.successes, cell.trials);
    return { mean, lo95: lo, hi95: hi, n: cell.trials };
}
321
/**
 * Introspection helper: reports whether the predictor is serving measured
 * data or the fallback prior, plus the metadata of the loaded file when
 * the table carries it. Intended for the CLI's `route --debug` output and
 * for the paper's reproducibility appendix.
 *
 * @returns `{ source }`, extended with `loadedFrom` and/or `generatedAt`
 *   only when the table defines them.
 */
export function getQualitySourceInfo() {
    const { source, loadedFrom, generatedAt } = getQualityTable();
    return {
        source,
        ...(loadedFrom !== undefined ? { loadedFrom } : {}),
        ...(generatedAt !== undefined ? { generatedAt } : {}),
    };
}
338
+ //# sourceMappingURL=quality_predictor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"quality_predictor.js","sourceRoot":"","sources":["../src/quality_predictor.ts"],"names":[],"mappings":"AAAA,qEAAqE;AACrE,EAAE;AACF,6EAA6E;AAC7E,yBAAyB;AACzB,6EAA6E;AAC7E,sEAAsE;AACtE,uEAAuE;AACvE,2EAA2E;AAC3E,2EAA2E;AAC3E,yDAAyD;AACzD,oEAAoE;AACpE,EAAE;AACF,oCAAoC;AACpC,oEAAoE;AACpE,sEAAsE;AACtE,wEAAwE;AACxE,sEAAsE;AACtE,0EAA0E;AAC1E,uEAAuE;AACvE,yDAAyD;AACzD,EAAE;AACF,6EAA6E;AAC7E,yBAAyB;AACzB,6EAA6E;AAC7E,wEAAwE;AACxE,0EAA0E;AAC1E,oEAAoE;AACpE,wBAAwB;AACxB,8DAA8D;AAC9D,oEAAoE;AACpE,oEAAoE;AACpE,qEAAqE;AACrE,4DAA4D;AAC5D,EAAE;AACF,6EAA6E;AAC7E,uBAAuB;AACvB,6EAA6E;AAC7E,2EAA2E;AAC3E,uEAAuE;AACvE,qDAAqD;AAErD,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,OAAO,EAAE,OAAO,IAAI,WAAW,EAAE,MAAM,WAAW,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,qBAAqB,EAAE,MAAM,oBAAoB,CAAC;AAuB3D,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E;;;;GAIG;AACH,MAAM,IAAI,GAAG,iBAAiB,CAAC;AAE/B;;;;GAIG;AACH,MAAM,eAAe,GAAG,GAAG,CAAC;AAE5B;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,OAAO,GAAG,EAAE,CAAC;AAE1B;;;;;;;GAOG;AACH,MAAM,0BAA0B,GAAG,WAAW,CAC5C,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EACvC,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,MAAM,EACN,SAAS,EACT,oBAAoB,CACrB,CAAC;AAEF;;;GAGG;AACH,MAAM,0BAA0B,GAAG,8BAA8B,CAAC;AAElE,8EAA8E;AAC9E,oCAAoC;AACpC,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,UAAU,aAAa,CAAC,SAAiB,EAAE,MAAc;IAC7D,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC5D,MAAM,IAAI,KAAK,CAAC,4DAA4D,CAAC,CAAC;IAChF,CAAC;IACD,IAAI,MAAM,IAAI,CAAC,EAAE,CAAC;QAChB,MAAM,IAAI,KAAK,CAAC,kDAAkD,CAAC,CAAC;IACtE,CAAC;IACD,IAAI,SAAS,GAAG,CAAC,IAAI,SAAS,GAAG,MAAM,EAAE,CAAC;QACxC,MAAM,IAAI,KAAK,CAAC,iDAAiD,CAAC,CAAC;IACrE,CAAC;IAED,MAAM,CAAC,GAAG,MAAM,CAAC;IACjB,MAAM,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC;IACxB,MAAM,CAAC,GAAG,IAAI,CAAC;IACf,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;IAEjB,MAAM,KAAK,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IACzB,MAAM,MAAM,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC;IAC1C,MAAM,MAAM,GAA
G,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC;IAE7E,qEAAqE;IACrE,uEAAuE;IACvE,oEAAoE;IACpE,sEAAsE;IACtE,IAAI,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,CAAC;IACtC,IAAI,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,CAAC;IACtC,IAAI,SAAS,KAAK,MAAM;QAAE,EAAE,GAAG,CAAC,CAAC;IACjC,IAAI,SAAS,KAAK,CAAC;QAAE,EAAE,GAAG,CAAC,CAAC;IAC5B,OAAO,EAAE,EAAE,EAAE,EAAE,EAAE,CAAC;AACpB,CAAC;AAkDD,MAAM,YAAY,GAA2B,IAAI,GAAG,CAAY;IAC9D,IAAI;IACJ,SAAS;IACT,eAAe;IACf,gBAAgB;IAChB,WAAW;CACZ,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAS,EAAkB,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,CAAc,CAAC,CAAC;AAEpF,MAAM,gBAAgB,GAAG,CAAC,IAAY,EAAE,GAAW,EAAgB,EAAE;IACnE,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC3B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,KAAK,CACb,0BAA0B,IAAI,wBAC5B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CACvD,EAAE,CACH,CAAC;IACJ,CAAC;IACD,IAAI,MAAM,KAAK,IAAI,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QAClD,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,0BAA0B,CAAC,CAAC;IAC5E,CAAC;IACD,MAAM,IAAI,GAAG,MAAwB,CAAC;IACtC,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC;IAC3B,IAAI,OAAO,KAAK,SAAS,IAAI,OAAO,OAAO,KAAK,QAAQ,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;QAC7E,oEAAoE;QACpE,oEAAoE;QACpE,qEAAqE;QACrE,OAAO,eAAe,EAAE,CAAC;IAC3B,CAAC;IAED,MAAM,QAAQ,GAA4D,EAAE,CAAC;IAC7E,KAAK,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;QACzD,IAAI,OAAO,OAAO,KAAK,QAAQ,IAAI,OAAO,KAAK,IAAI;YAAE,SAAS;QAC9D,MAAM,KAAK,GAA4C,EAAE,CAAC;QAC1D,KAAK,MAAM,CAAC,OAAO,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;YACtD,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC;gBAAE,SAAS;YACpC,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,IAAI;gBAAE,SAAS;YACxD,MAAM,SAAS,GAAI,IAAgC,CAAC,SAAS,CAAC;YAC9D,MAAM,MAAM,GAAI,IAA6B,CAAC,MAAM,CAAC;YACrD,IACE,OAAO,SAAS,KAAK,QAAQ;gBAC7B,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;gBAC3B,SAAS,GAAG,CAAC,EACb,CAAC;gBACD,SAAS;Y
ACX,CAAC;YACD,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,MAAM,IAAI,CAAC,EAAE,CAAC;gBAC1E,SAAS;YACX,CAAC;YACD,IAAI,SAAS,GAAG,MAAM;gBAAE,SAAS;YACjC,KAAK,CAAC,OAAO,CAAC,GAAG;gBACf,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC;gBAChC,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;aAC3B,CAAC;QACJ,CAAC;QACD,IAAI,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClC,QAAQ,CAAC,OAAO,CAAC,GAAG,KAAK,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvC,uEAAuE;QACvE,OAAO,eAAe,EAAE,CAAC;IAC3B,CAAC;IAED,OAAO;QACL,MAAM,EAAE,UAAU;QAClB,KAAK,EAAE,QAAQ;QACf,UAAU,EAAE,IAAI;QAChB,GAAG,CAAC,OAAO,IAAI,CAAC,YAAY,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACrF,CAAC;AACJ,CAAC,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,MAAM,eAAe,GAAG,GAAiB,EAAE;IACzC,MAAM,KAAK,GAA4D,EAAE,CAAC;IAC1E,KAAK,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,qBAAqB,CAAC,EAAE,CAAC;QACnE,MAAM,KAAK,GAA4C,EAAE,CAAC;QAC1D,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;YACnC,MAAM,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC,CAAC;YACvB,+DAA+D;YAC/D,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC;YACxC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;YAC9D,KAAK,CAAC,OAAO,CAAC,GAAG,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC;QAClD,CAAC;QACD,KAAK,CAAC,OAAO,CAAC,GAAG,KAAK,CAAC;IACzB,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;AACpC,CAAC,CAAC;AAEF,8EAA8E;AAC9E,cAAc;AACd,8EAA8E;AAE9E,IAAI,iBAA2C,CAAC;AAEhD,MAAM,uBAAuB,GAAG,GAAW,EAAE;IAC3C,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,0BAA0B,CAAC,CAAC;IACxD,IAAI,OAAO,KAAK,SAAS,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,OAAO,CAAC;IAChE,OAAO,0BAA0B,CAAC;AACpC,CAAC,CAAC;AAEF,MAAM,gBAAgB,GAAG,GAAiB,EAAE;IAC1C,MAAM,IAAI,GAAG,uBAAuB,EAAE,CAAC;IACvC,IAAI,GAAW,CAAC;IAChB,IAAI,CAAC;QACH,GAAG,GAAG,YAAY,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACnC,CAAC;IAAC,MAAM,CAAC;QACP,sEAAsE;QACtE,yDAAyD;QACzD,OAAO,eAAe,EAAE,CAAC;IAC3B,CAAC;IACD,OAAO,gBAAgB,CAAC,IAAI,
EAAE,GAAG,CAAC,CAAC;AACrC,CAAC,CAAC;AAEF,MAAM,eAAe,GAAG,GAAiB,EAAE;IACzC,IAAI,iBAAiB,KAAK,SAAS;QAAE,OAAO,iBAAiB,CAAC;IAC9D,iBAAiB,GAAG,gBAAgB,EAAE,CAAC;IACvC,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,0BAA0B,GAAG,GAAS,EAAE;IACnD,iBAAiB,GAAG,SAAS,CAAC;AAChC,CAAC,CAAC;AAEF,8EAA8E;AAC9E,aAAa;AACb,8EAA8E;AAE9E;;;;;;;;;;GAUG;AACH,MAAM,UAAU,cAAc,CAAC,SAAoB,EAAE,OAAe;IAClE,OAAO,oBAAoB,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;AACvD,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,oBAAoB,CAClC,SAAoB,EACpB,OAAe;IAEf,MAAM,KAAK,GAAG,eAAe,EAAE,CAAC;IAChC,MAAM,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACjC,MAAM,IAAI,GAAG,GAAG,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAC5D,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;QACvB,kEAAkE;QAClE,oEAAoE;QACpE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,GAAG,OAAO,CAAC,CAAC;QACxD,MAAM,EAAE,EAAE,EAAE,EAAE,EAAE,GAAG,aAAa,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QACrD,OAAO;YACL,IAAI,EAAE,eAAe;YACrB,IAAI,EAAE,EAAE;YACR,IAAI,EAAE,EAAE;YACR,CAAC,EAAE,OAAO;SACX,CAAC;IACJ,CAAC;IACD,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC;IAC1C,MAAM,EAAE,EAAE,EAAE,EAAE,EAAE,GAAG,aAAa,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;IAC9D,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC;AACtD,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,oBAAoB;IAKlC,MAAM,KAAK,GAAG,eAAe,EAAE,CAAC;IAChC,MAAM,IAAI,GAAgF;QACxF,MAAM,EAAE,KAAK,CAAC,MAAM;KACrB,CAAC;IACF,IAAI,KAAK,CAAC,UAAU,KAAK,SAAS;QAAE,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC,UAAU,CAAC;IACvE,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS;QAAE,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,WAAW,CAAC;IAC1E,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,24 @@
1
+ import type { TaskClass } from "./types.ts";
2
+ /**
3
+ * Internal type: a quality estimate per task class for a single model id.
4
+ * Missing entries fall back to `DEFAULT_QUALITY`.
+ *
+ * NOTE(review): `Record<TaskClass, number>` requires every task class to be
+ * present, so "missing entries" can only arise for untyped callers or a
+ * hand-edited table — confirm which behaviour is intended.
5
+ */
6
+ type QualityRow = Record<TaskClass, number>;
7
+ /**
8
+ * Look up the expected quality for `(taskClass, modelId)` from the seeded
9
+ * prior table. Kept exported for backward-compat with older callers and
10
+ * for tests that need to inspect the raw prior. New code should prefer
11
+ * `predictQuality` / `predictQualityWithCI` from `quality_predictor.ts`,
12
+ * which serves measured data when available and falls back to this
13
+ * table otherwise.
14
+ *
15
+ * Models not in the table fall back to `DEFAULT_QUALITY = 0.5`.
+ *
+ * @param taskClass Task class whose prior is requested.
+ * @param modelId Model id, i.e. a key of the prior table.
+ * @returns The prior quality estimate for the pair.
16
+ */
17
+ export declare function predictQuality(taskClass: TaskClass, modelId: string): number;
18
+ /**
19
+ * Exposed for tests and for introspection by downstream tooling (e.g. the
20
+ * Phase 3 calibration harness, which uses this as a starting prior).
+ * Keys are model ids; each value is that model's `QualityRow`. `Readonly`
+ * is a compile-time annotation on this declaration.
21
+ */
22
+ export declare const __QUALITY_PRIOR_TABLE: Readonly<Record<string, QualityRow>>;
23
+ export {};
24
+ //# sourceMappingURL=quality_prior.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"quality_prior.d.ts","sourceRoot":"","sources":["../src/quality_prior.ts"],"names":[],"mappings":"AAiCA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAE5C;;;GAGG;AACH,KAAK,UAAU,GAAG,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;AA0D5C;;;;;;;;;GASG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,SAAS,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAM5E;AAED;;;GAGG;AACH,eAAO,MAAM,qBAAqB,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CACxD,CAAC"}
@@ -0,0 +1,109 @@
1
+ // quality_prior.ts — seeded fallback prior for the routing engine.
2
+ //
3
+ // PHASE 2 -> PHASE 3 ROLE SHIFT: This file used to be the source of truth
4
+ // for `predictQuality`. In Phase 3 it has been demoted to a fallback prior
5
+ // consumed by `quality_predictor.ts`. The real, calibrated quality
6
+ // estimates now come from measured eval data at
7
+ // `eval/results/quality_table.json`. When that file is missing or
8
+ // produces no usable cells (e.g. project bootstrap, fresh checkout
9
+ // before the eval harness has run), `quality_predictor.ts` synthesizes
10
+ // (successes, trials) counts from this prior table so the Wilson CI is
11
+ // well-defined. The `predictQuality()` function below is kept for
12
+ // backward-compat — `index.ts` now re-exports the predictor's version
13
+ // in preference — but the table data is the load-bearing artifact.
14
+ //
15
+ // Prior-art note: a calibrated
16
+ // pre-call quality predictor on its own is not novel — RouteLLM's matrix
17
+ // factorization, BEST-Route's difficulty heads, and cross-attention
18
+ // routers all do this. Our differentiation comes from pairing the
19
+ // predictor with atlas-grounded empirical token costs PLUS reporting
20
+ // explicit Wilson 95% CIs (so callers can route on a confident lower
21
+ // bound rather than a point estimate), NOT from the predictor's
22
+ // algorithmic structure.
23
+ //
24
+ // Design choices for the prior values:
25
+ // - Bigger / newer models score higher on harder tasks (codegen,
26
+ // reasoning) where parameter count and training recency dominate.
27
+ // - Smaller / cheaper models score competitively on easier tasks
28
+ // (classification, summarization, simple QA) where capability
29
+ // headroom is wasted.
30
+ // - Values are eyeballed from public eval reports (MT-Bench, HumanEval,
31
+ // MMLU, etc.) circa late-2025 / early-2026 — they are NOT measured
32
+ // here. Phase 3 replaces them with measurements.
33
+ const DEFAULT_QUALITY = 0.5; // returned by `predictQuality` below for model ids that are not in the table
34
+ /**
35
+ * Hardcoded per-(model, task) quality prior. The key is the model id from
36
+ * the candidate pool (see `candidates.json`). Values are expected quality
37
+ * in [0, 1] interpreted as "fraction of tasks of this class that this
38
+ * model gets right at the rubric's threshold."
39
+ *
40
+ * No longer the primary source of quality estimates: `quality_predictor.ts`
41
+ * now consumes this table as its fallback prior when no measured data is usable.
+ * The values are eyeballed from public eval reports, not measured here.
42
+ */
43
+ const QUALITY_PRIOR = {
44
+ "claude-opus-4-7": {
45
+ qa: 0.95,
46
+ codegen: 0.93,
47
+ summarization: 0.94,
48
+ classification: 0.96,
49
+ reasoning: 0.95,
50
+ },
51
+ "claude-sonnet-4-6": {
52
+ qa: 0.91,
53
+ codegen: 0.88,
54
+ summarization: 0.92,
55
+ classification: 0.93,
56
+ reasoning: 0.89,
57
+ },
58
+ "claude-haiku-4-5": {
59
+ qa: 0.84,
60
+ codegen: 0.78,
61
+ summarization: 0.86,
62
+ classification: 0.89,
63
+ reasoning: 0.78,
64
+ },
65
+ "llama-3.3-70b": {
66
+ qa: 0.82,
67
+ codegen: 0.79,
68
+ summarization: 0.83,
69
+ classification: 0.87,
70
+ reasoning: 0.76,
71
+ },
72
+ "llama-3.1-8b": {
73
+ qa: 0.68,
74
+ codegen: 0.58,
75
+ summarization: 0.72,
76
+ classification: 0.78,
77
+ reasoning: 0.55,
78
+ },
79
+ "mixtral-8x7b": {
80
+ qa: 0.74,
81
+ codegen: 0.69,
82
+ summarization: 0.77,
83
+ classification: 0.81,
84
+ reasoning: 0.65,
85
+ },
86
+ };
87
/**
 * Look up the expected quality for `(taskClass, modelId)` in the seeded
 * prior table. Kept exported for backward-compat with older callers and
 * for tests that need to inspect the raw prior. New code should prefer
 * `predictQuality` / `predictQualityWithCI` from `quality_predictor.ts`,
 * which serve measured data when available and fall back to this table
 * otherwise.
 *
 * The function always returns a number: `DEFAULT_QUALITY = 0.5` is used
 * when the model is not in the table, when the model's row has no finite
 * numeric entry for `taskClass`, and for keys that only exist on the
 * object prototype chain (e.g. `"constructor"`).
 *
 * @param taskClass Task class to look up.
 * @param modelId Model id to look up.
 * @returns The prior quality in [0, 1], or `DEFAULT_QUALITY`.
 */
export function predictQuality(taskClass, modelId) {
    const hasOwn = Object.prototype.hasOwnProperty;
    if (!hasOwn.call(QUALITY_PRIOR, modelId)) {
        return DEFAULT_QUALITY;
    }
    const row = QUALITY_PRIOR[modelId];
    if (row === null || row === undefined || !hasOwn.call(row, taskClass)) {
        return DEFAULT_QUALITY;
    }
    const value = row[taskClass];
    return typeof value === "number" && Number.isFinite(value) ? value : DEFAULT_QUALITY;
}
104
+ /**
105
+ * Exposed for tests and for introspection by downstream tooling (e.g. the
106
+ * Phase 3 calibration harness, which uses this as a starting prior).
+ * This is an alias of `QUALITY_PRIOR`, not a copy: both names refer to the
+ * same object, which is also what `predictQuality` reads on every call.
107
+ */
108
+ export const __QUALITY_PRIOR_TABLE = QUALITY_PRIOR;
109
+ //# sourceMappingURL=quality_prior.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"quality_prior.js","sourceRoot":"","sources":["../src/quality_prior.ts"],"names":[],"mappings":"AAAA,mEAAmE;AACnE,EAAE;AACF,0EAA0E;AAC1E,2EAA2E;AAC3E,mEAAmE;AACnE,gDAAgD;AAChD,kEAAkE;AAClE,mEAAmE;AACnE,uEAAuE;AACvE,uEAAuE;AACvE,kEAAkE;AAClE,sEAAsE;AACtE,mEAAmE;AACnE,EAAE;AACF,+BAA+B;AAC/B,yEAAyE;AACzE,oEAAoE;AACpE,kEAAkE;AAClE,qEAAqE;AACrE,qEAAqE;AACrE,gEAAgE;AAChE,yBAAyB;AACzB,EAAE;AACF,uCAAuC;AACvC,mEAAmE;AACnE,sEAAsE;AACtE,mEAAmE;AACnE,kEAAkE;AAClE,0BAA0B;AAC1B,0EAA0E;AAC1E,uEAAuE;AACvE,qDAAqD;AAUrD,MAAM,eAAe,GAAG,GAAG,CAAC;AAE5B;;;;;;;;GAQG;AACH,MAAM,aAAa,GAA+B;IAChD,iBAAiB,EAAE;QACjB,EAAE,EAAE,IAAI;QACR,OAAO,EAAE,IAAI;QACb,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,IAAI;QACpB,SAAS,EAAE,IAAI;KAChB;IACD,mBAAmB,EAAE;QACnB,EAAE,EAAE,IAAI;QACR,OAAO,EAAE,IAAI;QACb,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,IAAI;QACpB,SAAS,EAAE,IAAI;KAChB;IACD,kBAAkB,EAAE;QAClB,EAAE,EAAE,IAAI;QACR,OAAO,EAAE,IAAI;QACb,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,IAAI;QACpB,SAAS,EAAE,IAAI;KAChB;IACD,eAAe,EAAE;QACf,EAAE,EAAE,IAAI;QACR,OAAO,EAAE,IAAI;QACb,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,IAAI;QACpB,SAAS,EAAE,IAAI;KAChB;IACD,cAAc,EAAE;QACd,EAAE,EAAE,IAAI;QACR,OAAO,EAAE,IAAI;QACb,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,IAAI;QACpB,SAAS,EAAE,IAAI;KAChB;IACD,cAAc,EAAE;QACd,EAAE,EAAE,IAAI;QACR,OAAO,EAAE,IAAI;QACb,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,IAAI;QACpB,SAAS,EAAE,IAAI;KAChB;CACF,CAAC;AAEF;;;;;;;;;GASG;AACH,MAAM,UAAU,cAAc,CAAC,SAAoB,EAAE,OAAe;IAClE,MAAM,GAAG,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;IACnC,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;QACtB,OAAO,eAAe,CAAC;IACzB,CAAC;IACD,OAAO,GAAG,CAAC,SAAS,CAAC,CAAC;AACxB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAChC,aAAa,CAAC"}
@@ -0,0 +1,27 @@
1
+ import type { ModelCandidate, RouteDecision, RouteRequest } from "./types.ts";
2
+ /**
3
+ * Return the default candidate pool. Useful for callers that want to
4
+ * inspect the pool, filter it, or extend it before routing.
+ *
+ * @returns The shipped default candidates, typed as a read-only array.
5
+ */
6
+ export declare function getDefaultCandidates(): readonly ModelCandidate[];
7
+ /**
8
+ * The public routing entrypoint.
9
+ *
10
+ * Pipeline:
11
+ * 1. Validate the request.
12
+ * 2. Resolve the candidate pool (caller override > shipped default).
13
+ * 3. Filter out candidates that fail the quality bar, the cost budget,
14
+ * or the model's context window. Record reasons.
15
+ * 4. Sort survivors by expected cost ascending (quality breaks ties).
16
+ * 5. Pick the cheapest survivor as `chosen`; the next three as
17
+ * `fallbacks`. Anything below that is in `skipped` only if it failed
18
+ * a constraint — extra cheap-survivors past the fallback list are
19
+ * simply not returned.
20
+ * 6. If no candidate survives, throw with the full skipped list so the
21
+ * caller can see exactly what went wrong.
22
+ *
23
+ * Sync return: this function does no I/O. Returning a promise would be
24
+ * misleading.
+ *
+ * NOTE(review): `quality_predictor.ts` reads the quality table from disk
+ * synchronously on its first lookup. If `route` obtains its quality
+ * estimates from that predictor, the first call does perform (blocking)
+ * file I/O — confirm whether the "no I/O" statement still holds.
+ *
+ * @param request The routing request to evaluate.
+ * @returns The routing decision described in steps 4-5.
+ * @throws When no candidate survives the filters (step 6).
25
+ */
26
+ export declare function route(request: RouteRequest): RouteDecision;
27
+ //# sourceMappingURL=router.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"router.d.ts","sourceRoot":"","sources":["../src/router.ts"],"names":[],"mappings":"AA0CA,OAAO,KAAK,EACV,cAAc,EACd,aAAa,EAGb,YAAY,EAEb,MAAM,YAAY,CAAC;AAcpB;;;GAGG;AACH,wBAAgB,oBAAoB,IAAI,SAAS,cAAc,EAAE,CAEhE;AA4JD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,KAAK,CAAC,OAAO,EAAE,YAAY,GAAG,aAAa,CAsB1D"}