@xynogen/pix-data 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,20 +1,101 @@
1
1
  # pix-data
2
2
 
3
- Pi coding agent extension — shared model data layer. Fetches and caches [models.dev](https://models.dev) metadata and [BenchLM](https://benchlm.ai) leaderboard data to `~/.cache/pi/` on session start, so other extensions can read them synchronously without redundant network calls.
3
+ Pi coding agent extension — shared model data layer. Fetches and caches the
4
+ [modelgrep](https://modelgrep.com) model catalog to `~/.cache/pi/` on session
5
+ start, so other extensions (model picker, footer, subagent resolver) can read
6
+ context window, pricing, and a coding-focused score/rank synchronously without
7
+ redundant network calls.
8
+
9
+ ## Data source
10
+
11
+ All data comes from a **single source**: [modelgrep.com](https://modelgrep.com)
12
+ (`/api/v1/models?benchmarked=1&sort=coding`). Free, no API key, ~190 benchmarked
13
+ models with real model ids. modelgrep aggregates benchmark numbers from
14
+ [Artificial Analysis](https://artificialanalysis.ai).
15
+
16
+ - **Context window + pricing** — taken verbatim from modelgrep.
17
+ - **Score** — computed locally from the raw benchmark fields (see below).
18
+ - **Rank** — the model's position once the whole catalog is sorted by that score
19
+ (best = `#1`). Unscored models sink to the bottom.
20
+
21
+ Cached 24h → `~/.cache/pi/modelgrep.json`. On outage the stale cache keeps the
22
+ picker working until it can refresh.
23
+
24
+ ## Scoring methodology
25
+
26
+ **Primary score = [Artificial Analysis Intelligence Index](https://artificialanalysis.ai/methodology/intelligence-benchmarking)**
27
+ when available — AA's authoritative composite of 9 independent evals (agents,
28
+ coding, scientific reasoning, general), already weighted toward agentic work.
29
+ It is rescaled to 0–100 (`intelligence / 65 × 100`; the current leader scores
30
+ ~65).
31
+
32
+ **Fallback = a coding-and-agentic heuristic** for the ~84% of models AA has not
33
+ index-scored, computed from the raw benchmarks below, then mapped onto the index
34
+ scale by a least-squares line. Both the heuristic weights *and* the line were
35
+ jointly tuned against the index on the models that carry *both* it and the raw
36
+ benches (`index100 ≈ 120.6·heuristic − 10.6`, deduped n=29, R²=0.901,
37
+ leave-one-out RMSE 6.55pt) — a data calibration, not a guessed penalty. The
38
+ picker exists to choose a model *for coding work in an agent*, so the heuristic
39
+ is weighted toward exactly that:
40
+
41
+ | bench | range | measures |
42
+ |---|---|---|
43
+ | `coding` | 0–100 | code generation index |
44
+ | `scicode` | 0–1 | scientific coding |
45
+ | `tau2` | 0–1 | agentic tool-use |
46
+ | `agentic` | 0–100 | agentic index |
47
+ | `gpqa` | 0–1 | graduate-level reasoning |
48
+ | `hle` | 0–1 | hard-exam reasoning |
49
+
50
+ When the index is absent, three sub-scores combine, each a weighted blend of
51
+ its benches (all normalized to 0–1):
52
+
53
+ ```
54
+ coding_score = 0.60·(coding/100) + 0.40·scicode
55
+ agentic_score = 0.70·tau2 + 0.30·(agentic/100)
56
+ reasoning_score = 0.60·gpqa + 0.40·hle
57
+
58
+ heuristic = 0.30·coding_score + 0.60·agentic_score + 0.10·reasoning_score
59
+ score = round(clamp₀₁₀₀(120.6·heuristic − 10.6)) // fitted to the index
60
+ ```
61
+
62
+ **Why a heuristic at all, and why these raw evals only:** the AA Intelligence
63
+ Index *is* the ideal number — but only ~16% of the catalog has it. For the rest
64
+ we rebuild a comparable score from the same family of raw evals. Crucially we
65
+ use each raw eval **once** and never feed `intelligence` *and* its components
66
+ together, nor any `_pct` field (which is just a percentile-rank of a raw field)
67
+ — doing so would double-count the same measurement and silently inflate weights
68
+ you can't see. Independent inputs only → honest weighted average.
69
+
70
+ **Why these weights:** an agentic coding model lives or dies on *tool-calling*
71
+ and *code generation*, so `agentic_score` (0.60) and `coding_score` (0.30)
72
+ carry the score; pure reasoning (0.10) is a tiebreaker, not the headline. The
73
+ split is not arbitrary — a grid search over weight combinations, scored by how
74
+ well the heuristic predicts the AA index (leave-one-out cross-validation),
75
+ landed on this agentic-heavy mix. Within each group the dominant bench (`tau2`
76
+ for agentic, raw `coding` for coding, `gpqa` for reasoning) carries most of the weight and a secondary
77
+ bench refines it.
78
+
79
+ **Missing benchmarks:** every blend renormalizes over the fields actually
80
+ present, so a model missing one bench is diluted only *within its own group* —
81
+ it is never zero-penalized or dropped. A model with no benchmarks at all gets a
82
+ `null` score (shown as a bare row) and sorts to the bottom.
83
+
84
+ The exact implementation is `codingScore()` in
85
+ [`src/data.ts`](src/data.ts); the weights are intentionally easy to tune in one
86
+ place if your priorities differ.
4
87
 
5
88
  ## What's included
6
89
 
7
90
  | Export | Description |
8
91
  |---|---|
9
- | `modelsDev` | `DataSource<ModelsDevApi>` — models.dev metadata (context, cost, modalities). TTL 24h → `~/.cache/pi/models.json` |
10
- | `benchmark` | `DataSource<BenchmarkEntry[]>` — BenchLM leaderboard (rank, score, pricing). TTL 24h → `~/.cache/pi/benchlm.json` |
92
+ | `modelgrep` | `DataSource<ModelGrepModel[]>` — the catalog. TTL 24h → `~/.cache/pi/modelgrep.json` |
11
93
  | `DataSource` | Generic cached data source class |
12
94
  | `CACHE_DIR` | Resolved cache directory (`~/.cache/pi`) |
13
- | `buildModelsDevIndex` | Build a lookup `Map` from a `ModelsDevApi` response |
14
- | `lookupInIndex` | Fuzzy-match a router model id against the index |
95
+ | `buildModelsDevIndex` | Build a lookup `Map` from the catalog (context/cost/modalities) |
96
+ | `lookupInIndex` | Fuzzy-match a router model id against an index |
15
97
  | `lookupModelsDev` | Sync lookup by provider + id from in-memory cache |
16
- | `lookupBenchmark` | Fuzzy lookup a model by name from BenchLM cache |
17
- | `fetchModelsDevIndex` | Async — fetch models.dev and return built index |
98
+ | `lookupBenchmark` | Sync lookup of a model by id — returns score + rank + pricing |
18
99
 
19
100
  ## Install
20
101
 
@@ -24,10 +105,15 @@ pi install npm:@xynogen/pix-data
24
105
 
25
106
  ## How it works
26
107
 
27
- On session start the extension fires two parallel background fetches (`modelsDev.get()` + `benchmark.get()`). If the cache is fresh the fetches are skipped. Both cache files live in `~/.cache/pi/` — any Pi extension using the same `DataSource` + cache paths will share data automatically.
108
+ On session start the extension fires a background fetch (`modelgrep.get()`),
109
+ paginating the API until the full benchmarked catalog is retrieved. If the cache
110
+ is fresh the fetch is skipped. The cache file lives in `~/.cache/pi/` — any Pi
111
+ extension using the same `DataSource` shares it automatically.
28
112
 
29
113
  ## Full distro
30
114
 
115
+ Source: [github.com/xynogen/pix-mono](https://github.com/xynogen/pix-mono)
116
+
31
117
  To install the complete pix suite (all packages + Pi itself):
32
118
 
33
119
  ```bash
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@xynogen/pix-data",
3
- "version": "0.1.2",
3
+ "version": "0.2.0",
4
4
  "description": "Pi extension — shared model data layer (models.dev + BenchLM), cached at ~/.cache/pi",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
package/src/data.test.ts CHANGED
@@ -1,107 +1,102 @@
1
1
  import { afterEach, beforeEach, describe, expect, it } from "bun:test";
2
2
  import {
3
- type BenchmarkEntry,
4
- benchmark,
5
3
  buildModelsDevIndex,
6
4
  lookupBenchmark,
7
5
  lookupInIndex,
8
6
  lookupModelsDev,
9
- type ModelsDevApi,
10
- type ModelsDevModel,
11
- modelsDev,
7
+ type ModelGrepModel,
8
+ modelgrep,
12
9
  } from "./data.ts";
13
10
 
11
+ // Compact modelgrep-shaped fixture builder.
12
+ function mg(
13
+ id: string,
14
+ opts: {
15
+ name?: string;
16
+ ctx?: number;
17
+ in?: number;
18
+ out?: number;
19
+ reasoning?: boolean;
20
+ input?: string[];
21
+ // Raw benchmark inputs to codingScore. intelligence (~0–65) wins when
22
+ // present; otherwise coding/agentic (0–100) + rest (0–1) feed the heuristic.
23
+ bench?: {
24
+ intelligence?: number;
25
+ coding?: number;
26
+ agentic?: number;
27
+ gpqa?: number;
28
+ scicode?: number;
29
+ tau2?: number;
30
+ hle?: number;
31
+ };
32
+ } = {},
33
+ ): ModelGrepModel {
34
+ return {
35
+ id,
36
+ name: opts.name ?? id,
37
+ context_length: opts.ctx,
38
+ pricing: { input: opts.in, output: opts.out },
39
+ modality: { input: opts.input },
40
+ capabilities: { reasoning: opts.reasoning },
41
+ benchmarks: { artificial_analysis: { ...opts.bench } },
42
+ };
43
+ }
44
+
14
45
  // ── buildModelsDevIndex ──────────────────────────────────────────────────────
15
46
 
16
47
  describe("buildModelsDevIndex", () => {
17
- const api: ModelsDevApi = {
18
- anthropic: {
19
- models: {
20
- "claude-sonnet-4-5": {
21
- id: "claude-sonnet-4-5",
22
- name: "Claude Sonnet 4.5",
23
- },
24
- "claude-opus-4": {
25
- id: "claude-opus-4",
26
- name: "Claude Opus 4",
27
- reasoning: true,
28
- },
29
- },
30
- },
31
- openai: {
32
- models: {
33
- "gpt-4o": {
34
- id: "gpt-4o",
35
- name: "GPT-4o",
36
- modalities: { input: ["text", "image"] },
37
- },
38
- },
39
- },
40
- };
48
+ const catalog: ModelGrepModel[] = [
49
+ mg("anthropic/claude-sonnet-4-5", { name: "Claude Sonnet 4.5" }),
50
+ mg("anthropic/claude-opus-4", { name: "Claude Opus 4", reasoning: true }),
51
+ mg("openai/gpt-4o", { name: "GPT-4o", input: ["text", "image"] }),
52
+ ];
41
53
 
42
- it("indexes all models by exact id", () => {
43
- const idx = buildModelsDevIndex(api);
54
+ it("indexes all models by slug", () => {
55
+ const idx = buildModelsDevIndex(catalog);
44
56
  expect(idx.has("claude-sonnet-4-5")).toBe(true);
45
57
  expect(idx.has("claude-opus-4")).toBe(true);
46
58
  expect(idx.has("gpt-4o")).toBe(true);
47
59
  });
48
60
 
49
- it("indexes normalized id (strip date suffix)", () => {
50
- const a: ModelsDevApi = {
51
- anthropic: {
52
- models: {
53
- "claude-sonnet-4-5-20250514": {
54
- id: "claude-sonnet-4-5-20250514",
55
- name: "Claude Sonnet 4.5",
56
- },
57
- },
58
- },
59
- };
60
- const idx = buildModelsDevIndex(a);
61
+ it("indexes normalized slug (strip date suffix)", () => {
62
+ const idx = buildModelsDevIndex([
63
+ mg("anthropic/claude-sonnet-4-5-20250514", { name: "Claude Sonnet 4.5" }),
64
+ ]);
61
65
  expect(idx.has("claude-sonnet-4-5")).toBe(true);
62
66
  });
63
67
 
64
- it("handles empty api", () => {
65
- expect(buildModelsDevIndex({}).size).toBe(0);
68
+ it("handles empty catalog", () => {
69
+ expect(buildModelsDevIndex([]).size).toBe(0);
66
70
  });
67
71
 
68
- it("handles provider with no models key", () => {
69
- expect(buildModelsDevIndex({ anthropic: {} }).size).toBe(0);
72
+ it("maps fields onto ModelsDevModel shape", () => {
73
+ const m = buildModelsDevIndex([
74
+ mg("openai/gpt-4o", { ctx: 128000, in: 5, out: 15, input: ["text"] }),
75
+ ]).get("gpt-4o");
76
+ expect(m?.limit?.context).toBe(128000);
77
+ expect(m?.cost?.input).toBe(5);
78
+ expect(m?.cost?.output).toBe(15);
79
+ expect(m?.modalities?.input).toEqual(["text"]);
70
80
  });
71
81
 
72
- it("preserves first-seen on id collision", () => {
73
- const a: ModelsDevApi = {
74
- a: { models: { "gpt-4o": { id: "gpt-4o", name: "First" } } },
75
- b: { models: { "gpt-4o": { id: "gpt-4o", name: "Second" } } },
76
- };
77
- expect(buildModelsDevIndex(a).get("gpt-4o")?.name).toBe("First");
82
+ it("preserves first-seen on slug collision", () => {
83
+ const idx = buildModelsDevIndex([
84
+ mg("a/gpt-4o", { name: "First" }),
85
+ mg("b/gpt-4o", { name: "Second" }),
86
+ ]);
87
+ expect(idx.get("gpt-4o")?.name).toBe("First");
78
88
  });
79
89
  });
80
90
 
81
91
  // ── lookupInIndex ────────────────────────────────────────────────────────────
82
92
 
83
93
  describe("lookupInIndex", () => {
84
- let index: Map<string, ModelsDevModel>;
85
-
86
- beforeEach(() => {
87
- index = buildModelsDevIndex({
88
- anthropic: {
89
- models: {
90
- "claude-sonnet-4-5": {
91
- id: "claude-sonnet-4-5",
92
- name: "Claude Sonnet 4.5",
93
- },
94
- "claude-opus-4": { id: "claude-opus-4", name: "Claude Opus 4" },
95
- },
96
- },
97
- openai: {
98
- models: {
99
- "gpt-4o": { id: "gpt-4o", name: "GPT-4o" },
100
- "o3-mini": { id: "o3-mini", name: "o3 mini" },
101
- },
102
- },
103
- });
104
- });
94
+ const index = buildModelsDevIndex([
95
+ mg("anthropic/claude-sonnet-4-5", { name: "Claude Sonnet 4.5" }),
96
+ mg("anthropic/claude-opus-4", { name: "Claude Opus 4" }),
97
+ mg("openai/gpt-4o", { name: "GPT-4o" }),
98
+ mg("openai/o3-mini", { name: "o3 mini" }),
99
+ ]);
105
100
 
106
101
  it("finds exact match", () => {
107
102
  expect(lookupInIndex("claude-sonnet-4-5", index)?.name).toBe(
@@ -142,108 +137,93 @@ describe("lookupInIndex", () => {
142
137
  });
143
138
  });
144
139
 
145
- // ── lookupModelsDev ───────────────────────────────────────────────────────────
140
+ // ── modelgrep adapters (lookupModelsDev + lookupBenchmark) ────────────────────
141
+
142
+ describe("modelgrep adapters", () => {
143
+ const catalog: ModelGrepModel[] = [
144
+ mg("anthropic/claude-haiku-4.5", {
145
+ name: "Anthropic: Claude Haiku 4.5",
146
+ ctx: 200000,
147
+ in: 1,
148
+ out: 5,
149
+ input: ["text", "image"],
150
+ bench: {
151
+ coding: 43.9,
152
+ agentic: 16.4,
153
+ gpqa: 0.672,
154
+ scicode: 0.433,
155
+ tau2: 0.547,
156
+ hle: 0.097,
157
+ },
158
+ }),
159
+ mg("tencent/hy3-preview", {
160
+ name: "Tencent: hy3 preview",
161
+ ctx: 256000,
162
+ in: 0,
163
+ out: 0,
164
+ reasoning: true,
165
+ // coding/agentic absent — only raw benches → renormalized over present
166
+ bench: { gpqa: 0.732, scicode: 0.394, tau2: 0.675, hle: 0.063 },
167
+ }),
168
+ mg("ghost/unbenched", { name: "Ghost" }), // no signal at all
169
+ ];
146
170
 
147
- describe("lookupModelsDev", () => {
148
171
  beforeEach(() => {
149
- // Seed in-memory cache directly
150
- (modelsDev as any)._mem = {
151
- anthropic: {
152
- models: {
153
- "claude-sonnet-4-5": {
154
- id: "claude-sonnet-4-5",
155
- name: "Claude Sonnet 4.5",
156
- },
157
- },
158
- },
159
- openai: {
160
- models: {
161
- "gpt-4o": { id: "gpt-4o", name: "GPT-4o" },
162
- },
163
- },
164
- };
172
+ (modelgrep as unknown as { _mem: ModelGrepModel[] })._mem = catalog;
165
173
  });
166
-
167
174
  afterEach(() => {
168
- (modelsDev as any)._mem = null;
175
+ (modelgrep as unknown as { _mem: ModelGrepModel[] | null })._mem = null;
169
176
  });
170
177
 
171
- it("finds by exact provider + id", () => {
172
- expect(lookupModelsDev("anthropic", "claude-sonnet-4-5")?.name).toBe(
173
- "Claude Sonnet 4.5",
174
- );
178
+ it("lookupModelsDev finds haiku via slug, ignoring routing prefix + date", () => {
179
+ const m = lookupModelsDev("cc", "claude-haiku-4-5-20251001");
180
+ expect(m?.limit?.context).toBe(200000);
181
+ expect(m?.cost?.input).toBe(1);
175
182
  });
176
183
 
177
- it("falls back across providers when provider miss", () => {
178
- expect(lookupModelsDev("unknown-provider", "gpt-4o")?.name).toBe("GPT-4o");
179
- });
180
-
181
- it("strips path prefix from id", () => {
184
+ it("lookupModelsDev finds hy3 via prefix + suffix strip", () => {
182
185
  expect(
183
- lookupModelsDev("anthropic", "anthropic/claude-sonnet-4-5")?.name,
184
- ).toBe("Claude Sonnet 4.5");
185
- });
186
-
187
- it("returns undefined for unknown model", () => {
188
- expect(lookupModelsDev("anthropic", "nonexistent-xyz")).toBeUndefined();
186
+ lookupModelsDev("openrouter", "tencent/hy3-preview:nitro")?.limit
187
+ ?.context,
188
+ ).toBe(256000);
189
189
  });
190
- });
191
190
 
192
- // ── lookupBenchmark ───────────────────────────────────────────────────────────
193
-
194
- describe("lookupBenchmark", () => {
195
- const entries: BenchmarkEntry[] = [
196
- {
197
- rank: 1,
198
- model: "Claude Sonnet 4.5",
199
- creator: "Anthropic",
200
- overallScore: 95,
201
- inputPrice: 3,
202
- outputPrice: 15,
203
- },
204
- {
205
- rank: 2,
206
- model: "GPT-4o",
207
- creator: "OpenAI",
208
- overallScore: 90,
209
- inputPrice: 5,
210
- outputPrice: 15,
211
- },
212
- {
213
- rank: 3,
214
- model: "Gemini 1.5 Pro",
215
- creator: "Google",
216
- overallScore: 88,
217
- inputPrice: 3.5,
218
- outputPrice: 10.5,
219
- },
220
- ];
221
-
222
- beforeEach(() => {
223
- (benchmark as any)._mem = entries;
191
+ it("lookupModelsDev returns undefined for unknown model", () => {
192
+ expect(lookupModelsDev("cc", "nonexistent-xyz")).toBeUndefined();
224
193
  });
225
194
 
226
- afterEach(() => {
227
- (benchmark as any)._mem = null;
228
- });
229
-
230
- it("finds exact match (case-insensitive, normalized)", () => {
231
- expect(lookupBenchmark("claude sonnet 4.5")?.rank).toBe(1);
195
+ it("lookupBenchmark falls back to fitted heuristic (no intelligence)", () => {
196
+ const b = lookupBenchmark("claude-haiku-4-5-20251001");
197
+ // no intelligence → 120.6·heur − 10.6 → 42
198
+ expect(b?.overallScore).toBe(42);
199
+ expect(b?.rank).toBe(2); // ranked by score: hy3 (58) > haiku (42)
200
+ expect(b?.inputPrice).toBe(1);
201
+ expect(b?.outputPrice).toBe(5);
232
202
  });
233
203
 
234
- it("finds with dashes normalized", () => {
235
- expect(lookupBenchmark("claude-sonnet-4-5")?.rank).toBe(1);
204
+ it("lookupBenchmark renormalizes heuristic over present benches", () => {
205
+ const b = lookupBenchmark("tencent/hy3-preview:nitro");
206
+ // coding/agentic indices absent → heuristic renormalizes → fitted → 58
207
+ expect(b?.overallScore).toBe(58);
208
+ expect(b?.rank).toBe(1);
236
209
  });
237
210
 
238
- it("finds partial match (needle in model)", () => {
239
- expect(lookupBenchmark("gpt-4o")?.rank).toBe(2);
211
+ it("lookupBenchmark returns null score when no benches at all", () => {
212
+ const b = lookupBenchmark("ghost/unbenched");
213
+ expect(b?.overallScore).toBeNull();
214
+ expect(b?.rank).toBe(3); // unscored sinks to the bottom
240
215
  });
241
216
 
242
- it("finds partial match (model in needle)", () => {
243
- expect(lookupBenchmark("gemini 1.5 pro latest")?.rank).toBe(3);
217
+ it("lookupBenchmark prefers AA intelligence index over heuristic", () => {
218
+ (modelgrep as unknown as { _mem: ModelGrepModel[] })._mem = [
219
+ mg("openai/gpt-5", { bench: { intelligence: 52, coding: 10 } }),
220
+ ];
221
+ const b = lookupBenchmark("gpt-5");
222
+ // intelligence present → round(52 / 65 * 100) = 80, ignores low coding
223
+ expect(b?.overallScore).toBe(80);
244
224
  });
245
225
 
246
- it("returns undefined for unknown model", () => {
226
+ it("lookupBenchmark returns undefined for unknown model", () => {
247
227
  expect(lookupBenchmark("nonexistent-model-xyz")).toBeUndefined();
248
228
  });
249
229
  });
package/src/data.ts CHANGED
@@ -1,21 +1,18 @@
1
1
  /**
2
2
  * data.ts — shared Pi model data layer
3
3
  *
4
- * Single source of truth for:
5
- * - models.dev metadata (context, cost, modalities) → ~/.cache/pi/models.json TTL 24h
6
- * - BenchLM leaderboard (rank, score, pricing) → ~/.cache/pi/benchlm.json TTL 24h
4
+ * Single source of truth, sourced from modelgrep (coding-sorted), cached at
5
+ * ~/.cache/pi/modelgrep.json (TTL 24h). Provides context, cost, modalities,
6
+ * capabilities, a coding-focused score (AA index or fitted heuristic), and rank.
7
7
  *
8
8
  * Cache files are shared across all Pi extensions — whichever extension loads
9
9
  * first populates the cache; subsequent extensions read from disk.
10
10
  *
11
11
  * Usage:
12
- * import { modelsDev, benchmark } from "./data.ts";
12
+ * import { modelgrep } from "./data.ts";
13
13
  *
14
- * const models = await modelsDev.get(); // async, fetches if stale
15
- * const entries = await benchmark.get();
16
- *
17
- * const models = modelsDev.getCached(); // sync, disk-only, no fetch
18
- * const entries = benchmark.getCached();
14
+ * const catalog = await modelgrep.get(); // async, fetches if stale
15
+ * const catalog = modelgrep.getCached(); // sync, disk-only, no fetch
19
16
  *
20
17
  * import { lookupModelsDev, lookupBenchmark } from "./data.ts";
21
18
  */
@@ -57,10 +54,29 @@ export interface BenchmarkEntry {
57
54
  outputPrice: number | null;
58
55
  }
59
56
 
60
- interface BenchmarkResponse {
61
- lastUpdated?: string;
62
- mode?: string;
63
- models: BenchmarkEntry[];
57
+ export interface ModelGrepModel {
58
+ id: string;
59
+ name?: string;
60
+ context_length?: number;
61
+ pricing?: { input?: number; output?: number };
62
+ modality?: { input?: string[]; output?: string[] };
63
+ capabilities?: { reasoning?: boolean };
64
+ benchmarks?: {
65
+ artificial_analysis?: {
66
+ // AA Intelligence Index — authoritative 9-eval composite (~0–65 range).
67
+ intelligence?: number | null;
68
+ coding?: number | null; // 0–100 index
69
+ agentic?: number | null; // 0–100 index
70
+ gpqa?: number | null; // 0–1
71
+ scicode?: number | null; // 0–1
72
+ tau2?: number | null; // 0–1
73
+ hle?: number | null; // 0–1
74
+ };
75
+ };
76
+ }
77
+
78
+ interface ModelGrepResponse {
79
+ data: ModelGrepModel[];
64
80
  }
65
81
 
66
82
  // ── DataSource ───────────────────────────────────────────────────────────────
@@ -76,6 +92,16 @@ interface DataSourceOptions<T> {
76
92
  empty: T;
77
93
  label: string;
78
94
  skip?: () => boolean;
95
+ /**
96
+ * Optional override for sources that need multiple requests (pagination).
97
+ * Returns the merged raw payload, which is then handed to `parse`/cached
98
+ * exactly as a single response would be.
99
+ */
100
+ fetchRaw?: (
101
+ url: string,
102
+ headers: Record<string, string> | undefined,
103
+ timeoutMs: number,
104
+ ) => Promise<unknown>;
79
105
  }
80
106
 
81
107
  export class DataSource<T> {
@@ -89,6 +115,7 @@ export class DataSource<T> {
89
115
  timeoutMs: 10_000,
90
116
  headers: () => undefined,
91
117
  skip: () => false,
118
+ fetchRaw: defaultFetchRaw,
92
119
  ...opts,
93
120
  };
94
121
  }
@@ -131,14 +158,11 @@ export class DataSource<T> {
131
158
  try {
132
159
  const url =
133
160
  typeof this.opts.url === "function" ? this.opts.url() : this.opts.url;
134
- const response = await fetchWithTimeout(
161
+ const raw = await this.opts.fetchRaw(
135
162
  url,
136
- this.opts.timeoutMs,
137
163
  this.opts.headers(),
164
+ this.opts.timeoutMs,
138
165
  );
139
- if (!response.ok)
140
- throw new Error(`${this.opts.label} fetch failed: ${response.status}`);
141
- const raw = await response.json();
142
166
  const val = this.opts.parse(raw);
143
167
  this._mem = val;
144
168
  void this._writeCache(raw);
@@ -196,6 +220,52 @@ function fetchWithTimeout(
196
220
  );
197
221
  }
198
222
 
223
+ /** Single-request raw fetch — the default DataSource fetch strategy. */
224
+ async function defaultFetchRaw(
225
+ url: string,
226
+ headers: Record<string, string> | undefined,
227
+ timeoutMs: number,
228
+ ): Promise<unknown> {
229
+ const response = await fetchWithTimeout(url, timeoutMs, headers);
230
+ if (!response.ok) throw new Error(`fetch failed: ${response.status}`);
231
+ return response.json();
232
+ }
233
+
234
+ const MODELGREP_PAGE = 200; // modelgrep hard page-size cap
235
+ const MODELGREP_MAX_PAGES = 10; // safety bound (~2000 models)
236
+
237
+ interface ModelGrepPage {
238
+ data?: ModelGrepModel[];
239
+ meta?: { has_more?: boolean; next_offset?: number };
240
+ }
241
+
242
+ /**
243
+ * Paginating fetch for modelgrep: walks `meta.has_more`/`next_offset` and
244
+ * merges every page into one `{ data }` payload so `parse` and the cache see
245
+ * the full catalog as a single response. `url` already carries the query
246
+ * (sort/limit); we only append `&offset=`.
247
+ */
248
+ async function fetchModelGrepAll(
249
+ url: string,
250
+ headers: Record<string, string> | undefined,
251
+ timeoutMs: number,
252
+ ): Promise<{ data: ModelGrepModel[] }> {
253
+ const all: ModelGrepModel[] = [];
254
+ let offset = 0;
255
+ for (let page = 0; page < MODELGREP_MAX_PAGES; page++) {
256
+ const sep = url.includes("?") ? "&" : "?";
257
+ const res = (await defaultFetchRaw(
258
+ `${url}${sep}offset=${offset}`,
259
+ headers,
260
+ timeoutMs,
261
+ )) as ModelGrepPage;
262
+ if (res.data?.length) all.push(...res.data);
263
+ if (!res.meta?.has_more) break;
264
+ offset = res.meta.next_offset ?? offset + MODELGREP_PAGE;
265
+ }
266
+ return { data: all };
267
+ }
268
+
199
269
  // ── Cache dir ─────────────────────────────────────────────────────────────────
200
270
 
201
271
  export const CACHE_DIR = join(
@@ -205,21 +275,13 @@ export const CACHE_DIR = join(
205
275
 
206
276
  // ── Data sources ──────────────────────────────────────────────────────────────
207
277
 
208
- export const modelsDev = new DataSource<ModelsDevApi>({
209
- label: "models.dev",
210
- url: "https://models.dev/api.json",
211
- cachePath: join(CACHE_DIR, "models.json"),
212
- parse: (raw) => raw as ModelsDevApi,
213
- parseCache: (data) => (data as ModelsDevApi) ?? {},
214
- empty: {},
215
- });
216
-
217
- export const benchmark = new DataSource<BenchmarkEntry[]>({
218
- label: "benchlm",
219
- url: "https://benchlm.ai/api/data/leaderboard",
220
- cachePath: join(CACHE_DIR, "benchlm.json"),
221
- parse: (raw) => (raw as BenchmarkResponse).models ?? [],
222
- parseCache: (data) => (data as BenchmarkResponse)?.models ?? [],
278
+ export const modelgrep = new DataSource<ModelGrepModel[]>({
279
+ label: "modelgrep",
280
+ url: `https://modelgrep.com/api/v1/models?benchmarked=1&sort=coding&order=desc&limit=${MODELGREP_PAGE}`,
281
+ cachePath: join(CACHE_DIR, "modelgrep.json"),
282
+ fetchRaw: fetchModelGrepAll,
283
+ parse: (raw) => (raw as ModelGrepResponse).data ?? [],
284
+ parseCache: (data) => (data as ModelGrepResponse)?.data ?? [],
223
285
  empty: [],
224
286
  });
225
287
 
@@ -228,8 +290,9 @@ export const benchmark = new DataSource<BenchmarkEntry[]>({
228
290
  function normalize(id: string): string {
229
291
  return id
230
292
  .toLowerCase()
231
- .replace(/[:@].*$/, "")
232
- .replace(/-\d{8}$/, "");
293
+ .replace(/[:@].*$/, "") // routing suffix (:nitro, @date)
294
+ .replace(/[._]/g, "-") // fold separators: modelgrep `4.5` ↔ Pi routing `4-5`
295
+ .replace(/-\d{8}$/, ""); // trailing -YYYYMMDD
233
296
  }
234
297
 
235
298
  function stripPrefix(id: string): string {
@@ -237,71 +300,172 @@ function stripPrefix(id: string): string {
237
300
  return i >= 0 ? id.slice(i + 1) : id;
238
301
  }
239
302
 
240
- export function buildModelsDevIndex(
241
- api: ModelsDevApi,
242
- ): Map<string, ModelsDevModel> {
243
- const index = new Map<string, ModelsDevModel>();
244
- for (const provider of Object.values(api)) {
245
- if (!provider?.models) continue;
246
- for (const [modelId, model] of Object.entries(provider.models)) {
247
- const m: ModelsDevModel = { ...model, id: modelId };
248
- if (!index.has(modelId)) index.set(modelId, m);
249
- const norm = normalize(modelId);
250
- if (!index.has(norm)) index.set(norm, m);
251
- }
252
- }
253
- return index;
303
+ /** Slug = model id without its maker/provider prefix. */
304
+ function slugOf(id: string): string {
305
+ return id.includes("/") ? id.slice(id.lastIndexOf("/") + 1) : id;
254
306
  }
255
307
 
256
- export function lookupInIndex(
257
- id: string,
258
- index: Map<string, ModelsDevModel>,
259
- ): ModelsDevModel | undefined {
308
+ /**
309
+ * Generic normalized-index lookup: exact slug → normalized slug → fuzzy
310
+ * prefix overlap. Handles routing suffixes (`:nitro`, `@date`, `-YYYYMMDD`)
311
+ * and maker prefixes (e.g. `tencent/hy3-preview:nitro` → `hy3-preview`).
312
+ */
313
+ function findInIndex<T>(id: string, index: Map<string, T>): T | undefined {
260
314
  const stripped = stripPrefix(id);
261
315
  const direct = index.get(stripped) ?? index.get(normalize(stripped));
262
316
  if (direct) return direct;
263
317
  const norm = normalize(stripped);
264
- for (const [key, model] of index) {
265
- if (key.startsWith(norm) || norm.startsWith(key)) return model;
318
+ for (const [key, value] of index) {
319
+ if (key.startsWith(norm) || norm.startsWith(key)) return value;
266
320
  }
267
321
  return undefined;
268
322
  }
269
323
 
270
- export function lookupModelsDev(
271
- provider: string,
324
+ export function lookupInIndex(
272
325
  id: string,
326
+ index: Map<string, ModelsDevModel>,
273
327
  ): ModelsDevModel | undefined {
274
- const data = modelsDev.getCached();
275
- const canonical = id.includes("/") ? id.slice(id.lastIndexOf("/") + 1) : id;
276
- const exact = data[provider]?.models?.[canonical];
277
- if (exact) return exact;
278
- for (const p of Object.keys(data)) {
279
- const hit = data[p]?.models?.[canonical];
280
- if (hit) return hit;
328
+ return findInIndex(id, index);
329
+ }
330
+
331
+ function toModelsDevModel(g: ModelGrepModel): ModelsDevModel {
332
+ return {
333
+ id: slugOf(g.id),
334
+ name: g.name,
335
+ reasoning: g.capabilities?.reasoning,
336
+ modalities: g.modality,
337
+ limit: { context: g.context_length },
338
+ cost: { input: g.pricing?.input, output: g.pricing?.output },
339
+ };
340
+ }
341
+
342
+ export function buildModelsDevIndex(
343
+ source: ModelGrepModel[],
344
+ ): Map<string, ModelsDevModel> {
345
+ const index = new Map<string, ModelsDevModel>();
346
+ for (const g of source) {
347
+ const m = toModelsDevModel(g);
348
+ if (!index.has(m.id)) index.set(m.id, m);
349
+ const norm = normalize(m.id);
350
+ if (!index.has(norm)) index.set(norm, m);
281
351
  }
282
- return undefined;
352
+ return index;
353
+ }
354
+
355
+ export function lookupModelsDev(
356
+ _provider: string,
357
+ id: string,
358
+ ): ModelsDevModel | undefined {
359
+ // Provider prefix differs between Pi routing (cc/ds/openrouter) and modelgrep
360
+ // (anthropic/tencent), so join on the model slug only via the normalized index.
361
+ return findInIndex(id, buildModelsDevIndex(modelgrep.getCached()));
283
362
  }
284
363
 
285
364
  export async function fetchModelsDevIndex(): Promise<
286
365
  Map<string, ModelsDevModel>
287
366
  > {
288
- return buildModelsDevIndex(await modelsDev.get());
367
+ return buildModelsDevIndex(await modelgrep.get());
289
368
  }
290
369
 
291
- function normBench(s: string): string {
292
- return s
293
- .toLowerCase()
294
- .replace(/[-_.]+/g, " ")
295
- .replace(/\s+/g, " ")
296
- .trim();
370
+ // Weighted blend, renormalized over present fields — a missing input dilutes
371
+ // only its own group, never zero-penalizes the whole score.
372
+ function blend(parts: [number, number | null | undefined][]): number | null {
373
+ let weighted = 0;
374
+ let present = 0;
375
+ for (const [w, v] of parts) {
376
+ if (v == null) continue;
377
+ weighted += w * v;
378
+ present += w;
379
+ }
380
+ return present === 0 ? null : weighted / present;
381
+ }
382
+
383
+ const frac = (v: number | null | undefined) => (v == null ? null : v / 100);
384
+
385
+ // AA Intelligence Index ceiling — current leader (Claude Fable 5) scores ~65,
386
+ // so /65 maps the index to ~0–100; this path is not clamped, so an index above 65 would exceed 100.
387
+ const INTELLIGENCE_MAX = 65;
388
+ // Fallback calibration. For the models that carry the index AND the raw benches
389
+ // (deduped overlap, n=29), we fit our heuristic (0–1) to the rescaled index via
390
+ // least-squares:
391
+ // index100 ≈ SLOPE·heuristic + INTERCEPT
392
+ // Heuristic weights below + this line were jointly tuned against the index
393
+ // (R²=0.901, LOOCV-RMSE 6.55pt). Applying it to index-less models maps their
394
+ // heuristic onto the SAME scale as real index scores — a data-fit, not a
395
+ // guessed penalty. Refit if the catalog or weights change.
396
+ const FALLBACK_SLOPE = 120.6;
397
+ const FALLBACK_INTERCEPT = -10.6;
398
+ const clamp01to100 = (x: number) => Math.max(0, Math.min(100, x));
399
+
400
+ // Our coding/agentic-weighted heuristic from the raw evals (each used once —
401
+ // no double-counting with the index). Weights tuned against the AA index:
402
+ // agentic-heavy (.60) since tool-call matters most, coding (.30), reasoning a
403
+ // .10 tiebreaker. Sub-weights likewise fit — tau2 dominates the agentic group.
404
+ function heuristicScore(
405
+ aa: NonNullable<
406
+ NonNullable<ModelGrepModel["benchmarks"]>["artificial_analysis"]
407
+ >,
408
+ ): number | null {
409
+ const coding = blend([
410
+ [0.6, frac(aa.coding)],
411
+ [0.4, aa.scicode],
412
+ ]);
413
+ const agentic = blend([
414
+ [0.7, aa.tau2],
415
+ [0.3, frac(aa.agentic)],
416
+ ]);
417
+ const reasoning = blend([
418
+ [0.6, aa.gpqa],
419
+ [0.4, aa.hle],
420
+ ]);
421
+ return blend([
422
+ [0.3, coding],
423
+ [0.6, agentic],
424
+ [0.1, reasoning],
425
+ ]);
426
+ }
427
+
428
+ // Model score 0–100. Prefer AA's Intelligence Index (authoritative 9-eval
429
+ // composite); when absent, map our heuristic onto the index scale via the
430
+ // fitted line. Null only when nothing is benchmarked.
431
+ function codingScore(
432
+ bench: NonNullable<ModelGrepModel["benchmarks"]>,
433
+ ): number | null {
434
+ const aa = bench.artificial_analysis ?? {};
435
+ if (aa.intelligence != null) {
436
+ return Math.round((aa.intelligence / INTELLIGENCE_MAX) * 100);
437
+ }
438
+ const h = heuristicScore(aa);
439
+ return h == null
440
+ ? null
441
+ : Math.round(clamp01to100(FALLBACK_SLOPE * h + FALLBACK_INTERCEPT));
442
+ }
443
+
444
+ function buildBenchIndex(): Map<string, BenchmarkEntry> {
445
+ const index = new Map<string, BenchmarkEntry>();
446
+ // Rank by our computed score (desc); unscored sink to the bottom, holding
447
+ // source order among themselves.
448
+ const scored = modelgrep.getCached().map((g) => ({
449
+ g,
450
+ score: g.benchmarks ? codingScore(g.benchmarks) : null,
451
+ }));
452
+ scored.sort((a, b) => (b.score ?? -1) - (a.score ?? -1));
453
+ scored.forEach(({ g, score }, i) => {
454
+ const slug = slugOf(g.id);
455
+ const entry: BenchmarkEntry = {
456
+ rank: i + 1,
457
+ model: g.name ?? g.id,
458
+ creator: g.id.split("/")[0] ?? "",
459
+ overallScore: score,
460
+ inputPrice: g.pricing?.input ?? null,
461
+ outputPrice: g.pricing?.output ?? null,
462
+ };
463
+ for (const k of [slug, normalize(slug)])
464
+ if (!index.has(k)) index.set(k, entry);
465
+ });
466
+ return index;
297
467
  }
298
468
 
299
469
  export function lookupBenchmark(modelName: string): BenchmarkEntry | undefined {
300
- const entries = benchmark.getCached();
301
- const needle = normBench(modelName);
302
- return (
303
- entries.find((e) => normBench(e.model) === needle) ??
304
- entries.find((e) => normBench(e.model).includes(needle)) ??
305
- entries.find((e) => needle.includes(normBench(e.model)))
306
- );
470
+ return findInIndex(modelName, buildBenchIndex());
307
471
  }
package/src/index.ts CHANGED
@@ -4,14 +4,15 @@
4
4
  * Warms the shared model data cache on session start so other extensions
5
5
  * (pix-9router, models picker, footer) can read from ~/.cache/pi/* synchronously.
6
6
  *
7
- * Fetches in parallel, non-blocking — Pi session starts immediately.
7
+ * Single non-blocking fetch — Pi session starts immediately.
8
8
  */
9
9
 
10
10
  import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
11
- import { benchmark, modelsDev } from "./data.ts";
11
+ import { modelgrep } from "./data.ts";
12
12
 
13
13
  export type {
14
14
  BenchmarkEntry,
15
+ ModelGrepModel,
15
16
  ModelsDevApi,
16
17
  ModelsDevModel,
17
18
  } from "./data.ts";
@@ -19,7 +20,6 @@ export type {
19
20
  // Consumers (pix-core, pix-9router, …) import these instead of duplicating
20
21
  // the DataSource implementation and the modelgrep-backed lookups.
21
22
  export {
22
- benchmark,
23
23
  buildModelsDevIndex,
24
24
  CACHE_DIR,
25
25
  DataSource,
@@ -27,10 +27,9 @@ export {
27
27
  lookupBenchmark,
28
28
  lookupInIndex,
29
29
  lookupModelsDev,
30
- modelsDev,
30
+ modelgrep,
31
31
  } from "./data.ts";
32
32
 
33
33
  export default function (_pi: ExtensionAPI): void {
34
- void modelsDev.get();
35
- void benchmark.get();
34
+ void modelgrep.get();
36
35
  }