@xynogen/pix-data 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/data.test.ts +86 -0
- package/src/data.ts +100 -5
- package/src/index.ts +3 -1
package/package.json
CHANGED
package/src/data.test.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { afterEach, beforeEach, describe, expect, it } from "bun:test";
|
|
2
2
|
import {
|
|
3
|
+
benchlm,
|
|
3
4
|
buildModelsDevIndex,
|
|
4
5
|
lookupBenchmark,
|
|
5
6
|
lookupInIndex,
|
|
@@ -227,3 +228,88 @@ describe("modelgrep adapters", () => {
|
|
|
227
228
|
expect(lookupBenchmark("nonexistent-model-xyz")).toBeUndefined();
|
|
228
229
|
});
|
|
229
230
|
});
|
|
231
|
+
|
|
232
|
+
// ── benchlm fallback (modelgrep AA null → benchlm) ────────────────────────────
|
|
233
|
+
|
|
234
|
+
describe("benchlm fallback", () => {
  // modelgrep catalog: every model has null benchmarks (real-world shape today),
  // so any score observed below has to come from the benchlm fallback.
  const catalog: ModelGrepModel[] = [
    mg("anthropic/claude-opus-4-8", { name: "Claude Opus 4.8" }),
    mg("minimax/minimax-m3", { name: "MiniMax M3" }),
    mg("deepseek/deepseek-v4-pro", { name: "DeepSeek V4 Pro" }),
    mg("qwen/qwen3.7-max", { name: "Qwen3.7 Max" }),
    mg("ghost/uncataloged", { name: "Ghost" }), // not in benchlm either
  ];
  // benchlm: real shape (no benchmarks field, just overallScore 0-100).
  // The MiniMax pair deliberately lists the weaker variant FIRST so that
  // "pick the highest score" cannot be satisfied by "pick the first row".
  const benchlmEntries = [
    { rank: 1, model: "Claude Opus 4.8 (Max)", overallScore: 95 },
    { rank: 2, model: "Claude Opus 4.8", overallScore: 93 },
    { rank: 31, model: "MiniMax M3 (Lite)", overallScore: 71 },
    { rank: 25, model: "MiniMax M3", overallScore: 78 },
    { rank: 39, model: "DeepSeek V4 Pro", overallScore: 68 },
    { rank: 10, model: "Qwen3.7 Max", overallScore: 90 },
  ];

  // Seed both sources' in-memory slot directly so no network or disk cache is
  // touched; `_mem` is not part of the public type, hence the casts.
  beforeEach(() => {
    (modelgrep as unknown as { _mem: ModelGrepModel[] })._mem = catalog;
    (benchlm as unknown as { _mem: typeof benchlmEntries })._mem =
      benchlmEntries;
  });
  // Reset to null so fixtures never leak into other suites.
  afterEach(() => {
    (modelgrep as unknown as { _mem: ModelGrepModel[] | null })._mem = null;
    (benchlm as unknown as { _mem: typeof benchlmEntries | null })._mem = null;
  });

  it("falls back to benchlm when modelgrep benchmarks are null", () => {
    const b = lookupBenchmark("claude-opus-4-8");
    // Two candidates: (Max)=95, base=93 → pick higher
    expect(b?.overallScore).toBe(95);
  });

  it("prefers the higher-scoring benchlm variant when multiple match", () => {
    const b = lookupBenchmark("minimax-m3");
    // Two candidates: (Lite)=71 listed first, base=78 → still pick higher
    expect(b?.overallScore).toBe(78);
  });

  it("returns null when both modelgrep and benchlm lack the model", () => {
    const b = lookupBenchmark("uncataloged");
    expect(b?.overallScore).toBeNull();
  });

  it("ranks scored models above unscored when only some have benchlm data", () => {
    // catalog has 5 models, 4 in benchlm → uncataloged sinks to last
    const b = lookupBenchmark("uncataloged");
    expect(b?.rank).toBe(5); // 4 scored + 1 unscored at bottom
  });

  it("normalizes dots and parens: qwen3.7-max ↔ Qwen3.7 Max", () => {
    const b = lookupBenchmark("qwen3.7-max");
    expect(b?.overallScore).toBe(90);
  });
});
|
|
289
|
+
|
|
290
|
+
describe("modelgrep AA primary wins over benchlm", () => {
  // Shape of the private in-memory slot both data sources expose at runtime.
  type MemSlot<T> = { _mem: T | null };

  // benchlm knows the same model, but with a deliberately different score so
  // the assertion can tell which source was used.
  const fallbackRows = [
    { rank: 1, model: "Claude Opus 4.8", overallScore: 50 },
  ];
  // The catalog entry carries its own AA intelligence index (60/65 → 92).
  const aaCatalog: ModelGrepModel[] = [
    mg("anthropic/claude-opus-4-8", {
      bench: { intelligence: 60 },
    }),
  ];

  beforeEach(() => {
    (modelgrep as unknown as MemSlot<ModelGrepModel[]>)._mem = aaCatalog;
    (benchlm as unknown as MemSlot<typeof fallbackRows>)._mem = fallbackRows;
  });

  afterEach(() => {
    (modelgrep as unknown as MemSlot<ModelGrepModel[]>)._mem = null;
    (benchlm as unknown as MemSlot<typeof fallbackRows>)._mem = null;
  });

  it("uses AA intelligence when present, ignores benchlm", () => {
    const entry = lookupBenchmark("claude-opus-4-8");
    // 60/65 * 100 ≈ 92.31 → 92, not benchlm's 50
    expect(entry?.overallScore).toBe(92);
  });
});
|
package/src/data.ts
CHANGED
|
@@ -277,7 +277,7 @@ export const CACHE_DIR = join(
|
|
|
277
277
|
|
|
278
278
|
export const modelgrep = new DataSource<ModelGrepModel[]>({
|
|
279
279
|
label: "modelgrep",
|
|
280
|
-
url: `https://modelgrep.com/api/v1/models?
|
|
280
|
+
url: `https://modelgrep.com/api/v1/models?sort=coding&order=desc&limit=${MODELGREP_PAGE}`,
|
|
281
281
|
cachePath: join(CACHE_DIR, "modelgrep.json"),
|
|
282
282
|
fetchRaw: fetchModelGrepAll,
|
|
283
283
|
parse: (raw) => (raw as ModelGrepResponse).data ?? [],
|
|
@@ -285,6 +285,40 @@ export const modelgrep = new DataSource<ModelGrepModel[]>({
|
|
|
285
285
|
empty: [],
|
|
286
286
|
});
|
|
287
287
|
|
|
288
|
+
// ── BenchLM (fallback coding-score source) ────────────────────────────────────
|
|
289
|
+
// Upstream `benchlm.ai` ships a 0–100 `overallScore` per model with category
|
|
290
|
+
// breakdown (coding/agentic/reasoning/…). Used as a fallback when modelgrep's
|
|
291
|
+
// `benchmarks.artificial_analysis` is null (current state). Same name as
|
|
292
|
+
// before the 4dfb443 swap.
|
|
293
|
+
/**
 * Optional per-category breakdown attached to a benchlm leaderboard entry.
 * Every category may be absent or explicitly `null`.
 * NOTE(review): presumably on the same 0–100 scale as `overallScore` — confirm
 * against the upstream payload.
 */
interface BenchLMCategoryScores {
  /** Coding category score, if reported. */
  coding?: number | null;
  /** Agentic category score, if reported. */
  agentic?: number | null;
  /** Reasoning category score, if reported. */
  reasoning?: number | null;
}
|
|
298
|
+
|
|
299
|
+
/**
 * One model row as delivered in the benchlm leaderboard payload.
 * The shape is asserted with a cast when the payload is parsed, not validated
 * at runtime.
 */
interface BenchLMRawEntry {
  /** Rank as supplied by the payload. */
  rank: number;
  /** Display name, e.g. "Claude Opus 4.8 (Max)"; normalized before matching. */
  model: string;
  /** Optional creator label — presumably the publishing vendor; verify upstream. */
  creator?: string;
  /** Overall score (0–100), or `null` when the model has no score. */
  overallScore: number | null;
  /** Optional per-category breakdown. */
  categoryScores?: BenchLMCategoryScores;
}
|
|
306
|
+
|
|
307
|
+
/**
 * Envelope of the benchlm leaderboard response. All fields are optional; only
 * `models` is consumed when the payload is parsed.
 */
interface BenchLMResponse {
  /** Payload metadata; format not established here. */
  lastUpdated?: string;
  /** Payload metadata; meaning not established here. */
  mode?: string;
  /** Leaderboard rows; treated as empty when missing. */
  models?: BenchLMRawEntry[];
}
|
|
312
|
+
|
|
313
|
+
/**
 * Pull the `models` array out of a benchlm payload (fresh or cached).
 *
 * The payload comes from JSON that is cast rather than validated, so this
 * tolerates `null`/`undefined` payloads and a non-array `models` field by
 * returning an empty list instead of throwing or handing a non-iterable to
 * consumers that loop over the result.
 */
function benchlmModelsFrom(payload: unknown): BenchLMRawEntry[] {
  const models = (payload as BenchLMResponse | null | undefined)?.models;
  return Array.isArray(models) ? models : [];
}

/**
 * benchlm leaderboard data source. Both the network response and the cached
 * copy are reduced to the payload's `models` array through the same helper, so
 * the two paths cannot drift apart.
 */
export const benchlm = new DataSource<BenchLMRawEntry[]>({
  label: "benchlm",
  url: "https://benchlm.ai/api/data/leaderboard",
  cachePath: join(CACHE_DIR, "benchlm.json"),
  parse: (raw) => benchlmModelsFrom(raw),
  parseCache: (data) => benchlmModelsFrom(data),
  empty: [],
});
|
|
321
|
+
|
|
288
322
|
// ── Lookup helpers ─────────────────────────────────────────────────────────────
|
|
289
323
|
|
|
290
324
|
function normalize(id: string): string {
|
|
@@ -443,12 +477,24 @@ function codingScore(
|
|
|
443
477
|
|
|
444
478
|
function buildBenchIndex(): Map<string, BenchmarkEntry> {
|
|
445
479
|
const index = new Map<string, BenchmarkEntry>();
|
|
480
|
+
// BenchLM lookup table: normalized benchlm name → entry, indexed in source
|
|
481
|
+
// order (variants sharing a normalized name are grouped together). Built once per call.
|
|
482
|
+
const benchlmByNorm = new Map<string, BenchLMRawEntry[]>();
|
|
483
|
+
for (const b of benchlm.getCached()) {
|
|
484
|
+
const k = normalizeBenchlmName(b.model);
|
|
485
|
+
if (!k) continue;
|
|
486
|
+
const arr = benchlmByNorm.get(k) ?? [];
|
|
487
|
+
arr.push(b);
|
|
488
|
+
benchlmByNorm.set(k, arr);
|
|
489
|
+
}
|
|
490
|
+
|
|
446
491
|
// Rank by our computed score (desc); unscored sink to the bottom, holding
|
|
447
492
|
// source order among themselves.
|
|
448
|
-
const scored = modelgrep.getCached().map((g) =>
|
|
449
|
-
g
|
|
450
|
-
score
|
|
451
|
-
|
|
493
|
+
const scored = modelgrep.getCached().map((g) => {
|
|
494
|
+
const fromAA = g.benchmarks ? codingScore(g.benchmarks) : null;
|
|
495
|
+
const score = fromAA ?? lookupBenchlmScore(g, benchlmByNorm);
|
|
496
|
+
return { g, score };
|
|
497
|
+
});
|
|
452
498
|
scored.sort((a, b) => (b.score ?? -1) - (a.score ?? -1));
|
|
453
499
|
scored.forEach(({ g, score }, i) => {
|
|
454
500
|
const slug = slugOf(g.id);
|
|
@@ -466,6 +512,55 @@ function buildBenchIndex(): Map<string, BenchmarkEntry> {
|
|
|
466
512
|
return index;
|
|
467
513
|
}
|
|
468
514
|
|
|
515
|
+
/**
 * Normalize a benchlm `model` display name (e.g. "Claude Opus 4.8 (Max)") to a
 * slug comparable to modelgrep ids (e.g. "claude-opus-4-8").
 *
 * Steps: drop parenthesized variant tags, lowercase, fold runs of `.`, `_` and
 * whitespace into `-`, collapse repeated dashes, strip the leading/trailing
 * dash.
 *
 * @param name - Raw `model` field of a benchlm entry.
 * @returns The slug, or `""` when nothing usable remains. That includes the
 *   case where the value is not a string at runtime.
 */
function normalizeBenchlmName(name: string): string {
  // The leaderboard JSON is cast to its interface, not validated, so `model`
  // can be missing or non-string at runtime. Returning "" lets the index
  // builder skip that row instead of throwing on `.replace`.
  if (typeof name !== "string") return "";
  return name
    .replace(/\s*\([^)]*\)\s*/g, " ") // drop "(Max)", "(High)", etc.
    .toLowerCase()
    .replace(/[._\s]+/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");
}
|
|
526
|
+
|
|
527
|
+
/**
 * Find a benchlm score for a modelgrep model.
 *
 * Match strategy, in priority order:
 *   1. Exact match between the normalized modelgrep slug and a benchlm key.
 *      Several benchlm rows can share one key (e.g. "Claude Opus 4.8" and
 *      "Claude Opus 4.8 (Max)"); the highest score among them wins.
 *   2. Only when step 1 yields no score: prefix overlap in either direction,
 *      cut on a `-` boundary (claude-opus-4-8 ↔ claude-opus-4-8-thinking),
 *      again taking the highest score.
 *
 * Only the numeric score is returned, so ordering between equally scored rows
 * is irrelevant and no tie-break is applied.
 *
 * @param g - modelgrep model; only `g.id` is read.
 * @param benchlmByNorm - benchlm rows bucketed by normalized benchlm name.
 * @returns The best matching `overallScore`, or `null` when nothing matches or
 *   no matching row carries a score.
 */
function lookupBenchlmScore(
  g: ModelGrepModel,
  benchlmByNorm: Map<string, BenchLMRawEntry[]>,
): number | null {
  const norm = normalize(slugOf(g.id));
  // An empty slug is a prefix of every key; without this guard such a model
  // would inherit the best score on the whole leaderboard.
  if (!norm) return null;

  // Highest usable score in a bucket, ignoring null/undefined/NaN rows.
  const topScore = (rows: readonly BenchLMRawEntry[]): number | null => {
    let top: number | null = null;
    for (const row of rows) {
      const s = row.overallScore;
      if (s == null || Number.isNaN(s)) continue;
      if (top === null || s > top) top = s;
    }
    return top;
  };

  // True when `full` is `prefix` followed by a new dash-separated token, so
  // "gpt-4" matches "gpt-4-turbo" but not "gpt-4o" or "gpt-41".
  const extendsByToken = (prefix: string, full: string): boolean =>
    prefix.length > 0 && full.startsWith(`${prefix}-`);

  // 1. Exact match wins outright, so a base model is never assigned the score
  //    of a longer, better-scoring variant.
  const exact = topScore(benchlmByNorm.get(norm) ?? []);
  if (exact !== null) return exact;

  // 2. Prefix overlap, either side, on a token boundary.
  let best: number | null = null;
  for (const [key, rows] of benchlmByNorm) {
    if (key === norm) continue;
    if (!extendsByToken(norm, key) && !extendsByToken(key, norm)) continue;
    const s = topScore(rows);
    if (s !== null && (best === null || s > best)) best = s;
  }
  return best;
}
|
|
563
|
+
|
|
469
564
|
/** Map a benchmark score (0–100) to a semantic color token. */
|
|
470
565
|
export function benchScoreColor(
|
|
471
566
|
score: number | null | undefined,
|
package/src/index.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
11
|
-
import { modelgrep } from "./data.ts";
|
|
11
|
+
import { benchlm, modelgrep } from "./data.ts";
|
|
12
12
|
|
|
13
13
|
export type {
|
|
14
14
|
BenchmarkEntry,
|
|
@@ -20,6 +20,7 @@ export type {
|
|
|
20
20
|
// Consumers (pix-core, pix-9router, …) import these instead of duplicating
|
|
21
21
|
// the DataSource implementation and models.dev/BenchLM lookups.
|
|
22
22
|
export {
|
|
23
|
+
benchlm,
|
|
23
24
|
benchScoreColor,
|
|
24
25
|
buildModelsDevIndex,
|
|
25
26
|
CACHE_DIR,
|
|
@@ -33,4 +34,5 @@ export {
|
|
|
33
34
|
|
|
34
35
|
/**
 * Extension entry point. Calls `get()` on each data source, modelgrep first
 * and then benchlm, and discards the results via `void`; the extension API
 * handle itself is unused.
 */
export default function (_pi: ExtensionAPI): void {
  for (const source of [modelgrep, benchlm]) {
    void source.get();
  }
}
|