@xynogen/pix-data 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@xynogen/pix-data",
3
- "version": "0.2.2",
3
+ "version": "0.2.3",
4
4
  "description": "Pi extension — shared model data layer (models.dev + BenchLM), cached at ~/.cache/pi",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
package/src/data.test.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import { afterEach, beforeEach, describe, expect, it } from "bun:test";
2
2
  import {
3
+ benchlm,
3
4
  buildModelsDevIndex,
4
5
  lookupBenchmark,
5
6
  lookupInIndex,
@@ -227,3 +228,88 @@ describe("modelgrep adapters", () => {
227
228
  expect(lookupBenchmark("nonexistent-model-xyz")).toBeUndefined();
228
229
  });
229
230
  });
231
+
232
+ // ── benchlm fallback (modelgrep AA null → benchlm) ────────────────────────────
233
+
234
describe("benchlm fallback", () => {
  // modelgrep catalog: every model has null benchmarks (real-world shape
  // today), so any score these tests observe has to come from benchlm.
  const catalog: ModelGrepModel[] = [
    mg("anthropic/claude-opus-4-8", { name: "Claude Opus 4.8" }),
    mg("minimax/minimax-m3", { name: "MiniMax M3" }),
    mg("deepseek/deepseek-v4-pro", { name: "DeepSeek V4 Pro" }),
    mg("qwen/qwen3.7-max", { name: "Qwen3.7 Max" }),
    mg("ghost/uncataloged", { name: "Ghost" }), // not in benchlm either
  ];
  // benchlm: real shape (no benchmarks field, just overallScore 0-100).
  // "Claude Opus 4.8" appears twice on purpose: the "(Max)" variant and the
  // base entry normalize to the same key, which the variant test relies on.
  const benchlmEntries = [
    { rank: 1, model: "Claude Opus 4.8 (Max)", overallScore: 95 },
    { rank: 2, model: "Claude Opus 4.8", overallScore: 93 },
    { rank: 25, model: "MiniMax M3", overallScore: 78 },
    { rank: 39, model: "DeepSeek V4 Pro", overallScore: 68 },
    { rank: 10, model: "Qwen3.7 Max", overallScore: 90 },
  ];

  // Seed both in-memory caches before each test and clear them afterwards so
  // fixtures never leak into neighbouring suites.
  beforeEach(() => {
    (modelgrep as unknown as { _mem: ModelGrepModel[] })._mem = catalog;
    (benchlm as unknown as { _mem: typeof benchlmEntries })._mem =
      benchlmEntries;
  });
  afterEach(() => {
    (modelgrep as unknown as { _mem: ModelGrepModel[] | null })._mem = null;
    (benchlm as unknown as { _mem: typeof benchlmEntries | null })._mem = null;
  });

  it("falls back to benchlm when modelgrep benchmarks are null", () => {
    // Exactly one benchlm entry matches, so this isolates the plain fallback.
    const b = lookupBenchmark("minimax-m3");
    expect(b?.overallScore).toBe(78);
  });

  it("prefers the higher-scoring benchlm variant when multiple match", () => {
    const b = lookupBenchmark("claude-opus-4-8");
    // Two candidates: (Max)=95, base=93 → pick higher
    expect(b?.overallScore).toBe(95);
  });

  it("returns null when both modelgrep and benchlm lack the model", () => {
    const b = lookupBenchmark("uncataloged");
    expect(b?.overallScore).toBeNull();
  });

  it("ranks scored models above unscored when only some have benchlm data", () => {
    // catalog has 5 models, 4 in benchlm → uncataloged sinks to last
    const b = lookupBenchmark("uncataloged");
    expect(b?.rank).toBe(5); // 4 scored + 1 unscored at bottom
  });

  it("normalizes dots and parens: qwen3.7-max ↔ Qwen3.7 Max", () => {
    const b = lookupBenchmark("qwen3.7-max");
    expect(b?.overallScore).toBe(90);
  });
});
289
+
290
describe("modelgrep AA primary wins over benchlm", () => {
  // One modelgrep model that carries its own AA intelligence figure...
  const modelgrepRows: ModelGrepModel[] = [
    mg("anthropic/claude-opus-4-8", {
      bench: { intelligence: 60 }, // AA index: 60/65 → 92
    }),
  ];
  // ...and a benchlm row for the same model with a deliberately different
  // score, so the assertion can tell which source was used.
  const benchlmRows = [{ rank: 1, model: "Claude Opus 4.8", overallScore: 50 }];

  // Views of the two data sources that expose their in-memory cache slot.
  type MemSlot<T> = { _mem: T | null };
  const modelgrepMem = modelgrep as unknown as MemSlot<ModelGrepModel[]>;
  const benchlmMem = benchlm as unknown as MemSlot<typeof benchlmRows>;

  beforeEach(() => {
    modelgrepMem._mem = modelgrepRows;
    benchlmMem._mem = benchlmRows;
  });

  afterEach(() => {
    modelgrepMem._mem = null;
    benchlmMem._mem = null;
  });

  it("uses AA intelligence when present, ignores benchlm", () => {
    // 60/65 * 100 = 92.23 → 92, not benchlm's 50
    expect(lookupBenchmark("claude-opus-4-8")?.overallScore).toBe(92);
  });
});
package/src/data.ts CHANGED
@@ -285,6 +285,40 @@ export const modelgrep = new DataSource<ModelGrepModel[]>({
285
285
  empty: [],
286
286
  });
287
287
 
288
+ // ── BenchLM (fallback coding-score source) ────────────────────────────────────
289
+ // Upstream `benchlm.ai` ships a 0–100 `overallScore` per model with category
290
+ // breakdown (coding/agentic/reasoning/…). Used as a fallback when modelgrep's
291
+ // `benchmarks.artificial_analysis` is null (current state). Same name as
292
+ // before the 4dfb443 swap.
293
/** Per-category BenchLM scores; each category may be missing or null. */
interface BenchLMCategoryScores {
  coding?: number | null;
  agentic?: number | null;
  reasoning?: number | null;
}

/** One leaderboard row as it appears in the BenchLM payload. */
interface BenchLMRawEntry {
  rank: number;
  /** Display name, e.g. "Claude Opus 4.8 (Max)". */
  model: string;
  creator?: string;
  /** 0–100 overall score, or null when the model is unscored. */
  overallScore: number | null;
  categoryScores?: BenchLMCategoryScores;
}

/** Envelope of the leaderboard payload; the rows live under `models`. */
interface BenchLMResponse {
  lastUpdated?: string;
  mode?: string;
  models?: BenchLMRawEntry[];
}

/**
 * Extract the leaderboard rows from a BenchLM payload (fresh or cached).
 *
 * Anything that is not an object with an array `models` field yields `[]`
 * instead of throwing, and rows without a string `model` are dropped because
 * downstream name normalization calls string methods on that field.
 *
 * @param payload - Decoded payload of unknown shape.
 * @returns The usable rows, in payload order.
 */
function parseBenchlmModels(payload: unknown): BenchLMRawEntry[] {
  if (typeof payload !== "object" || payload === null) return [];
  const { models } = payload as BenchLMResponse;
  if (!Array.isArray(models)) return [];
  return models.filter(
    (entry) =>
      typeof entry === "object" &&
      entry !== null &&
      typeof entry.model === "string",
  );
}

/** BenchLM leaderboard data source, cached at `<CACHE_DIR>/benchlm.json`. */
export const benchlm = new DataSource<BenchLMRawEntry[]>({
  label: "benchlm",
  url: "https://benchlm.ai/api/data/leaderboard",
  cachePath: join(CACHE_DIR, "benchlm.json"),
  // Network and cache payloads share one envelope, so they share one parser.
  parse: parseBenchlmModels,
  parseCache: parseBenchlmModels,
  empty: [],
});
321
+
288
322
  // ── Lookup helpers ─────────────────────────────────────────────────────────────
289
323
 
290
324
  function normalize(id: string): string {
@@ -443,12 +477,24 @@ function codingScore(
443
477
 
444
478
  function buildBenchIndex(): Map<string, BenchmarkEntry> {
445
479
  const index = new Map<string, BenchmarkEntry>();
480
+ // BenchLM lookup table: normalized benchlm name → every entry sharing that
481
+ // name (variants like "(Max)" collapse onto one key), in feed order. Built once per call.
482
+ const benchlmByNorm = new Map<string, BenchLMRawEntry[]>();
483
+ for (const b of benchlm.getCached()) {
484
+ const k = normalizeBenchlmName(b.model);
485
+ if (!k) continue;
486
+ const arr = benchlmByNorm.get(k) ?? [];
487
+ arr.push(b);
488
+ benchlmByNorm.set(k, arr);
489
+ }
490
+
446
491
  // Rank by our computed score (desc); unscored sink to the bottom, holding
447
492
  // source order among themselves.
448
- const scored = modelgrep.getCached().map((g) => ({
449
- g,
450
- score: g.benchmarks ? codingScore(g.benchmarks) : null,
451
- }));
493
+ const scored = modelgrep.getCached().map((g) => {
494
+ const fromAA = g.benchmarks ? codingScore(g.benchmarks) : null;
495
+ const score = fromAA ?? lookupBenchlmScore(g, benchlmByNorm);
496
+ return { g, score };
497
+ });
452
498
  scored.sort((a, b) => (b.score ?? -1) - (a.score ?? -1));
453
499
  scored.forEach(({ g, score }, i) => {
454
500
  const slug = slugOf(g.id);
@@ -466,6 +512,55 @@ function buildBenchIndex(): Map<string, BenchmarkEntry> {
466
512
  return index;
467
513
  }
468
514
 
515
/**
 * Turn a benchlm `model` display name into a slug comparable to modelgrep ids,
 * e.g. "Claude Opus 4.8 (Max)" → "claude-opus-4-8".
 *
 * Steps, in order: drop parenthesized variants, lowercase, fold runs of
 * `.`, `_` and whitespace into `-`, collapse repeated dashes, then strip a
 * leading/trailing dash.
 *
 * @param name - BenchLM display name.
 * @returns The slug; empty when nothing but separators/variants remains.
 */
function normalizeBenchlmName(name: string): string {
  // drop "(Max)", "(High)", etc.
  const withoutVariants = name.replace(/\s*\([^)]*\)\s*/g, " ");
  const dashed = withoutVariants.toLowerCase().replace(/[._\s]+/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  return collapsed.replace(/^-|-$/g, "");
}
526
+
527
/**
 * Highest non-null `overallScore` among `entries`, or null when none has one.
 */
function highestBenchlmScore(
  entries: readonly BenchLMRawEntry[],
): number | null {
  let best: number | null = null;
  for (const { overallScore } of entries) {
    if (overallScore == null) continue;
    if (best === null || overallScore > best) best = overallScore;
  }
  return best;
}

/**
 * True when `candidate` is `base` plus a dash-separated suffix that reads as
 * a variant label ("-thinking", "-high") rather than a further version or
 * size component ("-8", "-70b").
 *
 * Dots are folded to dashes during normalization, so "claude-opus-4" is a
 * textual prefix of "claude-opus-4-8" even though they are different models;
 * a suffix that starts with a digit is therefore rejected.
 */
function isBenchlmVariantOf(base: string, candidate: string): boolean {
  if (base.length === 0 || !candidate.startsWith(`${base}-`)) return false;
  const suffix = candidate.slice(base.length + 1);
  return suffix.length > 0 && !/^\d/.test(suffix);
}

/**
 * Find a benchlm score for a modelgrep model.
 *
 * Match strategy, in priority order:
 *   1. Exact match of the normalized modelgrep slug. If several benchlm
 *      entries share that key (e.g. base and "(Max)"), the highest score wins.
 *   2. Only when step 1 yields no score: variant overlap in either direction
 *      (claude-opus-4-8 ↔ claude-opus-4-8-thinking), on a whole-segment
 *      boundary, again taking the highest score.
 *
 * Exact matches take precedence so that a model with its own entry never
 * inherits the score of a longer or shorter sibling.
 *
 * @param g - The modelgrep model to score.
 * @param benchlmByNorm - BenchLM entries grouped by normalized name.
 * @returns The matched 0–100 score, or null when nothing usable matches.
 */
function lookupBenchlmScore(
  g: ModelGrepModel,
  benchlmByNorm: Map<string, BenchLMRawEntry[]>,
): number | null {
  const norm = normalize(slugOf(g.id));
  // An empty slug would otherwise be a prefix of every key.
  if (!norm) return null;

  const exact = highestBenchlmScore(benchlmByNorm.get(norm) ?? []);
  if (exact !== null) return exact;

  let best: number | null = null;
  for (const [key, entries] of benchlmByNorm) {
    if (key === norm) continue;
    if (!isBenchlmVariantOf(norm, key) && !isBenchlmVariantOf(key, norm)) {
      continue;
    }
    const score = highestBenchlmScore(entries);
    if (score !== null && (best === null || score > best)) best = score;
  }
  return best;
}
563
+
469
564
  /** Map a benchmark score (0–100) to a semantic color token. */
470
565
  export function benchScoreColor(
471
566
  score: number | null | undefined,
package/src/index.ts CHANGED
@@ -8,7 +8,7 @@
8
8
  */
9
9
 
10
10
  import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
11
- import { modelgrep } from "./data.ts";
11
+ import { benchlm, modelgrep } from "./data.ts";
12
12
 
13
13
  export type {
14
14
  BenchmarkEntry,
@@ -20,6 +20,7 @@ export type {
20
20
  // Consumers (pix-core, pix-9router, …) import these instead of duplicating
21
21
  // the DataSource implementation and models.dev/BenchLM lookups.
22
22
  export {
23
+ benchlm,
23
24
  benchScoreColor,
24
25
  buildModelsDevIndex,
25
26
  CACHE_DIR,
@@ -33,4 +34,5 @@ export {
33
34
 
34
35
/**
 * Extension entry point. Calls `get()` on the modelgrep and benchlm data
 * sources, in that order, without awaiting either result.
 *
 * @param _pi - Extension API handle; unused here.
 */
export default function (_pi: ExtensionAPI): void {
  for (const source of [modelgrep, benchlm]) {
    // `void` marks the call as deliberately fire-and-forget.
    void source.get();
  }
}