npm - @xynogen/pix-data - Versions diffs - 0.2.1 → 0.2.3 - Mend

@xynogen/pix-data 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@xynogen/pix-data",
-	"version": "0.2.1",
+	"version": "0.2.3",
 	"description": "Pi extension — shared model data layer (models.dev + BenchLM), cached at ~/.cache/pi",
 	"type": "module",
 	"main": "src/index.ts",

package/src/data.test.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import { afterEach, beforeEach, describe, expect, it } from "bun:test";
 import {
+	benchlm,
 	buildModelsDevIndex,
 	lookupBenchmark,
 	lookupInIndex,
@@ -227,3 +228,88 @@ describe("modelgrep adapters", () => {
 		expect(lookupBenchmark("nonexistent-model-xyz")).toBeUndefined();
 	});
 });
+// ── benchlm fallback (modelgrep AA null → benchlm) ────────────────────────────
+describe("benchlm fallback", () => {
+	// modelgrep catalog: every model has null benchmarks (real-world shape today)
+	const catalog: ModelGrepModel[] = [
+		mg("anthropic/claude-opus-4-8", { name: "Claude Opus 4.8" }),
+		mg("minimax/minimax-m3", { name: "MiniMax M3" }),
+		mg("deepseek/deepseek-v4-pro", { name: "DeepSeek V4 Pro" }),
+		mg("qwen/qwen3.7-max", { name: "Qwen3.7 Max" }),
+		mg("ghost/uncataloged", { name: "Ghost" }), // not in benchlm either
+	];
+	// benchlm: real shape (no benchmarks field, just overallScore 0-100)
+	const benchlmEntries = [
+		{ rank: 1, model: "Claude Opus 4.8 (Max)", overallScore: 95 },
+		{ rank: 2, model: "Claude Opus 4.8", overallScore: 93 },
+		{ rank: 25, model: "MiniMax M3", overallScore: 78 },
+		{ rank: 39, model: "DeepSeek V4 Pro", overallScore: 68 },
+		{ rank: 10, model: "Qwen3.7 Max", overallScore: 90 },
+	];
+	// Seed both sources through their `_mem` field so no fetch or disk cache is
+	// involved; reset to null afterwards so other suites start clean.
+	beforeEach(() => {
+		(modelgrep as unknown as { _mem: ModelGrepModel[] })._mem = catalog;
+		(benchlm as unknown as { _mem: typeof benchlmEntries })._mem =
+			benchlmEntries;
+	});
+	afterEach(() => {
+		(modelgrep as unknown as { _mem: ModelGrepModel[] | null })._mem = null;
+		(benchlm as unknown as { _mem: typeof benchlmEntries | null })._mem = null;
+	});
+	it("falls back to benchlm when modelgrep benchmarks are null", () => {
+		// Exactly one benchlm entry for this model → its score is used as-is.
+		const b = lookupBenchmark("minimax-m3");
+		expect(b?.overallScore).toBe(78);
+	});
+	it("prefers the higher-scoring benchlm variant when multiple match", () => {
+		// Two candidates normalize to the same key: (Max)=95, base=93 → pick higher
+		const b = lookupBenchmark("claude-opus-4-8");
+		expect(b?.overallScore).toBe(95);
+	});
+	it("returns null when both modelgrep and benchlm lack the model", () => {
+		const b = lookupBenchmark("uncataloged");
+		expect(b?.overallScore).toBeNull();
+	});
+	it("ranks scored models above unscored when only some have benchlm data", () => {
+		// catalog has 5 models, 4 in benchlm → uncataloged sinks to last
+		const b = lookupBenchmark("uncataloged");
+		expect(b?.rank).toBe(5); // 4 scored + 1 unscored at bottom
+	});
+	it("normalizes dots and spaces: qwen3.7-max ↔ Qwen3.7 Max", () => {
+		const b = lookupBenchmark("qwen3.7-max");
+		expect(b?.overallScore).toBe(90);
+	});
+});
+describe("modelgrep AA primary wins over benchlm", () => {
+	const catalog: ModelGrepModel[] = [
+		mg("anthropic/claude-opus-4-8", {
+			bench: { intelligence: 60 }, // AA index: 60/65 * 100 ≈ 92.3 → 92
+		}),
+	];
+	const benchlmEntries = [
+		{ rank: 1, model: "Claude Opus 4.8", overallScore: 50 }, // deliberately lower, so a wrong source is detectable
+	];
+	beforeEach(() => { // seed both sources via their `_mem` field
+		(modelgrep as unknown as { _mem: ModelGrepModel[] })._mem = catalog;
+		(benchlm as unknown as { _mem: typeof benchlmEntries })._mem =
+			benchlmEntries;
+	});
+	afterEach(() => { // reset both `_mem` fields to null after each test
+		(modelgrep as unknown as { _mem: ModelGrepModel[] | null })._mem = null;
+		(benchlm as unknown as { _mem: typeof benchlmEntries | null })._mem = null;
+	});
+	it("uses AA intelligence when present, ignores benchlm", () => {
+		const b = lookupBenchmark("claude-opus-4-8");
+		// 60/65 * 100 = 92.31 → 92, not benchlm's 50
+		expect(b?.overallScore).toBe(92);
+	});
+});

package/src/data.ts CHANGED Viewed

@@ -277,7 +277,7 @@ export const CACHE_DIR = join(
 export const modelgrep = new DataSource<ModelGrepModel[]>({
 	label: "modelgrep",
-	url: `https://modelgrep.com/api/v1/models?benchmarked=1&sort=coding&order=desc&limit=${MODELGREP_PAGE}`,
+	url: `https://modelgrep.com/api/v1/models?sort=coding&order=desc&limit=${MODELGREP_PAGE}`,
 	cachePath: join(CACHE_DIR, "modelgrep.json"),
 	fetchRaw: fetchModelGrepAll,
 	parse: (raw) => (raw as ModelGrepResponse).data ?? [],
@@ -285,6 +285,40 @@ export const modelgrep = new DataSource<ModelGrepModel[]>({
 	empty: [],
 });
+// ── BenchLM (fallback coding-score source) ────────────────────────────────────
+// Upstream `benchlm.ai` ships a 0–100 `overallScore` per model plus a category
+// breakdown (coding/agentic/reasoning/…). It is the fallback score source for
+// models whose modelgrep `benchmarks.artificial_analysis` is null (the current
+// state upstream). Same name as before the 4dfb443 swap.
+interface BenchLMCategoryScores {
+	coding?: number | null;
+	agentic?: number | null;
+	reasoning?: number | null;
+}
+interface BenchLMRawEntry {
+	rank: number; // leaderboard position as reported by benchlm
+	model: string; // display name, e.g. "Claude Opus 4.8 (Max)"
+	creator?: string;
+	overallScore: number | null; // 0–100, or null when benchlm has no score
+	categoryScores?: BenchLMCategoryScores;
+}
+interface BenchLMResponse {
+	lastUpdated?: string;
+	mode?: string;
+	models?: BenchLMRawEntry[]; // the only field the benchlm DataSource reads
+}
+export const benchlm = new DataSource<BenchLMRawEntry[]>({
+	label: "benchlm",
+	url: "https://benchlm.ai/api/data/leaderboard",
+	cachePath: join(CACHE_DIR, "benchlm.json"),
+	parse: (raw) => (raw as BenchLMResponse).models ?? [], // a response without `models` yields an empty list
+	parseCache: (data) => (data as BenchLMResponse)?.models ?? [], // `?.` additionally tolerates a null/undefined cache payload
+	empty: [],
+});
 // ── Lookup helpers ─────────────────────────────────────────────────────────────
 function normalize(id: string): string {
@@ -443,12 +477,24 @@ function codingScore(
 function buildBenchIndex(): Map<string, BenchmarkEntry> {
 	const index = new Map<string, BenchmarkEntry>();
+	// BenchLM lookup table: normalized benchlm name → every entry sharing that
+	// name (base + parenthesized variants), in source order. Built once per call.
+	const benchlmByNorm = new Map<string, BenchLMRawEntry[]>();
+	for (const b of benchlm.getCached()) {
+		const k = normalizeBenchlmName(b.model);
+		if (!k) continue;
+		const arr = benchlmByNorm.get(k) ?? [];
+		arr.push(b);
+		benchlmByNorm.set(k, arr);
+	}
 	// Rank by our computed score (desc); unscored sink to the bottom, holding
 	// source order among themselves.
-	const scored = modelgrep.getCached().map((g) => ({
-		g,
-		score: g.benchmarks ? codingScore(g.benchmarks) : null,
-	}));
+	const scored = modelgrep.getCached().map((g) => {
+		const fromAA = g.benchmarks ? codingScore(g.benchmarks) : null;
+		const score = fromAA ?? lookupBenchlmScore(g, benchlmByNorm);
+		return { g, score };
+	});
 	scored.sort((a, b) => (b.score ?? -1) - (a.score ?? -1));
 	scored.forEach(({ g, score }, i) => {
 		const slug = slugOf(g.id);
@@ -466,6 +512,55 @@ function buildBenchIndex(): Map<string, BenchmarkEntry> {
 	return index;
 }
+// Normalize a benchlm `model` field (e.g. "Claude Opus 4.8 (Max)") to a slug
+// comparable to modelgrep ids (e.g. "claude-opus-4-8"): drops parenthesized
+// variants, lowercases, folds runs of . _ whitespace into "-", trims edge dashes.
+function normalizeBenchlmName(name: string): string {
+	return name
+		.replace(/\s*\([^)]*\)\s*/g, " ") // drop "(Max)", "(High)", etc.; the space keeps neighbouring words apart
+		.toLowerCase()
+		.replace(/[._\s]+/g, "-") // dots, underscores and whitespace become a dash
+		.replace(/-+/g, "-") // collapse repeated dashes
+		.replace(/^-|-$/g, ""); // strip a leading / trailing dash
+}
+// Find a benchlm score for a modelgrep model; returns null when nothing matches
+// or the winning entry has no score. Candidates are pooled from:
+//   1. entries whose normalized name equals the normalized modelgrep slug
+//   2. entries related by a whole-segment prefix in either direction
+//      (claude-opus-4-8 ↔ claude-opus-4-8-thinking). The "-" boundary keeps
+//      look-alikes apart, so gpt-4 does not pick up gpt-4o.
+// The pooled entry with the highest overallScore wins; on a score tie the
+// shorter normalized name wins (base name over suffix variants).
+function lookupBenchlmScore(
+	g: ModelGrepModel,
+	benchlmByNorm: Map<string, BenchLMRawEntry[]>,
+): number | null {
+	const norm = normalize(slugOf(g.id));
+	// An empty slug is a prefix of every key and would inherit the best score
+	// on the whole leaderboard, so treat it as "no match".
+	if (!norm) return null;
+	let best: BenchLMRawEntry | undefined;
+	let bestScore = -Infinity;
+	let bestLength = Infinity;
+	for (const [key, entries] of benchlmByNorm) {
+		const related =
+			key === norm ||
+			key.startsWith(`${norm}-`) ||
+			norm.startsWith(`${key}-`);
+		if (!related) continue;
+		for (const entry of entries) {
+			// Unscored entries rank below every scored one.
+			const score = entry.overallScore ?? -Infinity;
+			const length = normalizeBenchlmName(entry.model).length;
+			// Strict comparisons keep the first-seen entry on a full tie.
+			if (
+				best === undefined ||
+				score > bestScore ||
+				(score === bestScore && length < bestLength)
+			) {
+				best = entry;
+				bestScore = score;
+				bestLength = length;
+			}
+		}
+	}
+	return best?.overallScore ?? null;
+}
 /** Map a benchmark score (0–100) to a semantic color token. */
 export function benchScoreColor(
 	score: number | null | undefined,

package/src/index.ts CHANGED Viewed

@@ -8,7 +8,7 @@
  */
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
-import { modelgrep } from "./data.ts";
+import { benchlm, modelgrep } from "./data.ts";
 export type {
 	BenchmarkEntry,
@@ -20,6 +20,7 @@ export type {
 // Consumers (pix-core, pix-9router, …) import these instead of duplicating
 // the DataSource implementation and models.dev/BenchLM lookups.
 export {
+	benchlm,
 	benchScoreColor,
 	buildModelsDevIndex,
 	CACHE_DIR,
@@ -33,4 +34,5 @@ export {
 export default function (_pi: ExtensionAPI): void {
 	void modelgrep.get(); // result discarded via `void`; presumably warms the modelgrep data early — confirm against DataSource.get
+	void benchlm.get(); // same fire-and-forget call for the benchlm fallback source
 }