npm - @suluk/models - Versions diffs - 0.1.1 → 0.1.2 - Mend

@suluk/models 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/package.json +1 -1
package/scripts/refresh.ts +6 -3
package/src/index.ts +1 -0
package/src/normalize.ts +3 -2
package/src/openrouter-catalog.json +1 -1
package/src/overlay.ts +43 -0
package/test/catalog.test.ts +12 -6

package/src/overlay.ts ADDED

@@ -0,0 +1,43 @@
+/**
+ * The Class-B TIER OVERLAY — patches `intel.*` coarse tiers onto the facts-only catalog. Tiers are ADOPTED PUBLIC
+ * PRIORS (cited + asOf-stamped), never our measured facts — we do not self-test. The real pass curates them from
+ * BFCL/IFEval/SWE-bench/GPQA/RULER/MMLU-Pro/LMArena snapshots through `applyBucketing` (see REFRESH.md); this file
+ * also ships a SMALL, conservative seed of well-established frontier standings so the catalog's intelligence
+ * dimension isn't entirely UNKNOWN. UNKNOWN axes are left absent (NEVER imputed to worst).
+ */
+import type { ModelCatalog, Tier } from "./types";
+import { catalogFrom } from "./normalize";
+export type IntelAxis = "agenticToolUse" | "instructionFollowing" | "reasoning" | "coding" | "longCtxComprehension" | "knowledge" | "humanPreference"; // the seven intel axes an overlay may set; key type of the per-model tier maps below
+/**
+ * Overlay coarse tiers onto matching rows' intel cells, then re-hash (selection now depends on these tiers).
+ *
+ * Rows whose `id` has no entry in `tiers` are passed through as the same object. For a matching row, `intel` is
+ * shallow-copied and every axis listed with a truthy tier is replaced by a `{ value, source, asOf }` cell; axes
+ * the entry does not name keep their existing cell. The row objects themselves are not mutated in this function.
+ *
+ * @param catalog - Catalog to patch; its `rows` are mapped and its `generatedAt` is forwarded unchanged.
+ * @param tiers - Per-model overrides keyed by row `id`, each a partial map from intel axis to tier.
+ * @param opts - Provenance stamped onto every overlaid cell (`source` and `asOf`).
+ * @returns Whatever `catalogFrom(rows, catalog.generatedAt)` builds. NOTE(review): presumably that is where the
+ *   re-hash mentioned above happens — confirm in `./normalize`.
+ */
+export function applyTierOverlay(catalog: ModelCatalog, tiers: Record<string, Partial<Record<IntelAxis, Tier>>>, opts: { source: string; asOf: string }): ModelCatalog {
+  const rows = catalog.rows.map((r) => {
+    const t = tiers[r.id];
+    if (!t) return r; // no overlay entry for this id: keep the row as-is
+    const intel = { ...r.intel }; // shallow copy so the original row's intel object is left untouched
+    for (const axis of Object.keys(t) as IntelAxis[]) {
+      const v = t[axis];
+      if (v) intel[axis] = { value: v, source: opts.source, asOf: opts.asOf }; // a falsy tier (e.g. explicit undefined) leaves the existing cell in place
+    }
+    return { ...r, intel };
+  });
+  return catalogFrom(rows, catalog.generatedAt); // generatedAt is carried over, not refreshed
+}
+/**
+ * A SMALL, conservatively-CITED seed of coarse public standings for headline frontier models (the bootstrap until
+ * the full Class-B curation lands). These are adopted public-consensus priors at a LOW ceiling — verify at source;
+ * tune at review. Absent axes stay UNKNOWN. Source stamped `public-leaderboard-consensus`.
+ *
+ * The shape is exactly the `tiers` argument of `applyTierOverlay`: keys are `provider/model` strings (presumably
+ * matched against catalog row `id`s), values are partial axis-to-tier maps. Only the tiers "frontier", "strong"
+ * and "mid" occur in this seed. Entries are partial: only `google/gemini-2.5-pro` sets `longCtxComprehension`,
+ * and `meta-llama/llama-4-maverick` sets no `agenticToolUse`.
+ *
+ * NOTE(review): this object holds no source/asOf of its own — the `public-leaderboard-consensus` stamp has to be
+ * supplied as `opts.source` by whoever passes this map to `applyTierOverlay`; confirm at the call site.
+ */
+export const KNOWN_TIERS: Record<string, Partial<Record<IntelAxis, Tier>>> = {
+  "anthropic/claude-opus-4.1": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "frontier", coding: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "anthropic/claude-opus-4": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "frontier", coding: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "anthropic/claude-sonnet-4.5": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "strong", coding: "frontier", knowledge: "strong", humanPreference: "strong" },
+  "google/gemini-2.5-pro": { agenticToolUse: "strong", instructionFollowing: "frontier", reasoning: "frontier", coding: "strong", longCtxComprehension: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "google/gemini-2.5-flash": { agenticToolUse: "strong", instructionFollowing: "strong", reasoning: "mid", coding: "strong", knowledge: "strong", humanPreference: "strong" },
+  "openai/gpt-5": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "frontier", coding: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "deepseek/deepseek-chat-v3.1": { agenticToolUse: "mid", instructionFollowing: "strong", reasoning: "strong", coding: "strong", knowledge: "strong", humanPreference: "strong" },
+  "x-ai/grok-4.3": { agenticToolUse: "strong", instructionFollowing: "strong", reasoning: "frontier", coding: "strong", knowledge: "frontier", humanPreference: "strong" },
+  "meta-llama/llama-4-maverick": { instructionFollowing: "mid", reasoning: "mid", coding: "mid", knowledge: "strong", humanPreference: "mid" },
+};

package/test/catalog.test.ts CHANGED

@@ -9,14 +9,20 @@ describe("@suluk/models — the committed OpenRouter catalog", () => {
     for (const r of OPENROUTER_CATALOG.rows.slice(0, 20)) {
       expect(r.id).toBeTruthy();
       expect(r.cost.inputPerMtok.value === null || typeof r.cost.inputPerMtok.value === "number").toBe(true);
-      // benchmark tiers are UNKNOWN in the facts-only catalog (never imputed)
-      expect(r.intel.agenticToolUse.value).toBeNull();
     }
+    // a few frontier rows carry CITED intel tiers (the Class-B seed); the long tail stays UNKNOWN, never imputed
+    const withTier = OPENROUTER_CATALOG.rows.filter((r) => r.intel.reasoning.value !== null);
+    expect(withTier.length).toBeGreaterThan(0);
+    expect(withTier.length).toBeLessThan(OPENROUTER_CATALOG.rows.length);
+    expect(withTier[0].intel.reasoning.source).toBeTruthy();
   });
-  test("the selector runs end-to-end against the real catalog (tool-reliable, min-context)", () => {
-    const r = selectModel({ needsTools: true, minWindowRequired: 200000 }, { profile: "tool-reliable" }, OPENROUTER_CATALOG);
-    expect(r.ranked.length).toBeGreaterThan(0);
-    expect(r.ranked[0].why.passedFilters).toContain("tool-calling");
+  test("the selector runs end-to-end against the real catalog + uses the overlaid tiers", () => {
+    const tool = selectModel({ needsTools: true, minWindowRequired: 200000 }, { profile: "tool-reliable" }, OPENROUTER_CATALOG);
+    expect(tool.ranked.length).toBeGreaterThan(0);
+    expect(tool.ranked[0].why.passedFilters).toContain("tool-calling");
+    // max-reasoning ⇒ a frontier/strong reasoner floats to the top (the overlay is live)
+    const reason = selectModel({ needsTools: true, minWindowRequired: 200000 }, { profile: "max-reasoning" }, OPENROUTER_CATALOG);
+    expect(["frontier", "strong"]).toContain(reason.ranked[0].why.tierByAxis.intelligence.tier);
   });
 });