npm - @suluk/models - Versions diffs - 0.1.1 → 0.1.3 - Mend

@suluk/models 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/package.json +1 -1
package/scripts/refresh.ts +6 -3
package/src/index.ts +1 -0
package/src/normalize.ts +3 -2
package/src/openrouter-catalog.json +1 -1
package/src/overlay.ts +43 -0
package/src/types.ts +8 -2
package/test/catalog.test.ts +12 -6

package/src/overlay.ts ADDED Viewed

@@ -0,0 +1,43 @@
+/**
+ * The Class-B TIER OVERLAY — patches `intel.*` coarse tiers onto the facts-only catalog. Tiers are ADOPTED PUBLIC
+ * PRIORS (cited + asOf-stamped), never our measured facts — we do not self-test. The real pass curates them from
+ * BFCL/IFEval/SWE-bench/GPQA/RULER/MMLU-Pro/LMArena snapshots through `applyBucketing` (see REFRESH.md); this file
+ * also ships a SMALL, conservative seed of well-established frontier standings so the catalog's intelligence
+ * dimension isn't entirely UNKNOWN. UNKNOWN axes are left absent (NEVER imputed to worst).
+ */
+import type { ModelCatalog, Tier } from "./types";
+import { catalogFrom } from "./normalize";
+export type IntelAxis = "agenticToolUse" | "instructionFollowing" | "reasoning" | "coding" | "longCtxComprehension" | "knowledge" | "humanPreference"; // the axis names an overlay entry may set; each one is used as a key into a row's `intel` object
+/**
+ * Overlay coarse tiers onto matching rows' intel cells, then re-hash (selection now depends on these tiers).
+ *
+ * Rows are matched by `r.id` against the keys of `tiers`. A row with no entry is passed through as the same object;
+ * a matched row is shallow-copied together with a copied `intel` object, so the input catalog's rows are not mutated.
+ * Only the axes present (and truthy) in the entry are written; every other intel cell keeps its previous value.
+ *
+ * @param catalog - The catalog whose `rows` are patched; its `generatedAt` is forwarded unchanged.
+ * @param tiers - Map of row id to the per-axis tiers to stamp onto that row.
+ * @param opts - Provenance (`source`, `asOf`) written into every cell this call sets.
+ * @returns The result of `catalogFrom(rows, catalog.generatedAt)` over the patched rows.
+ *   NOTE(review): the "re-hash" is presumably performed inside `catalogFrom` (from ./normalize) — confirm there.
+ */
+export function applyTierOverlay(catalog: ModelCatalog, tiers: Record<string, Partial<Record<IntelAxis, Tier>>>, opts: { source: string; asOf: string }): ModelCatalog {
+  const rows = catalog.rows.map((r) => {
+    const t = tiers[r.id];
+    if (!t) return r; // no overlay entry for this id: keep the row object as-is
+    const intel = { ...r.intel }; // shallow copy so the source row's intel object is left untouched
+    for (const axis of Object.keys(t) as IntelAxis[]) {
+      const v = t[axis];
+      if (v) intel[axis] = { value: v, source: opts.source, asOf: opts.asOf }; // missing/undefined axes are skipped, leaving the existing cell in place
+    }
+    return { ...r, intel };
+  });
+  return catalogFrom(rows, catalog.generatedAt);
+}
+/**
+ * A SMALL, conservatively-CITED seed of coarse public standings for headline frontier models (the bootstrap until
+ * the full Class-B curation lands). These are adopted public-consensus priors at a LOW ceiling — verify at source;
+ * tune at review. Absent axes stay UNKNOWN. Source stamped `public-leaderboard-consensus`.
+ *
+ * Keys are model ids in `vendor/model` form, and the value shape matches the `tiers` argument of `applyTierOverlay`.
+ * The map itself holds only tier values: no `source`/`asOf` is stored here, so the provenance stamp has to be
+ * supplied by the caller (via `applyTierOverlay`'s `opts`).
+ * NOTE(review): the `public-leaderboard-consensus` stamp named above is not set anywhere in this file — confirm
+ * the call site passes it.
+ * Only the `frontier`, `strong` and `mid` tiers appear in this seed. `longCtxComprehension` is set for a single
+ * entry (`google/gemini-2.5-pro`), and `agenticToolUse` is omitted for `meta-llama/llama-4-maverick`.
+ */
+export const KNOWN_TIERS: Record<string, Partial<Record<IntelAxis, Tier>>> = {
+  "anthropic/claude-opus-4.1": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "frontier", coding: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "anthropic/claude-opus-4": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "frontier", coding: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "anthropic/claude-sonnet-4.5": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "strong", coding: "frontier", knowledge: "strong", humanPreference: "strong" },
+  "google/gemini-2.5-pro": { agenticToolUse: "strong", instructionFollowing: "frontier", reasoning: "frontier", coding: "strong", longCtxComprehension: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "google/gemini-2.5-flash": { agenticToolUse: "strong", instructionFollowing: "strong", reasoning: "mid", coding: "strong", knowledge: "strong", humanPreference: "strong" },
+  "openai/gpt-5": { agenticToolUse: "frontier", instructionFollowing: "frontier", reasoning: "frontier", coding: "frontier", knowledge: "frontier", humanPreference: "frontier" },
+  "deepseek/deepseek-chat-v3.1": { agenticToolUse: "mid", instructionFollowing: "strong", reasoning: "strong", coding: "strong", knowledge: "strong", humanPreference: "strong" },
+  "x-ai/grok-4.3": { agenticToolUse: "strong", instructionFollowing: "strong", reasoning: "frontier", coding: "strong", knowledge: "frontier", humanPreference: "strong" },
+  "meta-llama/llama-4-maverick": { instructionFollowing: "mid", reasoning: "mid", coding: "mid", knowledge: "strong", humanPreference: "mid" },
+};

package/src/types.ts CHANGED Viewed

@@ -1,6 +1,12 @@
 /**
- * @suluk/models — the catalog schema (council wf_729cde52-cc7). A row is one model+provider endpoint. Decidable
- * OpenRouter facts are NUMBERS/BOOLS; noisy third-party benchmarks are COARSE TIERS (frontier/strong/mid/basic/
+ * @suluk/models — the catalog schema (council wf_729cde52-cc7). A row is keyed BY MODEL (id, capabilities, benchmark
+ * tiers, context window — all per-model). Per-ENDPOINT axes (price/region/data-retention/latency, which differ across
+ * the provider endpoints one model fans out to) belong in a future optional `endpoints[]` sub-list (keying micro-panel
+ * wf_27de1bec-a42: model-keyed HYBRID, @0.6 — RESERVED, not built: OpenRouter routes endpoints at runtime + honors ZDR
+ * via a request flag, and no fleet needs per-endpoint region governance yet). NB until then `gov.region`/`dataRetention`
+ * are per-MODEL and stay UNKNOWN (fail-closed) — do NOT populate a "representative" region (it would silently degrade
+ * fail-closed to fail-OPEN at the endpoint layer — a forged in-region attestation; see C030).
+ * Decidable OpenRouter facts are NUMBERS/BOOLS; noisy third-party benchmarks are COARSE TIERS (frontier/strong/mid/basic/
  * unknown) — never a 2-decimal score (that launders noisy/contaminated public data as precision). Every cell carries
  * {source, asOf}; an unsourced cell is MISSING, never a confident value, and a missing tier is NEVER imputed to
  * worst (that would kill new models). The catalog stores NO cross-axis composite — blending is the selector's job

package/test/catalog.test.ts CHANGED Viewed

@@ -9,14 +9,20 @@ describe("@suluk/models — the committed OpenRouter catalog", () => {
     for (const r of OPENROUTER_CATALOG.rows.slice(0, 20)) {
       expect(r.id).toBeTruthy();
       expect(r.cost.inputPerMtok.value === null || typeof r.cost.inputPerMtok.value === "number").toBe(true);
-      // benchmark tiers are UNKNOWN in the facts-only catalog (never imputed)
-      expect(r.intel.agenticToolUse.value).toBeNull();
     }
+    // a few frontier rows carry CITED intel tiers (the Class-B seed); the long tail stays UNKNOWN, never imputed
+    const withTier = OPENROUTER_CATALOG.rows.filter((r) => r.intel.reasoning.value !== null);
+    expect(withTier.length).toBeGreaterThan(0);
+    expect(withTier.length).toBeLessThan(OPENROUTER_CATALOG.rows.length);
+    expect(withTier[0].intel.reasoning.source).toBeTruthy();
   });
-  test("the selector runs end-to-end against the real catalog (tool-reliable, min-context)", () => {
-    const r = selectModel({ needsTools: true, minWindowRequired: 200000 }, { profile: "tool-reliable" }, OPENROUTER_CATALOG);
-    expect(r.ranked.length).toBeGreaterThan(0);
-    expect(r.ranked[0].why.passedFilters).toContain("tool-calling");
+  test("the selector runs end-to-end against the real catalog + uses the overlaid tiers", () => {
+    const tool = selectModel({ needsTools: true, minWindowRequired: 200000 }, { profile: "tool-reliable" }, OPENROUTER_CATALOG);
+    expect(tool.ranked.length).toBeGreaterThan(0);
+    expect(tool.ranked[0].why.passedFilters).toContain("tool-calling");
+    // max-reasoning ⇒ a frontier/strong reasoner floats to the top (the overlay is live)
+    const reason = selectModel({ needsTools: true, minWindowRequired: 200000 }, { profile: "max-reasoning" }, OPENROUTER_CATALOG);
+    expect(["frontier", "strong"]).toContain(reason.ranked[0].why.tierByAxis.intelligence.tier);
   });
 });