npm - @sanity/ailf - Versions diffs - 4.6.0 → 6.0.0 - Mend

@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (185)

package/dist/pipeline/failure-modes.js CHANGED Viewed

@@ -1,52 +1,47 @@
 /**
  * pipeline/failure-modes.ts
  *
- * Keyword-based failure mode classifier for grader reasoning text,
- * cross-referenced with ceiling decomposition data.
+ * Ceiling-cross-check failure-mode validator + report assembly.
  *
- * Phase 3a of the Scenario Matrix implementation.
+ * The grader emits `failureMode` directly under the per-dimension taxonomy
+ * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
+ * grader's emission as the source of truth and uses the surviving ceiling
+ * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
+ * cross-checks the emitted mode against structural score signals and emits
+ * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
  *
- * The classifier uses two signal sources:
- * 1. Keyword matching on grader reason text (primary)
- * 2. Ceiling decomposition structural signals (supplementary)
+ * The legacy keyword-pattern classifier (and its five regex pattern
+ * constants) was deleted in Plan 03-03 — its production coverage was ~1%
+ * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
+ * is explicitly out of scope.
  *
- * When both sources agree, confidence is boosted. When only ceiling
- * signals are available, they serve as a fallback for unclassified cases.
- *
- * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
+ * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
+ *      failureMode under the per-dimension taxonomy
+ * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
+ *      shape and `ceiling-cross-check` derivation tag
  */
-import { detectFeatureArea } from "../_vendor/ailf-core/index.js";
+import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
 // ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
 /** Only classify judgments with scores below this threshold */
 const CLASSIFICATION_THRESHOLD = 60;
-/** All failure mode types for initializing empty counts */
-const ALL_MODES = [
-    "api-error",
-    "incorrect-docs",
-    "missing-docs",
-    "model-limitation",
-    "outdated-docs",
-    "poor-structure",
-    "unclassified",
-];
-// ---------------------------------------------------------------------------
-// Keyword patterns
-// ---------------------------------------------------------------------------
-/** API error pattern — checked FIRST to prevent timeout errors containing
- *  "deprecated" from being misclassified as outdated-docs. */
-const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|429|503|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
-const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
-const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
-const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
-const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
 // ---------------------------------------------------------------------------
 // Public API
 // ---------------------------------------------------------------------------
 /**
  * Build a complete failure mode report from grader judgments and scores.
  *
+ * The grader-emitted `judgment.failureMode` is the source of truth (Plan
+ * 03-03 — keyword classifier deleted). `validateFailureMode` cross-checks
+ * the emission against ceiling decomposition and stamps a D0049 confidence.
+ *
+ * The `FailureMode` triple shape (`mode`, `confidence`, `source`) is
+ * preserved for backward compatibility with downstream consumers
+ * (gap-analysis, manifest emission) — the bucketed `confidence` enum maps
+ * 1:1 from `Confidence.level`, and `source` is always `"ceiling"` now that
+ * the keyword path is gone.
+ *
  * @param judgments - All grader judgments from the evaluation
  * @param scores - Per-area feature scores (for ceiling decomposition)
  * @returns Failure mode report with per-area breakdowns
@@ -66,9 +61,23 @@ export function buildFailureModeReport(judgments, scores) {
         const areaScore = area ? scoreByArea.get(area) : undefined;
         const ceilingScore = areaScore?.ceilingScore ?? 100;
         const floorScore = areaScore?.floorScore ?? 0;
-        const classification = classifyFailureMode(judgment, ceilingScore, floorScore);
+        // Source the failure mode from the grader's emission. CR-02:
+        // FailureModeType is open-set (`string`) since Plan 03-02
+        // introduced per-dimension extensions (`false-floor`,
+        // `spec-mismatch`, `tool-misuse`, `factual-error`, …). The report
+        // surfaces the emission directly so downstream consumers see the
+        // grader's actual taxonomy choice rather than a collapsed
+        // `"unclassified"` bucket.
+        const emittedMode = readEmittedMode(judgment);
+        // Cross-check the grader's emission against ceiling decomposition.
+        const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
+        const classification = {
+            confidence: stamp.level,
+            mode: emittedMode,
+            source: "ceiling",
+        };
         classifiedJudgments.push({ classification, judgment });
-        summary[classification.mode]++;
+        summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
         // Per-area tracking
         if (area) {
             if (!byArea[area]) {
@@ -79,7 +88,8 @@ export function buildFailureModeReport(judgments, scores) {
                     totalJudgments: 0,
                 };
             }
-            byArea[area].modes[classification.mode]++;
+            byArea[area].modes[classification.mode] =
+                (byArea[area].modes[classification.mode] ?? 0) + 1;
             byArea[area].totalJudgments++;
         }
     }
@@ -99,28 +109,74 @@ export function buildFailureModeReport(judgments, scores) {
     };
 }
 /**
- * Classify the failure mode of a low-scoring grader judgment.
+ * Cross-check a grader-emitted `failureMode` against ceiling decomposition
+ * and emit a D0049 `Confidence` triple stamped with
+ * `derivation: "ceiling-cross-check"`.
  *
- * Uses keyword matching on the reason text, then cross-references with
- * ceiling decomposition data for structural confirmation.
+ * Replaces the deleted keyword-pattern + ceiling combine classifier — the
+ * grader's emission is now the source of truth for the mode itself; this
+ * function only stamps confidence based on whether the structural ceiling
+ * signal agrees.
  *
- * @param judgment - The grader judgment to classify
+ *  - `level: "high"` (`signalsPresent: 2`) — grader emission and ceiling
+ *    decomposition agree on the same mode.
+ *  - `level: "medium"` (`signalsPresent: 2`) — both signals present but
+ *    disagree. The live pipeline increments
+ *    `GraderReliability.failureModeCalibration` ONLY on this branch — a
+ *    true calibration miss requires both signals to be present.
+ *  - `level: "low"` (`signalsPresent: 1`) — only the grader's emission;
+ *    ceiling decomposition produced no structural signal. Not a
+ *    calibration miss (we have nothing to cross-check against).
+ *  - `level: "low"` (`signalsPresent: 0`) — passing scores
+ *    (`>= CLASSIFICATION_THRESHOLD`) don't classify; emit absent.
+ *
+ * @param judgment - The grader judgment carrying `failureMode` + `score`
  * @param ceilingScore - The area's ceiling score (with-docs best case)
  * @param floorScore - The area's floor score (no-docs baseline)
- * @returns Classified failure mode with confidence level
+ * @returns D0049 Confidence triple stamped `derivation: "ceiling-cross-check"`
+ *
+ * @see docs/decisions/D0005-grader-model-separation.md
+ * @see docs/decisions/D0049-shared-confidence-contract.md
  */
-export function classifyFailureMode(judgment, ceilingScore, floorScore) {
-    // Passing scores don't need failure mode analysis
+export function validateFailureMode(judgment, ceilingScore, floorScore) {
+    // Passing scores don't classify — emit low-confidence absent.
     if (judgment.score >= CLASSIFICATION_THRESHOLD) {
-        return { confidence: "low", mode: "unclassified", source: "keyword" };
+        return {
+            level: "low",
+            signalsPresent: 0,
+            derivation: "ceiling-cross-check",
+        };
     }
-    const reason = judgment.reason.toLowerCase();
-    // Step 1: Keyword-based classification
-    const keywordMode = classifyByKeyword(reason);
-    // Step 2: Ceiling-based structural classification
     const ceilingMode = classifyByCeiling(judgment.score, ceilingScore, floorScore);
-    // Step 3: Combine signals
-    return combineClassifications(keywordMode, ceilingMode);
+    if (!ceilingMode) {
+        // No structural ceiling signal — the grader's emission stands but
+        // there's nothing to cross-check against. Surface as low-confidence
+        // (signalsPresent: 1) so the caller can distinguish "we have one
+        // signal, not two" from "the two signals disagree" — and leave
+        // failureModeCalibration alone (folding this case in over-counts
+        // the reliability metric, see CR-04 in the Phase 3 review).
+        return {
+            level: "low",
+            signalsPresent: 1,
+            derivation: "ceiling-cross-check",
+        };
+    }
+    if (ceilingMode.mode === judgment.failureMode) {
+        // Both signals agree → high confidence stamp.
+        return {
+            level: "high",
+            signalsPresent: 2,
+            derivation: "ceiling-cross-check",
+        };
+    }
+    // Both signals present and disagree — the actual calibration-miss
+    // branch. The caller increments GraderReliability.failureModeCalibration
+    // only when signalsPresent === 2 here.
+    return {
+        level: "medium",
+        signalsPresent: 2,
+        derivation: "ceiling-cross-check",
+    };
 }
 // ---------------------------------------------------------------------------
 // Formatting
@@ -134,10 +190,13 @@ export function formatFailureModesConsole(report) {
     lines.push("");
     lines.push(`  ${report.totalJudgments} judgments analyzed, ${report.classificationRate.toFixed(0)}% classified`);
     lines.push("");
-    // Summary table
+    // Summary table — legacy modes first (in canonical order), then any
+    // per-dimension extensions present in the run sorted by count desc.
+    // CR-02: extensions are no longer narrowed to "unclassified"; the
+    // formatter now surfaces them rather than dropping the signal.
     lines.push("  Mode                Count");
     lines.push("  ──────────────────  ─────");
-    for (const mode of ALL_MODES) {
+    for (const mode of orderedSummaryKeys(report.summary)) {
         const count = report.summary[mode] ?? 0;
         if (count > 0) {
             const icon = modeIcon(mode);
@@ -169,10 +228,12 @@ export function formatFailureModesMarkdown(report) {
     }
     lines.push(`**${report.totalJudgments} judgments** analyzed, **${report.classificationRate.toFixed(0)}%** classified`);
     lines.push("");
-    // Summary table
+    // Summary table — legacy modes first, per-dimension extensions after
+    // (CR-02 — emission is now visible in aggregation rather than being
+    // collapsed to 'unclassified').
     lines.push("| Mode | Count | % |");
     lines.push("|------|-------|---|");
-    for (const mode of ALL_MODES) {
+    for (const mode of orderedSummaryKeys(report.summary)) {
         const count = report.summary[mode] ?? 0;
         if (count > 0) {
             const pct = report.totalJudgments > 0
@@ -203,7 +264,31 @@ export function formatFailureModesMarkdown(report) {
 // ---------------------------------------------------------------------------
 // Internal helpers
 // ---------------------------------------------------------------------------
-/** Classify by ceiling decomposition structural signals */
+/**
+ * Read the grader's emitted failureMode as the open-set
+ * `FailureModeType` (string). Per-dimension extensions from Plan 03-02
+ * (`false-floor`, `spec-mismatch`, `tool-misuse`, `factual-error`, …)
+ * survive the report aggregation as their own buckets — narrowing them
+ * to `"unclassified"` (the pre-CR-02 behavior) silently dropped at
+ * least 11 documented legal modes from `report.summary` and
+ * `report.byArea[*].topMode`. An absent or empty `failureMode` still
+ * buckets as `"unclassified"` so consumers see a stable label rather
+ * than an empty key.
+ */
+function readEmittedMode(judgment) {
+    const emitted = judgment.failureMode;
+    if (typeof emitted !== "string" || emitted.length === 0) {
+        return "unclassified";
+    }
+    return emitted;
+}
+/**
+ * Classify by ceiling-decomposition structural signals — preserved
+ * verbatim from the pre-Plan-03-03 implementation. The function itself
+ * does not change; only its CALLER (`validateFailureMode`) changes how
+ * the output is consumed (confidence stamp instead of parallel
+ * classification signal).
+ */
 function classifyByCeiling(score, ceilingScore, floorScore) {
     const docLift = ceilingScore - floorScore;
     // Negative Doc Lift: docs are actively harmful
@@ -228,55 +313,6 @@ function classifyByCeiling(score, ceilingScore, floorScore) {
     }
     return null;
 }
-/** Classify by keyword matching on the reason text */
-function classifyByKeyword(reason) {
-    // API errors checked first — prevents timeout messages containing
-    // "deprecated" from being misclassified as outdated-docs.
-    if (API_ERROR_PATTERN.test(reason)) {
-        return { confidence: "high", mode: "api-error", source: "keyword" };
-    }
-    if (OUTDATED_PATTERN.test(reason)) {
-        return { confidence: "high", mode: "outdated-docs", source: "keyword" };
-    }
-    if (MISSING_PATTERN.test(reason)) {
-        return { confidence: "high", mode: "missing-docs", source: "keyword" };
-    }
-    if (INCORRECT_PATTERN.test(reason)) {
-        return { confidence: "medium", mode: "incorrect-docs", source: "keyword" };
-    }
-    if (POOR_STRUCTURE_PATTERN.test(reason)) {
-        return { confidence: "medium", mode: "poor-structure", source: "keyword" };
-    }
-    return null;
-}
-/**
- * Combine keyword and ceiling classifications.
- *
- * Priority:
- * 1. If both agree on mode → high confidence, source = "keyword+ceiling"
- * 2. If keyword matched → use keyword result
- * 3. If only ceiling matched → use ceiling result (lower confidence)
- * 4. If neither matched → unclassified
- */
-function combineClassifications(keyword, ceiling) {
-    if (keyword && ceiling) {
-        if (keyword.mode === ceiling.mode) {
-            // Both agree — boost confidence
-            return {
-                confidence: "high",
-                mode: keyword.mode,
-                source: "keyword+ceiling",
-            };
-        }
-        // Disagree — prefer keyword (it has more signal)
-        return keyword;
-    }
-    if (keyword)
-        return keyword;
-    if (ceiling)
-        return ceiling;
-    return { confidence: "low", mode: "unclassified", source: "keyword" };
-}
 /**
  * Resolve area name from a task ID or description.
  *
@@ -310,15 +346,22 @@ function resolveArea(taskId, scoreByArea) {
     }
     return undefined;
 }
-/** Find the most common failure mode */
+/**
+ * Find the most common failure mode in the per-area `modes` record.
+ *
+ * Iterates every key in the record (CR-02 — record is open-set since
+ * Plan 03-02 introduced per-dimension extensions) and picks the
+ * highest-count classified mode. Falls back to "unclassified" when
+ * the area has no classified emissions at all.
+ */
 function findTopMode(modes) {
     let topMode = "unclassified";
     let topCount = 0;
-    for (const mode of ALL_MODES) {
+    for (const [mode, count] of Object.entries(modes)) {
         if (mode === "unclassified")
             continue; // Prefer classified modes
-        if ((modes[mode] ?? 0) > topCount) {
-            topCount = modes[mode];
+        if (count > topCount) {
+            topCount = count;
             topMode = mode;
         }
     }
@@ -327,19 +370,33 @@ function findTopMode(modes) {
         return "unclassified";
     return topMode;
 }
-/** Initialize mode counts to zero */
+/**
+ * Initialize the per-area / per-summary mode-count record.
+ *
+ * Pre-allocates buckets for the legacy literacy modes (CR-02 — keeps
+ * stable presence for downstream consumers like Studio columns) and
+ * leaves per-dimension extensions to be added on first emission.
+ */
 function initModeCounts() {
-    return {
-        "api-error": 0,
-        "incorrect-docs": 0,
-        "missing-docs": 0,
-        "model-limitation": 0,
-        "outdated-docs": 0,
-        "poor-structure": 0,
-        unclassified: 0,
-    };
+    const counts = {};
+    for (const mode of LEGACY_FAILURE_MODES) {
+        counts[mode] = 0;
+    }
+    return counts;
+}
+/** Stable display order for the summary tables — legacy first, then extensions. */
+function orderedSummaryKeys(summary) {
+    const legacy = LEGACY_FAILURE_MODES.filter((m) => m in summary);
+    const extensions = Object.keys(summary)
+        .filter((m) => !isLegacyMode(m))
+        .sort((a, b) => (summary[b] ?? 0) - (summary[a] ?? 0));
+    return [...legacy, ...extensions];
+}
+const LEGACY_MODE_SET = new Set(LEGACY_FAILURE_MODES);
+function isLegacyMode(mode) {
+    return LEGACY_MODE_SET.has(mode);
 }
-/** Get icon for a failure mode */
+/** Get icon for a failure mode — legacy modes have dedicated icons; extensions fall back to a neutral marker. */
 function modeIcon(mode) {
     switch (mode) {
         case "api-error":
@@ -356,5 +413,9 @@ function modeIcon(mode) {
             return "🏗️";
         case "unclassified":
             return "❓";
+        default:
+            // Per-dimension extensions (Plan 03-02) — neutral icon, the mode
+            // name in the table still identifies the family.
+            return "•";
     }
 }

package/dist/pipeline/map-request-to-config.js CHANGED Viewed

@@ -54,6 +54,7 @@ export function mapRequestToConfig(request, rootDir) {
         noRemoteCache: request.noRemoteCache ?? false,
         graderContext: request.graderContext,
         graderReplications: request.graderReplications,
+        borderlineReplications: request.borderlineReplications,
         urls: request.urls,
         headers: request.headers,
         allowedOrigins: request.allowedOrigins,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "4.6.0",
+  "version": "6.0.0",
   "private": false,
   "publishConfig": {
     "access": "public"
@@ -52,15 +52,16 @@
     "@types/js-yaml": "^4.0.9",
     "@types/node": "^22.13.1",
     "nock": "^14.0.13",
+    "simple-statistics": "7.8.9",
     "tsx": "^4.19.2",
     "typescript": "^5.7.3",
     "vitest": "^4.1.5",
-    "@sanity/ailf-shared": "0.1.0",
-    "@sanity/ailf-core": "0.1.0"
+    "@sanity/ailf-core": "0.1.0",
+    "@sanity/ailf-shared": "0.1.0"
   },
   "scripts": {
-    "build": "tsc && tsx scripts/bundle-workspace-deps.ts",
-    "generate-configs": "tsx src/cli.ts generate-configs",
+    "build": "tsc && tsc -p tsconfig.scripts.json && tsx scripts/bundle-workspace-deps.ts",
+    "generate-configs": "tsx src/cli.ts generate-configs && tsx scripts/generate-diagnosis-config.ts",
     "fetch-docs": "tsx src/cli.ts fetch-docs",
     "measure-retrieval": "tsx src/cli.ts measure-retrieval",
     "eval": "tsx src/cli.ts eval",
@@ -77,6 +78,7 @@
     "pipeline": "tsx src/cli.ts pipeline",
     "validate": "tsx src/cli.ts validate config",
     "test": "vitest run",
+    "test:compiler": "AILF_E2E=1 vitest run src/pipeline/compiler/__tests__",
     "test:e2e": "AILF_E2E=1 vitest run src/__tests__/e2e",
     "test:e2e:adapters": "AILF_E2E=1 vitest run src/adapters",
     "test:e2e:api": "AILF_E2E_API=1 vitest run src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/gcs-artifact-writer-roundtrip.test.ts",