npm - @sanity/ailf - Versions diffs - 4.5.0 → 5.0.0 - Mend

@sanity/ailf 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139) hide show

package/canonical/grader-references/agent-harness-tools.yaml +42 -0
package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
package/canonical/grader-references/mcp-server-spec.yaml +51 -0
package/canonical/grader-references/portable-text.yaml +48 -0
package/config/rubrics.ts +38 -2
package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
package/dist/_vendor/ailf-core/examples/index.js +146 -47
package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
package/dist/_vendor/ailf-core/ports/index.js +1 -0
package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/index.js +9 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
package/dist/_vendor/ailf-core/services/index.js +5 -0
package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
package/dist/_vendor/ailf-core/types/attribution.js +18 -0
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
package/dist/_vendor/ailf-core/types/confidence.js +56 -0
package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
package/dist/_vendor/ailf-core/types/index.js +16 -1
package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
package/dist/adapters/api-client/build-request.d.ts +1 -0
package/dist/adapters/api-client/build-request.js +3 -0
package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
package/dist/adapters/attribution/index.d.ts +9 -0
package/dist/adapters/attribution/index.js +8 -0
package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
package/dist/adapters/config-sources/file-config-adapter.js +1 -0
package/dist/adapters/grader-outputs/index.d.ts +10 -0
package/dist/adapters/grader-outputs/index.js +8 -0
package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
package/dist/adapters/grader-outputs/legacy/index.js +10 -0
package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
package/dist/adapters/index.d.ts +3 -0
package/dist/adapters/index.js +4 -0
package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
package/dist/adapters/llm/anthropic-llm-client.js +205 -0
package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
package/dist/adapters/llm/fake-llm-client.js +63 -0
package/dist/adapters/llm/index.d.ts +9 -0
package/dist/adapters/llm/index.js +4 -0
package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
package/dist/adapters/llm/openai-llm-client.js +168 -0
package/dist/adapters/llm/pricing.d.ts +12 -0
package/dist/adapters/llm/pricing.js +8 -0
package/dist/adapters/llm/retry.d.ts +56 -0
package/dist/adapters/llm/retry.js +66 -0
package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
package/dist/adapters/task-sources/repo-schemas.js +19 -2
package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
package/dist/commands/calculate-scores.js +1 -1
package/dist/commands/explain-handler.js +1 -1
package/dist/commands/lookup-doc.d.ts +1 -1
package/dist/commands/lookup-doc.js +3 -3
package/dist/commands/pipeline-action.d.ts +6 -0
package/dist/commands/pipeline-action.js +2 -0
package/dist/commands/remote-pipeline.js +1 -0
package/dist/composition-root.d.ts +59 -1
package/dist/composition-root.js +95 -0
package/dist/config/rubrics.ts +38 -2
package/dist/grader/agent-harness.d.ts +14 -0
package/dist/grader/agent-harness.js +17 -0
package/dist/grader/common.d.ts +17 -0
package/dist/grader/common.js +21 -0
package/dist/grader/index.d.ts +38 -0
package/dist/grader/index.js +75 -0
package/dist/grader/knowledge-probe.d.ts +14 -0
package/dist/grader/knowledge-probe.js +18 -0
package/dist/grader/literacy.d.ts +13 -0
package/dist/grader/literacy.js +17 -0
package/dist/grader/mcp.d.ts +14 -0
package/dist/grader/mcp.js +18 -0
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/build-step-sequence.js +5 -0
package/dist/orchestration/steps/calculate-scores-step.js +23 -1
package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
package/dist/orchestration/steps/compute-attribution-step.js +279 -0
package/dist/orchestration/steps/gap-analysis-step.js +35 -7
package/dist/orchestration/steps/index.d.ts +1 -0
package/dist/orchestration/steps/index.js +1 -0
package/dist/pipeline/attribution.d.ts +15 -0
package/dist/pipeline/attribution.js +18 -9
package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
package/dist/pipeline/borderline-consensus-runner.js +124 -0
package/dist/pipeline/borderline-detector.d.ts +24 -0
package/dist/pipeline/borderline-detector.js +26 -0
package/dist/pipeline/calculate-scores.d.ts +114 -3
package/dist/pipeline/calculate-scores.js +426 -24
package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
package/dist/pipeline/compiler/literacy-bridge.js +35 -17
package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
package/dist/pipeline/compiler/rubric-resolution.js +9 -1
package/dist/pipeline/compute-attribution.d.ts +80 -0
package/dist/pipeline/compute-attribution.js +196 -0
package/dist/pipeline/failure-modes.d.ts +52 -17
package/dist/pipeline/failure-modes.js +178 -117
package/dist/pipeline/map-request-to-config.js +1 -0
package/package.json +6 -4

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -29,9 +29,11 @@
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { join } from "path";
-import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
+import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
 import { calculateCost } from "../agent-observer/pricing.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
+import { GraderJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
+import { validateFailureMode } from "./failure-modes.js";
 import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
 import { checkResultsExist } from "./checks.js";
 import { loadRubricTemplates } from "./rubric-loader.js";
@@ -99,6 +101,54 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
 // ---------------------------------------------------------------------------
 // URL extraction from assertion metadata
 // ---------------------------------------------------------------------------
+/**
+ * Synthesize a Phase 3 GRAD-05 fallback judgment when the strict-schema
+ * parse fails. Required GRAD-02 fields are populated with sensible
+ * defaults; the `judgmentId` is salted with `runId` (when threaded
+ * through) so consumers can dedup the natural composite key
+ * `(taskId, modelId, dimension)` per-run rather than colliding across
+ * re-runs of the same task.
+ *
+ * The branded-id construction lives in `generateJudgmentId` from
+ * `@sanity/ailf-core/branded-ids` — that's the single audited-cast site
+ * for the `as JudgmentId` exit, mirroring how `generateRunId` owns the
+ * `as RunId` cast.
+ *
+ * @param input - Object carrying `taskId`, `modelId`, `dimension`,
+ *   `score` and `reason` (copied through unchanged), `outputFailure`
+ *   (only emitted on the result, as `true`, when truthy) and an optional
+ *   `runId` (forwarded to `generateJudgmentId` only when truthy).
+ * @returns A new judgment object with `failureMode: "unclassified"`,
+ *   empty `subJudgments` / `docCitations` / `hallucinationCheckedAgainst`
+ *   arrays, a low / 0-signal confidence tagged
+ *   `"synthesized-pre-cross-check"`, and `metadata.graderModel` set to
+ *   `"unknown"`.
+ */
+function synthesizeUnparsedJudgment(input) {
+    const { taskId, modelId, dimension, score, reason, outputFailure, runId } = input;
+    return {
+        dimension,
+        modelId,
+        ...(outputFailure && { outputFailure: true }),
+        reason,
+        score,
+        taskId,
+        judgmentId: generateJudgmentId({
+            taskId,
+            modelId,
+            dimension,
+            ...(runId ? { runId } : {}),
+        }),
+        subJudgments: [],
+        docCitations: [],
+        failureMode: "unclassified",
+        confidence: {
+            level: "low",
+            signalsPresent: 0,
+            // Sentinel — IF this leaks downstream past
+            // validateGraderJudgmentsCalibration the tag is obviously a
+            // synthesize-time placeholder rather than a real ceiling-cross-check
+            // result. The validator overwrites it with "ceiling-cross-check"
+            // along the live pipeline path.
+            derivation: "synthesized-pre-cross-check",
+        },
+        hallucinationCheckedAgainst: [],
+        metadata: {
+            graderModel: "unknown",
+            graderJudgmentsVersion,
+        },
+    };
+}
 /**
  * Extract grader judgments (reason text + scores) from evaluation results.
  *
@@ -107,8 +157,15 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
  * assertion produces one GraderJudgment entry.
  *
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
+ *
+ * @param resultsPath - Path to the Promptfoo results JSON file.
+ * @param telemetry - Optional telemetry sink (Plan 03-03). When passed,
+ *   `telemetry.reliability.parseFailures` is incremented each time a
+ *   JSON-parseable reason is rejected by the strict schema, so the live
+ *   pipeline can surface schema drift over time. Reasons that are not JSON
+ *   at all take the same fallback path but are not counted.
+ *   `telemetry.runId` (when supplied) is threaded into synthesized
+ *   fall-back judgment ids so dedup keys are unique per-run.
  */
-export function extractGraderJudgments(resultsPath) {
+export function extractGraderJudgments(resultsPath, telemetry) {
     const results = readAndNormalizeResults(resultsPath);
     const judgments = [];
     for (const result of results) {
@@ -127,36 +184,242 @@ export function extractGraderJudgments(resultsPath) {
                 continue;
             }
             const score = parseRubricScore(comp);
-            // Extract the reason text — the grader's reasoning
-            let reason = comp.reason ?? "";
-            if (reason) {
-                // Try to parse JSON reason to extract the reason field
+            // Extract the reason text — the grader's reasoning. Plan 03-01
+            // (D0045 trust boundary): the inline `JSON.parse + as`-cast at
+            // this site is replaced with `GraderJudgmentSchema.safeParse`
+            // so that grader output flows through a validated schema before
+            // it enters the scoring pipeline. On parse failure we fall to
+            // an `unclassified`-shape Phase 1 judgment built from the raw
+            // reason string — NEVER fall back to the legacy parser (Pitfall
+            // 4: strict and legacy schemas are deliberate siblings, not a
+            // fall-through chain).
+            const reasonRaw = comp.reason ?? "";
+            let parsedJudgment = null;
+            let reason = reasonRaw;
+            if (reasonRaw) {
                 try {
-                    const parsed = JSON.parse(reason);
-                    const obj = parsed;
-                    if (typeof obj.reason === "string") {
-                        ;
-                        ({ reason } = obj);
+                    const candidate = JSON.parse(reasonRaw);
+                    // The strict schema asserts the full GraderJudgment surface.
+                    // Non-object JSON (number, string, null) is swapped for `{}` first, so
+                    // safeParse always sees an object; it is expected to fail on missing fields.
+                    const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
+                    const result = GraderJudgmentSchema.safeParse({
+                        ...candidateObj,
+                        taskId,
+                        modelId,
+                        dimension: kind,
+                    });
+                    if (result.success) {
+                        parsedJudgment = result.data;
+                        reason = result.data.reason;
+                    }
+                    else {
+                        // Parse failure — drop to failureMode='unclassified' below.
+                        // Plan 03-03: increment graderReliability.parseFailures so the
+                        // live pipeline surfaces schema drift over time. When no
+                        // telemetry sink is passed, the increment is a no-op (preserves
+                        // the pre-Plan-03-03 calling shape for any callers that don't
+                        // care about reliability counters).
+                        if (telemetry) {
+                            telemetry.reliability.parseFailures =
+                                (telemetry.reliability.parseFailures ?? 0) + 1;
+                        }
+                        reason = reasonRaw;
                     }
                 }
                 catch {
-                    // Not JSON — use raw reason string
+                    // Not JSON (legacy free-prose) — keep raw reason. The
+                    // unclassified-fall-back path below covers this case.
                 }
             }
             // Also flag synthesized api-error judgments as output failures
             const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
-            judgments.push({
-                dimension: kind,
-                modelId,
-                ...(outputFailure && { outputFailure: true }),
-                reason,
-                score,
-                taskId,
-            });
+            if (parsedJudgment !== null) {
+                // Strict-shape parse succeeded — preserve the parsed structured
+                // surface (additive GRAD-02 fields if the grader emitted them)
+                // and overlay pipeline-side semantics (outputFailure flag, the
+                // post-extraction score, and outer-context fields).
+                //
+                // The grader's emitted `score` from the JSON envelope is
+                // intentionally discarded — `parseRubricScore(comp)` (the
+                // component-level score Promptfoo computed) wins. Phase 3 keeps
+                // this asymmetry intentionally; the GRAD-06 cutover plan
+                // revisits whether the grader-emitted score should win or
+                // surface a tolerance-mismatch parseFailure variant.
+                judgments.push({
+                    ...parsedJudgment,
+                    dimension: kind,
+                    modelId,
+                    taskId,
+                    score,
+                    ...(outputFailure && { outputFailure: true }),
+                    reason,
+                });
+            }
+            else {
+                // Phase 3 GRAD-05 fallback — strict-schema parse failed. The
+                // GRAD-02 surface is now required, so we synthesize sensible
+                // defaults that mark this judgment as unparsed:
+                //   - failureMode: "unclassified" (per per-dimension taxonomy in
+                //     packages/eval/src/grader/common.ts).
+                //   - confidence: low / 0 signals / "synthesized-pre-cross-check"
+                //     (D0049 — sentinel placeholder; overwritten by
+                //     validateGraderJudgmentsCalibration with "ceiling-cross-check"
+                //     along the live pipeline path. The placeholder is a distinct
+                //     tag so the unparsed state is obvious if it leaks downstream).
+                //   - hallucinationCheckedAgainst, subJudgments, docCitations:
+                //     empty arrays.
+                //   - judgmentId: built from taskId, modelId and dimension (plus
+                //     runId when telemetry carries one) so consumers can dedupe on it.
+                judgments.push(synthesizeUnparsedJudgment({
+                    taskId,
+                    modelId,
+                    dimension: kind,
+                    score,
+                    reason,
+                    outputFailure,
+                    ...(telemetry?.runId ? { runId: telemetry.runId } : {}),
+                }));
+            }
         }
     }
     return judgments;
 }
+/**
+ * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
+ * triple and increment `GraderReliability.failureModeCalibration` whenever
+ * the grader's emitted `failureMode` disagrees with the
+ * ceiling-decomposition mode.
+ *
+ * Plan 03-03 — the grader's emitted `failureMode` is the source of truth
+ * for the mode itself (Plan 03-02 per-dimension taxonomies); this pass
+ * stamps confidence based on whether the structural ceiling signal agrees
+ * and surfaces calibration drift as a counter on `GraderReliability`.
+ *
+ * The function mutates `judgments` in place — it overlays
+ * `judgment.confidence` with the ceiling-cross-check stamp. If a judgment
+ * already carries a confidence from the strict-schema parse (Plan 03-01),
+ * the ceiling-cross-check stamp REPLACES it because the validator's
+ * derivation tag is the live-pipeline contract; the parsed-shape
+ * confidence (if emitted by the grader) is preserved on the original
+ * `parsedJudgment` upstream of this site.
+ *
+ * `hallucinationCheckedAgainst` is NOT populated here — `extractGraderJudgments`
+ * does not have access to `task.contextDocs ∪ run.documentManifest` at this
+ * site (the union travels through a separate path in
+ * `gap-analysis-step.ts`'s document-enrichment flow). Plan 03-04 will
+ * couple the doc-union population at the strict-schema flip site so the
+ * field is populated alongside the required-flip.
+ *
+ * Area resolution: `detectFeatureArea(judgment.taskId)` is used when it
+ * returns something other than `"other"` that is also a feature in
+ * `scores`; otherwise the first feature in `scores` (first-seen order)
+ * that `taskId` starts with is used. When neither matches, the ceiling
+ * and floor defaults described under `scores` apply.
+ *
+ * Counting rule: the counter is incremented only when the stamp returned
+ * by `validateFailureMode` has `level === "medium"` and
+ * `signalsPresent === 2`, and the judgment's incoming confidence did not
+ * carry the `"synthesized-pre-cross-check"` derivation.
+ *
+ * @param judgments - Grader judgments produced by `extractGraderJudgments`.
+ * @param scores - Per-area feature scores; `ceilingScore` and `floorScore`
+ *   come from this lookup. Missing areas default to ceiling 100, floor 0
+ *   (preserves the pre-Plan-03-03 fall-back from `buildFailureModeReport`).
+ * @param reliability - `GraderReliability` sink whose
+ *   `failureModeCalibration` counter is incremented on disagreement.
+ * @returns Nothing; `judgments` and `reliability` are updated in place.
+ *
+ * @see docs/decisions/D0005-grader-model-separation.md
+ * @see docs/decisions/D0049-shared-confidence-contract.md
+ */
+export function validateGraderJudgmentsCalibration(judgments, scores, reliability) {
+    const scoreByArea = new Map();
+    for (const score of scores) {
+        scoreByArea.set(score.feature, score);
+    }
+    for (const judgment of judgments) {
+        // Resolve area for ceiling/floor lookup. Mirrors the resolution
+        // strategy used inside buildFailureModeReport but kept local so we
+        // don't widen the failure-modes module surface.
+        const detected = detectFeatureArea(judgment.taskId);
+        let area;
+        if (detected !== "other" && scoreByArea.has(detected)) {
+            area = detected;
+        }
+        else {
+            for (const candidate of scoreByArea.keys()) {
+                if (judgment.taskId.startsWith(candidate)) {
+                    area = candidate;
+                    break;
+                }
+            }
+        }
+        const areaScore = area ? scoreByArea.get(area) : undefined;
+        const ceilingScore = areaScore?.ceilingScore ?? 100;
+        const floorScore = areaScore?.floorScore ?? 0;
+        // Snapshot the pre-overwrite derivation BEFORE replacing
+        // `judgment.confidence` with the ceiling-cross-check stamp. The
+        // synthesized parse-fallback sentinel is the structurally correct
+        // discriminator for "this judgment came from a parse failure, not a
+        // genuine grader emission". Reading it before overwrite preserves
+        // the signal that the overwrite below erases. Equivalent in shape to
+        // a typed sentinel check — derivation is the contract field for
+        // upstream provenance.
+        const isSynthesizedFallback = judgment.confidence?.derivation === "synthesized-pre-cross-check";
+        const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
+        judgment.confidence = stamp;
+        // A calibration miss requires BOTH signals present and disagreeing.
+        // signalsPresent === 1 (grader-only, no ceiling pattern) is NOT a
+        // miss — we have nothing to cross-check against. Folding it in
+        // over-counted whenever classifyByCeiling returned null. Skip the
+        // synthesized parse-fallback judgment whose hard-coded
+        // failureMode: "unclassified" would otherwise increment the counter
+        // every time ceiling decomposition picks ANY classified mode —
+        // that's a parseFailures concern, not a calibration concern. We
+        // detect "synthesized fallback" via the derivation sentinel rather
+        // than `failureMode !== "unclassified"` — the latter under-counts
+        // when a grader genuinely emits "unclassified" as a classification.
+        if (stamp.level === "medium" &&
+            stamp.signalsPresent === 2 &&
+            !isSynthesizedFallback) {
+            reliability.failureModeCalibration =
+                (reliability.failureModeCalibration ?? 0) + 1;
+        }
+    }
+}
+/**
+ * Populate Pitfall #11 hallucination cross-check fields on grader
+ * judgments (Plan 03-04 GRAD-05).
+ *
+ * For each judgment, sets `hallucinationCheckedAgainst` to the union of
+ * (a) the slugs of docs the task declared in `context.docs` and (b) the
+ * run's full document manifest. For each entry in `judgment.docCitations`
+ * that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
+ * that does not appear in either set is a fabrication, not a real
+ * resolvable doc.
+ *
+ * Mutates `judgments` in place. Slug-less citations are left with
+ * `hallucinated` undefined since the lookup key is the slug per the
+ * GraderJudgment domain doc-comment ("slug does not resolve against the
+ * task's contextDocs set").
+ *
+ * Called from `gap-analysis-step` (literacy mode only) after
+ * `descToDocRefs` and `documentManifest` are built — both inputs are
+ * unavailable at extract time. Non-literacy modes skip this step
+ * entirely (the runtime contract has no canonical doc set to check
+ * against).
+ *
+ * Details visible in the implementation:
+ * - The union is rebuilt for every judgment, so each judgment receives
+ *   its own `hallucinationCheckedAgainst` array (manifest slugs first,
+ *   then any task slugs not already present, duplicates removed).
+ * - A judgment whose base description has no entry in `taskDocSlugs` is
+ *   checked against the manifest alone.
+ * - Any previous `hallucinationCheckedAgainst` value is overwritten.
+ * - NOTE(review): `judgment.docCitations` is iterated without a guard,
+ *   so a judgment lacking that array would throw — confirm every
+ *   producer (including judgments reloaded from older files) sets it.
+ *
+ * @param judgments - Grader judgments to enrich. Mutated in place.
+ * @param taskDocSlugs - Map from base task description (the form judgment
+ *   `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
+ *   to the slugs declared in that task's `context.docs`.
+ * @param manifestSlugs - All slugs in the run's document manifest.
+ * @returns Nothing; results are written onto the judgments and citations.
+ */
+export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlugs) {
+    const manifestSet = new Set(manifestSlugs);
+    for (const judgment of judgments) {
+        const baseDesc = judgment.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
+        const taskSlugs = taskDocSlugs.get(baseDesc) ?? [];
+        const union = new Set(manifestSet);
+        for (const slug of taskSlugs)
+            union.add(slug);
+        judgment.hallucinationCheckedAgainst = [...union];
+        for (const citation of judgment.docCitations) {
+            if (typeof citation.slug === "string" && citation.slug.length > 0) {
+                citation.hallucinated = !union.has(citation.slug);
+            }
+        }
+    }
+}
 /**
  * Maximum characters (JS string length, not bytes) to store for model
  * response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
@@ -998,7 +1261,56 @@ export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptio
  */
 // mergeScores — imported from @sanity/ailf-core above
 const CRITICAL_THRESHOLD = 40;
-export function calculateAndWriteScores(options) {
+/**
+ * Build a regrade callback for the borderline runner from the original
+ * Promptfoo results file. Resolves each judgment back to its
+ * `(taskId, modelId, dimension)` component result, then calls the
+ * supplied `regradeOnce` to score the response against the rubric
+ * again. Returns the original judgment score when the lookup misses or
+ * `regradeOnce` resolves to `null`/`undefined` — the runner treats that
+ * as a degenerate replica (the consensus median absorbs it). An error
+ * thrown or rejected by `regradeOnce`, or raised while reading a results
+ * file, is not caught here and propagates to whoever invoked the
+ * callback.
+ *
+ * Only `llm-rubric` components that `classifyRubric` maps to a dimension
+ * are indexed. When several results share a key, the last one read wins.
+ *
+ * NOTE(review): `lookup` is assigned before the files are read, so if a
+ * read throws on the first invocation, later invocations reuse the
+ * partially built map instead of retrying.
+ *
+ * @param input - `resultsPaths`: results files to index on the first
+ *   callback invocation; `regradeOnce(responseText, rubricText)`: awaited
+ *   re-grader whose nullish result falls back to the original score. No
+ *   other property of `input` is read in this function.
+ * @returns Async function mapping a judgment to its regraded score.
+ */
+function buildBorderlineRegrader(input) {
+    // Lazily build the lookup map on first call so we don't pay the
+    // file-read cost when the runner short-circuits on no borderline
+    // judgments. Merges componentResults across every supplied results
+    // file so full-mode (baseline + agentic) judgments resolve to their
+    // original (responseText, rubricText) pair regardless of source file.
+    let lookup = null;
+    return async (judgment) => {
+        if (lookup === null) {
+            lookup = new Map();
+            for (const path of input.resultsPaths) {
+                const results = readAndNormalizeResults(path);
+                for (const result of results) {
+                    const taskId = result.description;
+                    const modelId = result.providerId ?? result.providerLabel ?? "unknown";
+                    for (const comp of result.gradingResult.componentResults) {
+                        if (comp.assertion?.type !== "llm-rubric")
+                            continue;
+                        const dimension = classifyRubric(comp);
+                        if (!dimension)
+                            continue;
+                        const rubricText = typeof comp.assertion.value === "string"
+                            ? comp.assertion.value
+                            : "";
+                        const responseText = result.response?.output ?? "";
+                        lookup.set(`${taskId}::${modelId}::${dimension}`, {
+                            responseText,
+                            rubricText,
+                        });
+                    }
+                }
+            }
+        }
+        const ctx = lookup.get(`${judgment.taskId}::${judgment.modelId}::${judgment.dimension}`);
+        if (!ctx)
+            return judgment.score;
+        const replica = await input.regradeOnce(ctx.responseText, ctx.rubricText);
+        return replica ?? judgment.score;
+    };
+}
+export async function calculateAndWriteScores(options) {
     const ROOT = options.rootDir;
     const log = options.logger ?? new ConsoleLogger();
     const sourceName = options.source;
@@ -1026,6 +1338,44 @@ export function calculateAndWriteScores(options) {
     const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
     // Agentic results path (only used in full mode)
     const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
+    // Borderline-consensus pass — shared across all four scoring modes
+    // (literacy, agent-harness, knowledge-probe, mcp-server). Each mode's
+    // persist site invokes this AFTER `extractGraderJudgments` and BEFORE
+    // `writeFileSync(grader-judgments.json)` so the persisted file carries
+    // the consensus median rather than single-replica scores. Returns the
+    // per-judgment consistency map (or null) so the literacy branch can
+    // also persist `borderline-consistency.json`.
+    const runBorderlinePass = async (judgments, resultsPaths) => {
+        if (judgments.length === 0 ||
+            !options.borderlineConsensusRunner ||
+            !options.borderlineRegradeOnce) {
+            return null;
+        }
+        const regrade = buildBorderlineRegrader({
+            resultsPaths,
+            regradeOnce: options.borderlineRegradeOnce,
+            logger: log,
+        });
+        try {
+            const { judgments: regraded, consistencyByJudgment } = await options.borderlineConsensusRunner({
+                judgments,
+                regrade,
+                logger: log,
+            });
+            // Mutate-in-place so subsequent steps (validateGraderJudgmentsCalibration, persist) see
+            // the consensus scores. NOTE(review): if `regraded` aliases `judgments` this empties it — confirm.
+            judgments.length = 0;
+            judgments.push(...regraded);
+            if (consistencyByJudgment.size > 0) {
+                log.info(`Borderline consensus merged ${consistencyByJudgment.size} judgment(s)`);
+            }
+            return consistencyByJudgment;
+        }
+        catch (err) {
+            log.warn(`Borderline consensus pass failed — falling back to single-replica scores: ${err instanceof Error ? err.message : String(err)}`);
+            return null;
+        }
+    };
     // Validate baseline results file
     const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
     const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
@@ -1079,7 +1429,14 @@ export function calculateAndWriteScores(options) {
         log.info("Score summary written to results/latest/score-summary.json");
         // Extract and persist grader judgments
         const judgments = extractGraderJudgments(baselineResultsPath);
+        const borderlineConsistency = await runBorderlinePass(judgments, [
+            baselineResultsPath,
+        ]);
         if (judgments.length > 0) {
+            if (borderlineConsistency && borderlineConsistency.size > 0) {
+                writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
+                log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
+            }
             writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
@@ -1130,7 +1487,14 @@ export function calculateAndWriteScores(options) {
         writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
         log.info("Score summary written to results/latest/score-summary.json");
         const judgments = extractGraderJudgments(baselineResultsPath);
+        const borderlineConsistency = await runBorderlinePass(judgments, [
+            baselineResultsPath,
+        ]);
         if (judgments.length > 0) {
+            if (borderlineConsistency && borderlineConsistency.size > 0) {
+                writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
+                log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
+            }
             writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
@@ -1232,16 +1596,54 @@ export function calculateAndWriteScores(options) {
     mkdirSync(outDir, { recursive: true });
     writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
     log.info("Score summary written to results/latest/score-summary.json");
-    // Extract and persist grader judgments (Phase 3a: failure mode extraction)
-    const judgments = extractGraderJudgments(baselineResultsPath);
+    // Extract and persist grader judgments (Phase 3a: failure mode extraction).
+    //
+    // Plan 03-03 — wire calibration telemetry. The strict-schema parse
+    // failure counter (`parseFailures`) is incremented during extraction;
+    // the ceiling-cross-check disagreement counter (`failureModeCalibration`)
+    // is incremented during the post-extraction validation pass below.
+    const reliability = { graderModel: "unknown" };
+    const judgments = extractGraderJudgments(baselineResultsPath, {
+        reliability,
+        ...(options.runId ? { runId: options.runId } : {}),
+    });
     // In full mode, also extract judgments from agentic results
     if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
-        const agenticJudgments = extractGraderJudgments(agenticResultsPath);
+        const agenticJudgments = extractGraderJudgments(agenticResultsPath, {
+            reliability,
+            ...(options.runId ? { runId: options.runId } : {}),
+        });
         judgments.push(...agenticJudgments);
     }
+    // Borderline-consensus pass — re-grade the ±5 borderline subset N times
+    // and merge medians back into the canonical judgments BEFORE
+    // `validateGraderJudgmentsCalibration` runs, so the calibration counter
+    // sees the consensus-merged scores rather than single-replica noise.
+    // In full mode, the merged `judgments` array includes entries sourced
+    // from both baseline and agentic result files — pass both paths so the
+    // regrader's lookup map can resolve agentic-sourced judgments back to
+    // their (responseText, rubricText) pair instead of falling through to
+    // the unchanged-score branch.
+    const borderlineConsistency = await runBorderlinePass(judgments, mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
+        ? [baselineResultsPath, agenticResultsPath]
+        : [baselineResultsPath]);
     if (judgments.length > 0) {
+        // Stamp each judgment with the D0049 ceiling-cross-check confidence
+        // triple and accumulate `failureModeCalibration` disagreement counts.
+        validateGraderJudgmentsCalibration(judgments, scores, reliability);
+        if (borderlineConsistency && borderlineConsistency.size > 0) {
+            writeFileSync(join(outDir, "borderline-consistency.json"), JSON.stringify(Object.fromEntries(borderlineConsistency.entries()), null, 2));
+            log.info(`Borderline consistency written to results/latest/borderline-consistency.json (${borderlineConsistency.size} entries)`);
+        }
         writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
         log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
+        if (reliability.parseFailures !== undefined ||
+            reliability.failureModeCalibration !== undefined) {
+            log.debug("Grader reliability telemetry", {
+                parseFailures: reliability.parseFailures ?? 0,
+                failureModeCalibration: reliability.failureModeCalibration ?? 0,
+            });
+        }
     }
     // Extract and persist per-test results (D0029: model output + metadata)
     const testResults = extractStoredTestResults(baselineResultsPath);

package/dist/pipeline/compiler/literacy-bridge.d.ts CHANGED Viewed

@@ -18,7 +18,7 @@
  *
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
  */
-import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
+import { type LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
 import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
 import type { PreflightRubricContext } from "./rubric-resolution.js";
 import { type LiteracyEvalSubMode } from "../normalize-mode.js";

package/dist/pipeline/compiler/literacy-bridge.js CHANGED Viewed

@@ -18,6 +18,7 @@
  *
  * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
  */
+import { RubricConfigSchema, formatZodErrors, } from "../../_vendor/ailf-core/index.js";
 import { compileLiteracyTask, } from "./mode-handlers/literacy/index.js";
 import { tryLoadConfigFile } from "./config-loader.js";
 import { buildTaskGraph } from "./task-graph-builder.js";
@@ -152,23 +153,40 @@ function loadRubricResolutionInput(rootDir) {
     const result = tryLoadConfigFile("rubrics", rootDir);
     if (!result)
         return undefined;
-    try {
-        const parsed = result.data;
-        if (!parsed?.templates)
-            return undefined;
-        const templates = {};
-        for (const [key, val] of Object.entries(parsed.templates)) {
-            const t = val;
-            templates[key] = {
-                header: String(t.header ?? ""),
-                scale: t.scale ?? [],
-                dimension: t.dimension ? String(t.dimension) : undefined,
-                criteria_label: t.criteria_label ? String(t.criteria_label) : undefined,
-            };
-        }
-        return { templates };
+    // Run the canonical RubricConfigSchema (Plan 03-01) at the trust
+    // boundary. Hand-rolled .ailf/rubrics.{yaml,json} configs and test
+    // fixtures previously bypassed validation: tryLoadConfigFile only
+    // ran the format-specific parser, then per-field unsafe casts read
+    // the result. With the strict GraderJudgmentSchema from Plan 03-04,
+    // a missing footer silently became "" and every grader emission
+    // fell through to the synthesized 'unclassified' fallback (CR-03).
+    // Fail loudly with a Zod-formatted message instead so config drift
+    // is caught at load time, not at every grader emission.
+    const parseResult = RubricConfigSchema.safeParse(result.data);
+    if (!parseResult.success) {
+        const lines = formatZodErrors(parseResult.error);
+        throw new Error(`Invalid rubric config at ${result.filePath}:\n${lines.join("\n")}`);
     }
-    catch {
-        return undefined;
+    const parsed = parseResult.data;
+    const templates = {};
+    for (const [key, t] of Object.entries(parsed.templates)) {
+        // Plan 03-02 — thread per-dimension failureModes through to the
+        // runtime prompt assembler when the config stamped the field via
+        // `failureModesForDimension(dimension)`.
+        templates[key] = {
+            header: t.header,
+            scale: t.scale,
+            ...(t.dimension !== undefined ? { dimension: t.dimension } : {}),
+            ...(t.criteria_label !== undefined && t.criteria_label !== null
+                ? { criteria_label: t.criteria_label }
+                : {}),
+            ...(t.failureModes && t.failureModes.length > 0
+                ? { failureModes: t.failureModes }
+                : {}),
+        };
     }
+    // Plan 03-01 — footer is min(1) per RubricConfigSchema, so safeParse
+    // above guarantees a non-empty string here. No defensive empty-string
+    // fallback needed.
+    return { templates, footer: parsed.footer };
 }

package/dist/pipeline/compiler/rubric-resolution.d.ts CHANGED Viewed

@@ -23,7 +23,22 @@ export interface RubricResolutionInput {
         dimension?: string;
         header: string;
         scale: string[];
+        /**
+         * Plan 03-02 — per-dimension legal failure-mode list emitted into the
+         * rubric prompt body before `${rubricConfig.footer}`. Sourced from
+         * `failureModesForDimension(dimension)` in
+         * `packages/eval/src/grader/index.ts` so the grader is told which modes
+         * are legal for this dimension family.
+         */
+        failureModes?: readonly string[];
     }>;
+    /**
+     * Plan 03-01 — formerly a literal in resolveTemplatedAssertion; now sourced
+     * from RubricConfig.footer. The footer documents the target wire
+     * format the grader emits (structured GraderJudgment shape sketch in
+     * Phase 3+; legacy `{score, reason}` JSON pre-Phase-3).
+     */
+    footer: string;
 }
 /**
  * Resolve a templated LLM-rubric assertion into a fully assembled

package/dist/pipeline/compiler/rubric-resolution.js CHANGED Viewed

@@ -54,10 +54,18 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
     const preflightSection = preflightContext && template.dimension === "code-correctness"
         ? buildPreflightSection(preflightContext)
         : "";
+    // Plan 03-02 — when the template carries a per-dimension legal-mode
+    // list, announce it to the grader before the structured-shape footer
+    // (Plan 03-01). Empty list => no announcement (preserves prior wording
+    // for templates that haven't yet been wired to a taxonomy family).
+    const failureModesLine = template.failureModes && template.failureModes.length > 0
+        ? `\nThe "failureMode" must be one of: ${template.failureModes.join(", ")}.\n\n`
+        : "";
     const rubricValue = preflightSection +
         `${template.header}\n${scaleText}\n\n` +
         `${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
-        `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
+        failureModesLine +
+        `${rubricConfig.footer}`;
     const rubricPrompt = canonicalReference
         ? buildRubricPromptWithReference(rubricValue, canonicalReference)
         : undefined;