npm - @sanity/ailf - Versions diffs - 7.1.0 → 7.2.0 - Mend

@sanity/ailf 7.1.0 → 7.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/index.js +4 -0
package/dist/_vendor/ailf-core/schemas/report.d.ts +11 -0
package/dist/_vendor/ailf-core/schemas/report.js +14 -0
package/dist/_vendor/ailf-core/schemas/user.d.ts +22 -0
package/dist/_vendor/ailf-core/schemas/user.js +23 -0
package/dist/_vendor/ailf-core/types/index.d.ts +29 -0
package/dist/_vendor/ailf-core/types/index.js +13 -0
package/dist/_vendor/ailf-core/types/user.d.ts +49 -0
package/dist/_vendor/ailf-core/types/user.js +1 -0
package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
package/dist/_vendor/ailf-shared/document-ref.js +23 -1
package/dist/_vendor/ailf-shared/generated/help-content.js +26 -14
package/dist/_vendor/ailf-shared/index.d.ts +1 -1
package/dist/_vendor/ailf-shared/index.js +1 -0
package/dist/_vendor/ailf-shared/owner-teams.js +19 -6
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +2 -2
package/dist/adapters/task-sources/content-lake-task-source.js +12 -7
package/dist/orchestration/steps/compute-attribution-step.d.ts +2 -2
package/dist/orchestration/steps/compute-attribution-step.js +17 -2
package/dist/orchestration/steps/gap-analysis-step.d.ts +2 -2
package/dist/orchestration/steps/gap-analysis-step.js +29 -10
package/dist/orchestration/steps/publish-report-step.d.ts +15 -1
package/dist/orchestration/steps/publish-report-step.js +63 -6
package/dist/pipeline/calculate-scores.d.ts +13 -1
package/dist/pipeline/calculate-scores.js +125 -22
package/dist/pipeline/enrichment-preconditions.d.ts +52 -0
package/dist/pipeline/enrichment-preconditions.js +84 -0
package/dist/pipeline/extract-grader-judgments-resilient.d.ts +88 -0
package/dist/pipeline/extract-grader-judgments-resilient.js +122 -0
package/dist/report-store.d.ts +1 -0
package/dist/report-store.js +2 -0
package/dist/sanity/queries.d.ts +1 -1
package/dist/sanity/queries.js +1 -0
package/dist/sources.js +40 -2
package/package.json +1 -1

package/dist/orchestration/steps/gap-analysis-step.js CHANGED

@@ -18,6 +18,7 @@ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from
 import { join, resolve } from "path";
 import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
 import { emitFileContents } from "../../artifact-capture/emit-file.js";
+import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
 export class GapAnalysisStep {
     name = "gap-analysis";
     optional = true;
@@ -34,12 +35,29 @@ export class GapAnalysisStep {
         }
         return [];
     }
-    async execute(ctx) {
+    async execute(ctx, state) {
         const root = ctx.config.rootDir;
         const start = Date.now();
         const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
         const scoreSummaryPath = resolve(root, "results", "latest", "score-summary.json");
-        if (!existsSync(judgmentsPath)) {
+        // Distinguish a legitimate skip (no graded eval ran this pipeline) from a
+        // degraded run where a full eval scored tests but no judgments persisted.
+        // The latter must fail loud — returning a benign `skipped` is what let
+        // reports publish with a score but no test details.
+        //
+        // A remote cache hit restores score-summary.json (with testCount) from a
+        // prior report but never writes grader-judgments.json, so judgments are
+        // legitimately absent — that is a benign skip, not a degraded full eval.
+        const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
+        const inputs = classifyEnrichmentInputs(root);
+        if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
+            return {
+                durationMs: Date.now() - start,
+                status: "failed",
+                error: degradedEnrichmentError("gap-analysis", inputs.scoredTestCount),
+            };
+        }
+        if (inputs.kind !== "ready") {
             return {
                 status: "skipped",
                 reason: "No grader-judgments.json — run a full evaluation first",
@@ -82,14 +100,15 @@ export class GapAnalysisStep {
                 const resolveRefs = (slugs) => slugs
                     .map((slug) => {
                     const m = refBySlug.get(slug);
-                    return m
-                        ? {
-                            documentId: m._id,
-                            revision: m._rev,
-                            slug: m.slug,
-                            title: m.title,
-                        }
-                        : { documentId: "", slug, title: slug };
+                    if (!m)
+                        return { documentId: "", slug, title: slug };
+                    return {
+                        documentId: m._id,
+                        revision: m._rev,
+                        slug: m.slug,
+                        ...(m.path ? { path: m.path } : {}),
+                        title: m.title,
+                    };
                 })
                     .filter((r) => r.documentId !== "");
                 // ── Build description→docs mapping from TaskSource ─────────

package/dist/orchestration/steps/publish-report-step.d.ts CHANGED

@@ -10,7 +10,7 @@
  * - P5: Local-first (pipeline never fails because of a store write)
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
  */
-import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ReportDegradation, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 import { type ProvenanceInput } from "../../pipeline/provenance.js";
 export declare class PublishReportStep implements PipelineStep {
     private readonly pipelineStart;
@@ -25,6 +25,20 @@ export declare class PublishReportStep implements PipelineStep {
     check(): ValidationIssue[];
     execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
 }
+/**
+ * Detect whether a report should publish as degraded.
+ *
+ * The symptom is a scored run whose per-test details never landed: a full
+ * eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
+ * absent because gap-analysis skipped or failed. Such a report renders an
+ * empty "no tests" state in Studio despite carrying a score. Returns the
+ * marker enumerating which enrichment surfaces are missing, or `undefined`
+ * for a healthy report (or a run with no scored tests, where an empty report
+ * is legitimate).
+ *
+ * Exported for unit testing — production callers reach it via execute().
+ */
+export declare function detectReportDegradation(summary: ScoreSummary): ReportDegradation | undefined;
 /**
  * Assemble provenance input from the score summary and pipeline context.
  *

package/dist/orchestration/steps/publish-report-step.js CHANGED

@@ -110,9 +110,15 @@ export class PublishReportStep {
         // agentBehavior arrays) point at their external artifacts via
         // `id = manifestEntryKey`; Studio hydrates on drill-down.
         const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
+        // Degraded-report detection (the "no tests on a scored report" symptom):
+        // a full eval scored tests but the gap-analysis enrichment never landed.
+        // Computed from the full summary read above — independent of which
+        // upstream step skipped — so the marker fires regardless of the cause.
+        const degraded = detectReportDegradation(summary);
         const report = {
             comparison: comparison ?? undefined,
             completedAt: now,
+            ...(degraded ? { degraded } : {}),
             durationMs,
             id: reportId,
             provenance,
@@ -192,6 +198,45 @@ export class PublishReportStep {
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
+/**
+ * Detect whether a report should publish as degraded.
+ *
+ * The symptom is a scored run whose per-test details never landed: a full
+ * eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
+ * absent because gap-analysis skipped or failed. Such a report renders an
+ * empty "no tests" state in Studio despite carrying a score. Returns the
+ * marker enumerating which enrichment surfaces are missing, or `undefined`
+ * for a healthy report (or a run with no scored tests, where an empty report
+ * is legitimate).
+ *
+ * Exported for unit testing — production callers reach it via execute().
+ */
+export function detectReportDegradation(summary) {
+    const scoredTestCount = (summary.scores ?? []).reduce((n, s) => n + (typeof s.testCount === "number" ? s.testCount : 0), 0);
+    const hasTestResults = (summary.testResults?.length ?? 0) > 0;
+    if (scoredTestCount === 0 || hasTestResults)
+        return undefined;
+    // `testResults` is the load-bearing signal (its absence is the rendered
+    // "no tests" symptom). The remaining fields are best-effort detail: some
+    // are literacy-only (e.g. documentManifest), so they may appear here for a
+    // degraded non-literacy run even though that mode never produces them.
+    const missing = ["testResults"];
+    if (!summary.failureModes)
+        missing.push("failureModes");
+    if (!summary.lowScoringJudgments?.length)
+        missing.push("lowScoringJudgments");
+    if (!summary.documentManifest?.length)
+        missing.push("documentManifest");
+    if (!summary.recommendations)
+        missing.push("recommendations");
+    return {
+        reason: "enrichment-missing",
+        missing,
+        detail: `Evaluation scored ${scoredTestCount} test(s) but enrichment did not ` +
+            `complete; per-test details and failure analysis are unavailable for ` +
+            `this report.`,
+    };
+}
 /**
  * Assemble provenance input from the score summary and pipeline context.
  *
@@ -214,20 +259,32 @@ export function buildProvenanceInput(summary, ctx, options, autoScope) {
     //      summary.source undefined). Without this fallback, the report
     //      reads "production" regardless of what the dashboard sent.
     //   3. "production" — last-resort built-in default.
-    if (summary.source?.name === undefined && ctx.config.source) {
+    //
+    // Per-field fallbacks (dataset/projectId/perspective) only fire when
+    // `summary.source` itself is absent — i.e. the loadSource throw was
+    // swallowed. When summary.source is present, trust what the fetch
+    // actually used; papering over a missing `perspective` from
+    // `ctx.config.perspectiveOverride` makes provenance claim a release
+    // was used when it wasn't (W0295).
+    const sourceResolved = summary.source?.name !== undefined;
+    if (!sourceResolved && ctx.config.source) {
         ctx.logger.warn(`[publish-report] summary.source is missing; falling back to ctx.config.source="${ctx.config.source}" for provenance.source.name`);
     }
     const source = {
         baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
-        dataset: summary.source?.dataset ?? ctx.config.datasetOverride ?? "next",
+        dataset: sourceResolved
+            ? (summary.source.dataset ?? "next")
+            : (ctx.config.datasetOverride ?? "next"),
         documentIds: [],
         llmsTxt: (summary.source?.baseUrl ?? "https://www.sanity.io/docs") + "/llms.txt",
         name: summary.source?.name ?? ctx.config.source ?? "production",
-        perspective: summary.source?.perspective ??
-            ctx.config.perspectiveOverride ??
-            undefined,
+        perspective: sourceResolved
+            ? summary.source.perspective
+            : (ctx.config.perspectiveOverride ?? undefined),
         priorityDomain: "sanity.io",
-        projectId: summary.source?.projectId ?? ctx.config.projectIdOverride ?? "3do82whm",
+        projectId: sourceResolved
+            ? summary.source.projectId
+            : (ctx.config.projectIdOverride ?? "3do82whm"),
         studioOrigin: "https://admin.sanity.io",
         urls: [],
     };

package/dist/pipeline/calculate-scores.d.ts CHANGED

@@ -187,6 +187,13 @@ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudg
  * @param manifestSlugs - All slugs in the run's document manifest.
  */
 export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
+/**
+ * Per-variant scoring profiles passed to {@link extractStoredTestResults}.
+ * Each profile maps dimension id → weight. Variants whose dimensions don't
+ * intersect the supplied keys yield `compositeScore: undefined` rather than
+ * a misleading 0.
+ */
+export type StoredTestResultProfiles = Partial<Record<"gold" | "baseline", Record<string, number>>>;
 /**
  * Extract per-test results with model output from evaluation results.
  *
@@ -194,9 +201,14 @@ export declare function populateHallucinationFields(judgments: GraderJudgment[],
  * shape including response.output (truncated), latency, and cost.
  * One StoredTestResult per test × model combination.
  *
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
+ * the weighted mean of its dimension scores using the profile matching its
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
+ * behavior preserved.
+ *
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
  */
-export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
+export declare function extractStoredTestResults(resultsPath: string, profiles?: StoredTestResultProfiles): StoredTestResult[];
 /**
  * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
  * resolver-health summary. Returns `undefined` when the run had no

package/dist/pipeline/calculate-scores.js CHANGED

@@ -41,6 +41,7 @@ import { resolveProfile } from "./profile-resolution.js";
 import { loadSource } from "../sources.js";
 import { LiteracyVariant } from "./normalize-mode.js";
 import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
+import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
 // Re-export from core for backward compatibility.
 // Existing imports from this file continue to work unchanged.
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -321,6 +322,54 @@ export function extractGraderJudgments(resultsPath, telemetry) {
     }
     return judgments;
 }
+/**
+ * Light parse of a results file's entry count — diagnostics only. Avoids the
+ * full normalize + debug logging of `readAndNormalizeResults`. Returns 0 when
+ * the file is missing or unparseable.
+ */
+function countResultEntries(resultsPath) {
+    try {
+        const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
+        const wrapper = file.results ?? file;
+        return Array.isArray(wrapper.results) ? wrapper.results.length : 0;
+    }
+    catch {
+        return 0;
+    }
+}
+/**
+ * Count classifiable llm-rubric components in a results file — i.e. the number
+ * of judgments a healthy `extractGraderJudgments` should produce. Used only to
+ * set the severity of a persistent-empty extraction: a file with classifiable
+ * components but 0 extracted judgments is an error; a file with none (all
+ * api-errors / no llm-rubric) is a benign empty.
+ *
+ * Deliberately an independent count path (not `extractGraderJudgments`) so the
+ * cross-check is meaningful. Returns 0 when the file is missing or unparseable.
+ */
+function countClassifiableRubricComponents(resultsPath) {
+    if (!existsSync(resultsPath))
+        return 0;
+    let n = 0;
+    for (const result of readAndNormalizeResults(resultsPath)) {
+        for (const comp of result.gradingResult.componentResults) {
+            if (comp.assertion?.type === "llm-rubric" && classifyRubric(comp)) {
+                n += 1;
+            }
+        }
+    }
+    return n;
+}
+/**
+ * Shared dependency bundle for `extractGraderJudgmentsResilient` — wires the
+ * real extractor + fs counters. Defined once so all persist sites self-heal
+ * identically.
+ */
+const resilientJudgmentDeps = {
+    countClassifiable: countClassifiableRubricComponents,
+    countResults: countResultEntries,
+    extract: extractGraderJudgments,
+};
 /**
  * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
  * triple and increment `GraderReliability.failureModeCalibration` whenever
@@ -469,6 +518,26 @@ export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlu
  * `responseOutputTruncated` still flips for the extreme tail.
  */
 const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
+/**
+ * Weighted mean of dimension scores. Mirrors the dashboard's read-side
+ * fallback in `apps/dashboard/src/data/projections/test-entries.ts` so writer
+ * and reader stay aligned. Returns `undefined` when no dimension matches the
+ * profile (caller decides whether that signals misconfiguration).
+ */
+function computeStoredCompositeScore(dimensions, weights) {
+    let weighted = 0;
+    let totalWeight = 0;
+    for (const dim of dimensions) {
+        const w = weights[dim.dimension];
+        if (w === undefined)
+            continue;
+        weighted += dim.score * w;
+        totalWeight += w;
+    }
+    if (totalWeight === 0)
+        return undefined;
+    return Math.round(weighted / totalWeight);
+}
 /**
  * Extract per-test results with model output from evaluation results.
  *
@@ -476,9 +545,14 @@ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
  * shape including response.output (truncated), latency, and cost.
  * One StoredTestResult per test × model combination.
  *
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
+ * the weighted mean of its dimension scores using the profile matching its
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
+ * behavior preserved.
+ *
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
  */
-export function extractStoredTestResults(resultsPath) {
+export function extractStoredTestResults(resultsPath, profiles) {
     const results = readAndNormalizeResults(resultsPath);
     const testResults = [];
     for (const result of results) {
@@ -523,8 +597,13 @@ export function extractStoredTestResults(resultsPath) {
             dimensions.push({ dimension, reason, score });
         }
         const tokenUsage = result.response?.tokenUsage;
+        const profileForVariant = profiles?.[variant];
+        const compositeScore = profileForVariant
+            ? computeStoredCompositeScore(dimensions, profileForVariant)
+            : undefined;
         testResults.push({
             area,
+            ...(compositeScore !== undefined && { compositeScore }),
             cost: result.cost || undefined,
             dimensions,
             latencyMs: result.latencyMs,
@@ -1464,7 +1543,7 @@ export async function calculateAndWriteScores(options) {
         writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
         log.info("Score summary written to results/latest/score-summary.json");
         // Extract and persist grader judgments
-        const judgments = extractGraderJudgments(baselineResultsPath);
+        const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
         const borderlineConsistency = await runBorderlinePass(judgments, [
             baselineResultsPath,
         ]);
@@ -1477,7 +1556,12 @@ export async function calculateAndWriteScores(options) {
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
         // Extract and persist per-test results (D0029: model output + metadata)
-        const testResults = extractStoredTestResults(baselineResultsPath);
+        // Agent-harness produces a single profile shared across detected variants
+        // (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
+        const testResults = extractStoredTestResults(baselineResultsPath, {
+            gold: agentProfile,
+            baseline: agentProfile,
+        });
         if (testResults.length > 0) {
             writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
             log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1522,7 +1606,7 @@ export async function calculateAndWriteScores(options) {
         mkdirSync(outDir, { recursive: true });
         writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
         log.info("Score summary written to results/latest/score-summary.json");
-        const judgments = extractGraderJudgments(baselineResultsPath);
+        const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
         const borderlineConsistency = await runBorderlinePass(judgments, [
             baselineResultsPath,
         ]);
@@ -1534,7 +1618,13 @@ export async function calculateAndWriteScores(options) {
             writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
-        const testResults = extractStoredTestResults(baselineResultsPath);
+        // Knowledge-probe deletes vars.docs in the compiler, so every entry's
+        // detected variant is "baseline" — supply the probe profile under both
+        // keys so the composite is populated regardless of detection.
+        const testResults = extractStoredTestResults(baselineResultsPath, {
+            gold: probeProfile,
+            baseline: probeProfile,
+        });
         if (testResults.length > 0) {
             writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
             log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1548,9 +1638,15 @@ export async function calculateAndWriteScores(options) {
     // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
     const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
     const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
+    // Hoisted so the post-scoring extractStoredTestResults call against the
+    // agentic results file can attach the matching profile (W0291).
+    const agenticProfile = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
+        ? resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC)
+        : undefined;
     log.debug("Loaded scoring profiles", {
         gold: goldProfile,
         baseline: baselineProfileWeights,
+        ...(agenticProfile && { agentic: agenticProfile }),
     });
     const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
     log.debug("Baseline scores calculated", {
@@ -1577,7 +1673,8 @@ export async function calculateAndWriteScores(options) {
     let evaluationMode;
     if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         log.info(`\nReading agentic results from: ${agenticResultsPath}`);
-        const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
+        // Non-null assertion safe — the outer guard hoisting agenticProfile uses
+        // the same condition; if we entered this block, the profile was resolved.
         const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
         log.debug("Agentic scores calculated", {
             featureCount: Object.keys(agenticScores).length,
@@ -1639,18 +1736,14 @@ export async function calculateAndWriteScores(options) {
     // the ceiling-cross-check disagreement counter (`failureModeCalibration`)
     // is incremented during the post-extraction validation pass below.
     const reliability = { graderModel: "unknown" };
-    const judgments = extractGraderJudgments(baselineResultsPath, {
-        reliability,
-        ...(options.runId ? { runId: options.runId } : {}),
-    });
-    // In full mode, also extract judgments from agentic results
-    if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
-        const agenticJudgments = extractGraderJudgments(agenticResultsPath, {
-            reliability,
-            ...(options.runId ? { runId: options.runId } : {}),
-        });
-        judgments.push(...agenticJudgments);
-    }
+    // Extract through the resilient wrapper so an empty result from the transient
+    // read anomaly is instrumented and self-healed rather than silently skipping
+    // the grader-judgments persist. In full mode both the baseline and agentic
+    // result files are graded against the shared telemetry.
+    const judgmentResultPaths = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
+        ? [baselineResultsPath, agenticResultsPath]
+        : [baselineResultsPath];
+    const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
     // Borderline-consensus pass — re-grade the ±5 borderline subset N times
     // and merge medians back into the canonical judgments BEFORE
     // `validateGraderJudgmentsCalibration` runs, so the calibration counter
@@ -1681,11 +1774,21 @@ export async function calculateAndWriteScores(options) {
             });
         }
     }
-    // Extract and persist per-test results (D0029: model output + metadata)
-    const testResults = extractStoredTestResults(baselineResultsPath);
-    // In full mode, also extract test results from agentic results
+    // Extract and persist per-test results (D0029: model output + metadata).
+    // Literacy gold (with-docs) entries score against the default profile;
+    // baseline (without-docs) entries score against the output-only profile.
+    const testResults = extractStoredTestResults(baselineResultsPath, {
+        gold: goldProfile,
+        baseline: baselineProfileWeights,
+    });
+    // In full mode, also extract test results from agentic results — the
+    // agentic file's gold entries score against the agentic profile while
+    // baseline entries (if any leak through) still use the literacy baseline.
     if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
-        const agenticTestResults = extractStoredTestResults(agenticResultsPath);
+        const agenticTestResults = extractStoredTestResults(agenticResultsPath, {
+            gold: agenticProfile,
+            baseline: baselineProfileWeights,
+        });
         testResults.push(...agenticTestResults);
     }
     if (testResults.length > 0) {

package/dist/pipeline/enrichment-preconditions.d.ts ADDED

@@ -0,0 +1,52 @@
+/**
+ * pipeline/enrichment-preconditions.ts
+ *
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
+ *
+ * The degraded case is the failure these steps must stop swallowing:
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
+ * and the report ships with no test details while still showing a score.
+ * Distinguishing the two is what lets the steps fail loud instead of returning
+ * a benign `skipped`.
+ */
+/**
+ * Outcome of classifying the enrichment inputs under `results/latest/`.
+ *
+ * - `ready` — `grader-judgments.json` is present and non-empty; enrichment
+ *   can run.
+ * - `no-full-eval` — no graded eval produced judgments this run. A legitimate
+ *   skip: standalone gap-analysis on cached results, a non-graded run, or an
+ *   eval that scored nothing.
+ * - `judgments-missing-after-eval` — a full eval scored tests
+ *   (`score-summary.json` carries `testCount > 0`) yet `grader-judgments.json`
+ *   is missing or empty. This is the degraded condition the steps surface.
+ */
+export type EnrichmentInputs = {
+    kind: "ready";
+    judgmentCount: number;
+} | {
+    kind: "no-full-eval";
+} | {
+    kind: "judgments-missing-after-eval";
+    scoredTestCount: number;
+};
+/**
+ * Classify the enrichment inputs for a run by inspecting
+ * `results/latest/grader-judgments.json` and `score-summary.json`.
+ *
+ * Pure read-only filesystem inspection — never throws on malformed input; a
+ * file that does not parse to the expected shape is treated as absent so that
+ * "no usable judgments" and "no usable summary" both collapse to a single
+ * branch.
+ */
+export declare function classifyEnrichmentInputs(rootDir: string): EnrichmentInputs;
+/**
+ * Build the fail-loud error message for the degraded
+ * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
+ * pipeline-result and job-document surfaces carry one consistent wording.
+ */
+export declare function degradedEnrichmentError(step: string, scoredTestCount: number): string;

package/dist/pipeline/enrichment-preconditions.js ADDED

@@ -0,0 +1,84 @@
+/**
+ * pipeline/enrichment-preconditions.ts
+ *
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
+ *
+ * The degraded case is the failure these steps must stop swallowing:
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
+ * and the report ships with no test details while still showing a score.
+ * Distinguishing the two is what lets the steps fail loud instead of returning
+ * a benign `skipped`.
+ */
+import { existsSync, readFileSync } from "node:fs";
+import { resolve } from "node:path";
+/**
+ * Classify the enrichment inputs for a run by inspecting
+ * `results/latest/grader-judgments.json` and `score-summary.json`.
+ *
+ * Pure read-only filesystem inspection — never throws on malformed input; a
+ * file that does not parse to the expected shape is treated as absent so that
+ * "no usable judgments" and "no usable summary" both collapse to a single
+ * branch.
+ */
+export function classifyEnrichmentInputs(rootDir) {
+    const judgmentCount = countGraderJudgments(rootDir);
+    if (judgmentCount > 0) {
+        return { kind: "ready", judgmentCount };
+    }
+    const scoredTestCount = scoredTestCountFromSummary(rootDir);
+    if (scoredTestCount > 0) {
+        return { kind: "judgments-missing-after-eval", scoredTestCount };
+    }
+    return { kind: "no-full-eval" };
+}
+/**
+ * Build the fail-loud error message for the degraded
+ * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
+ * pipeline-result and job-document surfaces carry one consistent wording.
+ */
+export function degradedEnrichmentError(step, scoredTestCount) {
+    return (`${step}: grader-judgments.json missing after a full eval — ` +
+        `${scoredTestCount} test(s) scored but 0 grader judgments persisted. ` +
+        `The report is marked degraded rather than published as healthy.`);
+}
+/**
+ * Count the judgments in `grader-judgments.json`. Returns 0 when the file is
+ * absent, unreadable, not valid JSON, or not an array — every "no usable
+ * judgments" shape collapses to 0 so callers branch on a single number. An
+ * empty array is therefore indistinguishable from a missing file by design
+ * (both are "no judgments persisted").
+ */
+function countGraderJudgments(rootDir) {
+    const path = resolve(rootDir, "results", "latest", "grader-judgments.json");
+    if (!existsSync(path))
+        return 0;
+    try {
+        const parsed = JSON.parse(readFileSync(path, "utf-8"));
+        return Array.isArray(parsed) ? parsed.length : 0;
+    }
+    catch {
+        return 0;
+    }
+}
+/**
+ * Sum the per-area `testCount` from `score-summary.json` — the signal that a
+ * full eval scored tests this run. Returns 0 when the summary is absent,
+ * unreadable, or carries no scored tests.
+ */
+function scoredTestCountFromSummary(rootDir) {
+    const path = resolve(rootDir, "results", "latest", "score-summary.json");
+    if (!existsSync(path))
+        return 0;
+    try {
+        const parsed = JSON.parse(readFileSync(path, "utf-8"));
+        const scores = Array.isArray(parsed.scores) ? parsed.scores : [];
+        return scores.reduce((sum, s) => sum + (typeof s.testCount === "number" ? s.testCount : 0), 0);
+    }
+    catch {
+        return 0;
+    }
+}