npm - @sanity/ailf - Versions diffs - 7.0.1 → 7.1.2 - Mend

@sanity/ailf 7.0.1 → 7.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)

package/config/rubrics.ts +12 -13
package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
package/dist/_vendor/ailf-core/schemas/report.js +2 -0
package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
package/dist/_vendor/ailf-core/schemas/team.js +63 -0
package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
package/dist/_vendor/ailf-core/types/team.js +1 -0
package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
package/dist/_vendor/ailf-shared/document-ref.js +23 -1
package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
package/dist/_vendor/ailf-shared/event-types.js +23 -0
package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
package/dist/_vendor/ailf-shared/index.d.ts +5 -3
package/dist/_vendor/ailf-shared/index.js +5 -2
package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
package/dist/_vendor/ailf-shared/member-roles.js +16 -0
package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
package/dist/_vendor/ailf-shared/owner-teams.js +26 -6
package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
package/dist/adapters/task-sources/content-lake-task-source.js +14 -8
package/dist/adapters/task-sources/repo-task-source.js +2 -1
package/dist/commands/pipeline-action.d.ts +4 -3
package/dist/commands/pipeline-action.js +7 -5
package/dist/commands/run.js +2 -2
package/dist/config/rubrics.ts +12 -13
package/dist/job-store.d.ts +18 -0
package/dist/job-store.js +34 -0
package/dist/orchestration/build-app-context.js +8 -1
package/dist/orchestration/pipeline-orchestrator.js +46 -1
package/dist/orchestration/steps/compare-step.d.ts +7 -0
package/dist/orchestration/steps/compare-step.js +59 -23
package/dist/orchestration/steps/fetch-docs-step.js +3 -0
package/dist/orchestration/steps/finalize-run-step.js +2 -0
package/dist/orchestration/steps/gap-analysis-step.js +9 -8
package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
package/dist/orchestration/steps/generate-configs-step.js +47 -13
package/dist/orchestration/steps/grader-consistency-step.js +11 -0
package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
package/dist/orchestration/steps/publish-report-step.js +36 -8
package/dist/pipeline/cache-hit-restore.d.ts +14 -1
package/dist/pipeline/cache-hit-restore.js +17 -0
package/dist/pipeline/calculate-scores.d.ts +13 -1
package/dist/pipeline/calculate-scores.js +123 -29
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
package/dist/pipeline/compiler/provider-assembler.js +16 -3
package/dist/pipeline/failure-modes.d.ts +20 -10
package/dist/pipeline/failure-modes.js +84 -15
package/dist/pipeline/map-request-to-config.js +2 -0
package/dist/pipeline/normalize-mode.d.ts +1 -1
package/dist/pipeline/normalize-mode.js +2 -0
package/dist/pipeline/run-context.d.ts +16 -1
package/dist/pipeline/run-context.js +12 -1
package/dist/pipeline/validate.d.ts +8 -4
package/dist/pipeline/validate.js +8 -18
package/dist/report-store.d.ts +14 -1
package/dist/report-store.js +32 -0
package/dist/sanity/client.js +2 -2
package/dist/sanity/queries.d.ts +1 -1
package/dist/sanity/queries.js +1 -0
package/dist/sources.js +40 -2
package/package.json +1 -1

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -187,6 +187,13 @@ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudg
  * @param manifestSlugs - All slugs in the run's document manifest.
  */
 export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
+/**
+ * Per-variant scoring profiles passed to {@link extractStoredTestResults}.
+ * Each profile maps dimension id → weight. Variants whose dimensions don't
+ * intersect the supplied keys yield `compositeScore: undefined` rather than
+ * a misleading 0.
+ */
+export type StoredTestResultProfiles = Partial<Record<"gold" | "baseline", Record<string, number>>>;
 /**
  * Extract per-test results with model output from evaluation results.
  *
@@ -194,9 +201,14 @@ export declare function populateHallucinationFields(judgments: GraderJudgment[],
  * shape including response.output (truncated), latency, and cost.
  * One StoredTestResult per test × model combination.
  *
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
+ * the weighted mean of its dimension scores using the profile matching its
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
+ * behavior preserved.
+ *
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
  */
-export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
+export declare function extractStoredTestResults(resultsPath: string, profiles?: StoredTestResultProfiles): StoredTestResult[];
 /**
  * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
  * resolver-health summary. Returns `undefined` when the run had no

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -32,7 +32,7 @@ import { join } from "path";
 import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
 import { calculateCost } from "../agent-observer/pricing.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
-import { GraderJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
+import { GraderEmittedJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
 import { validateFailureMode } from "./failure-modes.js";
 import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
 import { checkResultsExist } from "./checks.js";
@@ -184,34 +184,70 @@ export function extractGraderJudgments(resultsPath, telemetry) {
                 continue;
             }
             const score = parseRubricScore(comp);
-            // Extract the reason text — the grader's reasoning. Plan 03-01
-            // (D0045 trust boundary): the inline `JSON.parse + as`-cast at
-            // this site is replaced with `GraderJudgmentSchema.safeParse`
-            // so that grader output flows through a validated schema before
-            // it enters the scoring pipeline. On parse failure we fall to
-            // an `unclassified`-shape Phase 1 judgment built from the raw
-            // reason string — NEVER fall back to the legacy parser (Pitfall
-            // 4: strict and legacy schemas are deliberate siblings, not a
-            // fall-through chain).
+            // Extract the reason text — the grader's reasoning. W0273 splits
+            // the parse boundary into a wire shape (`GraderEmittedJudgmentSchema`
+            // — only fields the LLM controls) and a storage shape
+            // (`GraderJudgmentSchema` — full strict surface). The pipeline
+            // parses against the wire shape, then synthesizes the pipeline-owned
+            // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
+            // hallucinationCheckedAgainst) plus the result-context fields
+            // (taskId, modelId, dimension) to build the full storage shape.
+            //
+            // On parse failure we fall to an `unclassified`-shape Phase 1
+            // judgment built from the raw reason string — NEVER fall back to
+            // the legacy parser (Pitfall 4: strict and legacy schemas are
+            // deliberate siblings, not a fall-through chain).
             const reasonRaw = comp.reason ?? "";
             let parsedJudgment = null;
             let reason = reasonRaw;
             if (reasonRaw) {
                 try {
                     const candidate = JSON.parse(reasonRaw);
-                    // The strict schema asserts the full GraderJudgment surface.
-                    // safeParse handles non-object inputs (number, array, etc.)
-                    // by failing — we don't pre-narrow here.
+                    // The wire schema asserts only the LLM-emit subset. safeParse
+                    // handles non-object inputs (number, array, etc.) by failing —
+                    // we don't pre-narrow here.
                     const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
-                    const result = GraderJudgmentSchema.safeParse({
-                        ...candidateObj,
-                        taskId,
-                        modelId,
-                        dimension: kind,
-                    });
+                    const result = GraderEmittedJudgmentSchema.safeParse(candidateObj);
                     if (result.success) {
-                        parsedJudgment = result.data;
-                        reason = result.data.reason;
+                        const emitted = result.data;
+                        parsedJudgment = {
+                            // Result-context fields — pipeline-supplied:
+                            taskId,
+                            modelId,
+                            dimension: kind,
+                            // Wire-emitted fields — LLM-controlled:
+                            score: emitted.score,
+                            reason: emitted.reason,
+                            failureMode: emitted.failureMode,
+                            subJudgments: emitted.subJudgments,
+                            docCitations: emitted.docCitations,
+                            confidence: emitted.confidence,
+                            ...(emitted.outputFailure && {
+                                outputFailure: emitted.outputFailure,
+                            }),
+                            // Pipeline-owned fields — synthesized:
+                            judgmentId: generateJudgmentId({
+                                taskId,
+                                modelId,
+                                dimension: kind,
+                                ...(telemetry?.runId ? { runId: telemetry.runId } : {}),
+                            }),
+                            // hallucinationCheckedAgainst is filled in later by
+                            // populateHallucinationFields (gap-analysis-step.ts) — it
+                            // needs the run.documentManifest union that isn't visible
+                            // here. Empty array is the documented pre-fill placeholder.
+                            hallucinationCheckedAgainst: [],
+                            metadata: {
+                                // graderModel is threaded via the existing
+                                // telemetry.reliability channel. When upstream wires the
+                                // real grader-provider alias into reliability.graderModel,
+                                // it propagates here automatically; today it's "unknown"
+                                // (matching the pre-W0273 synthesized-fallback default).
+                                graderModel: telemetry?.reliability.graderModel ?? "unknown",
+                                graderJudgmentsVersion,
+                            },
+                        };
+                        reason = emitted.reason;
                     }
                     else {
                         // Parse failure — drop to failureMode='unclassified' below.
@@ -433,6 +469,26 @@ export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlu
  * `responseOutputTruncated` still flips for the extreme tail.
  */
 const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
+/**
+ * Weighted mean of dimension scores. Mirrors the dashboard's read-side
+ * fallback in `apps/dashboard/src/data/projections/test-entries.ts` so writer
+ * and reader stay aligned. Returns `undefined` when no dimension matches the
+ * profile (caller decides whether that signals misconfiguration).
+ */
+function computeStoredCompositeScore(dimensions, weights) {
+    let weighted = 0;
+    let totalWeight = 0;
+    for (const dim of dimensions) {
+        const w = weights[dim.dimension];
+        if (w === undefined)
+            continue;
+        weighted += dim.score * w;
+        totalWeight += w;
+    }
+    if (totalWeight === 0)
+        return undefined;
+    return Math.round(weighted / totalWeight);
+}
 /**
  * Extract per-test results with model output from evaluation results.
  *
@@ -440,9 +496,14 @@ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
  * shape including response.output (truncated), latency, and cost.
  * One StoredTestResult per test × model combination.
  *
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
+ * the weighted mean of its dimension scores using the profile matching its
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
+ * behavior preserved.
+ *
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
  */
-export function extractStoredTestResults(resultsPath) {
+export function extractStoredTestResults(resultsPath, profiles) {
     const results = readAndNormalizeResults(resultsPath);
     const testResults = [];
     for (const result of results) {
@@ -487,8 +548,13 @@ export function extractStoredTestResults(resultsPath) {
             dimensions.push({ dimension, reason, score });
         }
         const tokenUsage = result.response?.tokenUsage;
+        const profileForVariant = profiles?.[variant];
+        const compositeScore = profileForVariant
+            ? computeStoredCompositeScore(dimensions, profileForVariant)
+            : undefined;
         testResults.push({
             area,
+            ...(compositeScore !== undefined && { compositeScore }),
             cost: result.cost || undefined,
             dimensions,
             latencyMs: result.latencyMs,
@@ -1441,7 +1507,12 @@ export async function calculateAndWriteScores(options) {
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
         // Extract and persist per-test results (D0029: model output + metadata)
-        const testResults = extractStoredTestResults(baselineResultsPath);
+        // Agent-harness produces a single profile shared across detected variants
+        // (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
+        const testResults = extractStoredTestResults(baselineResultsPath, {
+            gold: agentProfile,
+            baseline: agentProfile,
+        });
         if (testResults.length > 0) {
             writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
             log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1498,7 +1569,13 @@ export async function calculateAndWriteScores(options) {
             writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
-        const testResults = extractStoredTestResults(baselineResultsPath);
+        // Knowledge-probe deletes vars.docs in the compiler, so every entry's
+        // detected variant is "baseline" — supply the probe profile under both
+        // keys so the composite is populated regardless of detection.
+        const testResults = extractStoredTestResults(baselineResultsPath, {
+            gold: probeProfile,
+            baseline: probeProfile,
+        });
         if (testResults.length > 0) {
             writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
             log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1512,9 +1589,15 @@ export async function calculateAndWriteScores(options) {
     // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
     const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
     const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
+    // Hoisted so the post-scoring extractStoredTestResults call against the
+    // agentic results file can attach the matching profile (W0291).
+    const agenticProfile = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
+        ? resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC)
+        : undefined;
     log.debug("Loaded scoring profiles", {
         gold: goldProfile,
         baseline: baselineProfileWeights,
+        ...(agenticProfile && { agentic: agenticProfile }),
     });
     const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
     log.debug("Baseline scores calculated", {
@@ -1541,7 +1624,8 @@ export async function calculateAndWriteScores(options) {
     let evaluationMode;
     if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         log.info(`\nReading agentic results from: ${agenticResultsPath}`);
-        const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
+        // Non-null assertion safe — the outer guard hoisting agenticProfile uses
+        // the same condition; if we entered this block, the profile was resolved.
         const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
         log.debug("Agentic scores calculated", {
             featureCount: Object.keys(agenticScores).length,
@@ -1645,11 +1729,21 @@ export async function calculateAndWriteScores(options) {
             });
         }
     }
-    // Extract and persist per-test results (D0029: model output + metadata)
-    const testResults = extractStoredTestResults(baselineResultsPath);
-    // In full mode, also extract test results from agentic results
+    // Extract and persist per-test results (D0029: model output + metadata).
+    // Literacy gold (with-docs) entries score against the default profile;
+    // baseline (without-docs) entries score against the output-only profile.
+    const testResults = extractStoredTestResults(baselineResultsPath, {
+        gold: goldProfile,
+        baseline: baselineProfileWeights,
+    });
+    // In full mode, also extract test results from agentic results — the
+    // agentic file's gold entries score against the agentic profile while
+    // baseline entries (if any leak through) still use the literacy baseline.
     if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
-        const agenticTestResults = extractStoredTestResults(agenticResultsPath);
+        const agenticTestResults = extractStoredTestResults(agenticResultsPath, {
+            gold: agenticProfile,
+            baseline: baselineProfileWeights,
+        });
         testResults.push(...agenticTestResults);
     }
     if (testResults.length > 0) {

package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts CHANGED Viewed

@@ -6,7 +6,7 @@
  */
 import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
 import type { PromptfooAssertion } from "../../assertion-mapper.js";
-import type { LiteracyCompileOptions } from "./types.js";
+import type { LiteracyCompileOptions, RubricResolutionInput } from "./types.js";
 export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
 /**
  * Build baseline assertions matching the legacy expand-tasks behavior.
@@ -14,5 +14,10 @@ export declare function resolveAssertions(task: LiteracyTaskDefinition, options:
  * - "full": all assertions carried over
  * - "abbreviated": only first llm-rubric with shortened prompt
  * - "none": no assertions
+ *
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
+ * mode's synthetic rubric. Without it the abbreviated emission would fail
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
+ * subJudgments, docCitations, confidence).
  */
-export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none"): PromptfooAssertion[];
+export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none", rubricConfig?: RubricResolutionInput): PromptfooAssertion[];

package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js CHANGED Viewed

@@ -45,8 +45,10 @@ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalRefere
         return null;
     const template = rubricConfig.templates["doc-coverage"];
     const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
-    const rubricValue = `${template.header}\n${scaleText}\n\n` +
-        `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
+    // W0273 — use the centralized wire-shape footer so the grader emission
+    // parses against GraderEmittedJudgmentSchema. The pre-W0273 short
+    // {score, reason} footer caused 100% parse failures starting 2026-05-11.
+    const rubricValue = `${template.header}\n${scaleText}\n\n` + `${rubricConfig.footer}`;
     // doc-coverage benefits from the same authoritative reference — the grader
     // needs the doc content to judge whether the candidate actually used what
     // was documented.
@@ -92,8 +94,13 @@ function buildDocCoverageRubricPrompt(rubric, reference) {
  * - "full": all assertions carried over
  * - "abbreviated": only first llm-rubric with shortened prompt
  * - "none": no assertions
+ *
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
+ * mode's synthetic rubric. Without it the abbreviated emission would fail
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
+ * subJudgments, docCitations, confidence).
  */
-export function buildBaselineAssertions(goldAssertions, rubricMode) {
+export function buildBaselineAssertions(goldAssertions, rubricMode, rubricConfig) {
     const mode = rubricMode ?? "full";
     if (mode === "none")
         return [];
@@ -106,10 +113,12 @@ export function buildBaselineAssertions(goldAssertions, rubricMode) {
         if (a.type === "llm-rubric") {
             if (!foundFirst) {
                 foundFirst = true;
+                const footer = rubricConfig?.footer ??
+                    'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}';
                 abbreviated.push({
                     type: "llm-rubric",
                     value: "Score task completion from 0 to 100 (same criteria as above).\n" +
-                        'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
+                        footer,
                     ...(a.provider ? { provider: a.provider } : {}),
                 });
             }

package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js CHANGED Viewed

@@ -134,7 +134,7 @@ function buildTestCases(task, evalMode, options, warnings) {
     if (evalMode !== "agentic") {
         const baselineEnabled = task.baseline?.enabled !== false;
         if (baselineEnabled) {
-            const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric);
+            const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric, options?.rubricConfig);
             tests.push({
                 description: `${taskTitle} (baseline)`,
                 vars: {

package/dist/pipeline/compiler/provider-assembler.d.ts CHANGED Viewed

@@ -50,8 +50,15 @@ export interface ModelsAndProviders {
  * Returns provider arrays keyed by literacy variant name (baseline,
  * agentic, observed). These are consumed by the YAML writer to produce
  * the per-variant promptfoo config files.
+ *
+ * `loaded` (optional) lets callers pre-load and pre-filter the
+ * `ModelsConfig` so a caller-side filter (e.g. W0281's
+ * `filterModelsByRequest`) actually takes effect on the assembled
+ * providers — building providers from the unfiltered set would silently
+ * defeat the filter, since promptfoo decides which LLMs to call from the
+ * providers array, not the returned `models` field.
  */
-export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
+export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[], loaded?: ModelsConfig): ModelsAndProviders;
 /**
  * Resolve `maxToolRounds` for an agentic variant (W0134).
  *
@@ -60,3 +67,10 @@ export declare function loadModelsAndProviders(rootDir: string, source?: Resolve
  * > hard fallback (5).
  */
 export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
+/**
+ * Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
+ * that need to pre-filter the model set before provider assembly (e.g.
+ * `PipelineRequest.models`) can hand the filtered config back to
+ * `loadModelsAndProviders` via its optional `loaded` parameter.
+ */
+export declare function loadModelsYaml(rootDir: string): ModelsConfig;

package/dist/pipeline/compiler/provider-assembler.js CHANGED Viewed

@@ -64,9 +64,16 @@ function applyReplaySwap(providers) {
  * Returns provider arrays keyed by literacy variant name (baseline,
  * agentic, observed). These are consumed by the YAML writer to produce
  * the per-variant promptfoo config files.
+ *
+ * `loaded` (optional) lets callers pre-load and pre-filter the
+ * `ModelsConfig` so a caller-side filter (e.g. W0281's
+ * `filterModelsByRequest`) actually takes effect on the assembled
+ * providers — building providers from the unfiltered set would silently
+ * defeat the filter, since promptfoo decides which LLMs to call from the
+ * providers array, not the returned `models` field.
  */
-export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins) {
-    const models = loadModelsYaml(rootDir);
+export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins, loaded) {
+    const models = loaded ?? loadModelsYaml(rootDir);
     return {
         models,
         providers: {
@@ -203,6 +210,12 @@ export function resolveMaxToolRounds(models, model, variant) {
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
-function loadModelsYaml(rootDir) {
+/**
+ * Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
+ * that need to pre-filter the model set before provider assembly (e.g.
+ * `PipelineRequest.models`) can hand the filtered config back to
+ * `loadModelsAndProviders` via its optional `loaded` parameter.
+ */
+export function loadModelsYaml(rootDir) {
     return loadConfigFile("models", rootDir).data;
 }

package/dist/pipeline/failure-modes.d.ts CHANGED Viewed

@@ -1,24 +1,34 @@
 /**
  * pipeline/failure-modes.ts
  *
- * Ceiling-cross-check failure-mode validator + report assembly.
+ * Ceiling-cross-check failure-mode validator + report assembly + keyword
+ * fallback classifier.
  *
  * The grader emits `failureMode` directly under the per-dimension taxonomy
- * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
- * grader's emission as the source of truth and uses the surviving ceiling
- * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
- * cross-checks the emitted mode against structural score signals and emits
- * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
+ * (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
+ * is available to the pipeline. In practice (W0273 discovery), Promptfoo's
+ * `llm-rubric` post-processor extracts `score` + `reason` from the grader's
+ * JSON envelope and discards the rest of the structured surface — including
+ * `failureMode`. The wire-shape footer instructs the LLM correctly but the
+ * structured fields never reach `extractGraderJudgments`, so every emission
+ * arrives as the synthesized `failureMode: "unclassified"` placeholder.
  *
- * The legacy keyword-pattern classifier (and its five regex pattern
- * constants) was deleted in Plan 03-03 — its production coverage was ~1%
- * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
- * is explicitly out of scope.
+ * To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
+ * a keyword-pattern classifier is run as a FALLBACK when the grader's
+ * emitted mode is `"unclassified"` and the score is below the classification
+ * threshold. Plan 03-03 deleted this classifier in favor of grader-emission
+ * source-of-truth; W0273 reinstates it because the grader-emission path is
+ * blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
+ * (capturing the grader's full structured response) is tracked separately.
+ *
+ * `classifyByCeiling` continues to serve as the confidence cross-check.
  *
  * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
  *      failureMode under the per-dimension taxonomy
  * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
  *      shape and `ceiling-cross-check` derivation tag
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
+ *      cause (Promptfoo strips structured fields)
  */
 import type { Confidence } from "../_vendor/ailf-core/index.d.ts";
 import type { FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";

package/dist/pipeline/failure-modes.js CHANGED Viewed

@@ -1,24 +1,34 @@
 /**
  * pipeline/failure-modes.ts
  *
- * Ceiling-cross-check failure-mode validator + report assembly.
+ * Ceiling-cross-check failure-mode validator + report assembly + keyword
+ * fallback classifier.
  *
  * The grader emits `failureMode` directly under the per-dimension taxonomy
- * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
- * grader's emission as the source of truth and uses the surviving ceiling
- * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
- * cross-checks the emitted mode against structural score signals and emits
- * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
+ * (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
+ * is available to the pipeline. In practice (W0273 discovery), Promptfoo's
+ * `llm-rubric` post-processor extracts `score` + `reason` from the grader's
+ * JSON envelope and discards the rest of the structured surface — including
+ * `failureMode`. The wire-shape footer instructs the LLM correctly but the
+ * structured fields never reach `extractGraderJudgments`, so every emission
+ * arrives as the synthesized `failureMode: "unclassified"` placeholder.
  *
- * The legacy keyword-pattern classifier (and its five regex pattern
- * constants) was deleted in Plan 03-03 — its production coverage was ~1%
- * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
- * is explicitly out of scope.
+ * To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
+ * a keyword-pattern classifier is run as a FALLBACK when the grader's
+ * emitted mode is `"unclassified"` and the score is below the classification
+ * threshold. Plan 03-03 deleted this classifier in favor of grader-emission
+ * source-of-truth; W0273 reinstates it because the grader-emission path is
+ * blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
+ * (capturing the grader's full structured response) is tracked separately.
+ *
+ * `classifyByCeiling` continues to serve as the confidence cross-check.
  *
  * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
  *      failureMode under the per-dimension taxonomy
  * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
  *      shape and `ceiling-cross-check` derivation tag
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
+ *      cause (Promptfoo strips structured fields)
  */
 import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
 // ---------------------------------------------------------------------------
@@ -27,6 +37,20 @@ import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/in
 /** Only classify judgments with scores below this threshold */
 const CLASSIFICATION_THRESHOLD = 60;
 // ---------------------------------------------------------------------------
+// Keyword patterns (W0273 fallback)
+//
+// Verbatim from the pre-Plan-03-03 implementation. Used only when the
+// grader's emitted `failureMode` is `"unclassified"` — the grader's
+// emission still wins whenever it actually reaches the pipeline.
+// ---------------------------------------------------------------------------
+/** API error pattern — checked FIRST to prevent timeout errors containing
+ *  "deprecated" from being misclassified as outdated-docs. */
+const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|429|503|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
+const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
+const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
+const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
+const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
+// ---------------------------------------------------------------------------
 // Public API
 // ---------------------------------------------------------------------------
 /**
@@ -69,13 +93,25 @@ export function buildFailureModeReport(judgments, scores) {
         // grader's actual taxonomy choice rather than a collapsed
         // `"unclassified"` bucket.
         const emittedMode = readEmittedMode(judgment);
+        // W0273 fallback — when the grader's emitted mode is "unclassified"
+        // (the synthesized-unparsed-judgment placeholder; in practice this
+        // is every judgment today because Promptfoo's llm-rubric strips the
+        // grader's structured response), try keyword classification against
+        // the reason prose. Gated on score < CLASSIFICATION_THRESHOLD so
+        // passing judgments don't get spurious classifications.
+        const keywordFallback = emittedMode === "unclassified" &&
+            judgment.score < CLASSIFICATION_THRESHOLD
+            ? classifyByKeyword(judgment.reason)
+            : null;
         // Cross-check the grader's emission against ceiling decomposition.
         const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
-        const classification = {
-            confidence: stamp.level,
-            mode: emittedMode,
-            source: "ceiling",
-        };
+        const classification = keywordFallback
+            ? keywordFallback
+            : {
+                confidence: stamp.level,
+                mode: emittedMode,
+                source: "ceiling",
+            };
         classifiedJudgments.push({ classification, judgment });
         summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
         // Per-area tracking
@@ -282,6 +318,39 @@ function readEmittedMode(judgment) {
     }
     return emitted;
 }
+/**
+ * Classify the failure mode of a low-scoring grader judgment by matching
+ * keyword patterns against the reason prose.
+ *
+ * Patterns are checked in priority order — API errors first, so timeout
+ * messages that happen to contain "deprecated" are not misclassified as
+ * outdated-docs — and the first match wins:
+ *
+ *   1. API_ERROR_PATTERN      → "api-error"      (confidence "high")
+ *   2. OUTDATED_PATTERN       → "outdated-docs"  (confidence "high")
+ *   3. MISSING_PATTERN        → "missing-docs"   (confidence "high")
+ *   4. INCORRECT_PATTERN      → "incorrect-docs" (confidence "medium")
+ *   5. POOR_STRUCTURE_PATTERN → "poor-structure" (confidence "medium")
+ *
+ * W0273 — reinstated as a fallback when the grader's emitted failureMode
+ * is "unclassified". Plan 03-03 deleted this code in favor of grader-
+ * emission source-of-truth; the deletion is reversed here because
+ * Promptfoo's llm-rubric post-processor strips the grader's structured
+ * response (only score + reason survive into `comp.*`), so the
+ * grader-emission path produces 0% classification on every run.
+ *
+ * @param reason - The grader's free-text reason for the score. A missing,
+ *   empty, or non-string value is treated as "nothing to match".
+ * @returns A classification object `{ confidence, mode, source: "keyword" }`
+ *   for the first matching pattern, or `null` when no pattern matches or
+ *   there is no reason text to inspect.
+ */
+function classifyByKeyword(reason) {
+    // A judgment without usable reason prose must stay unclassified rather
+    // than throw on `.toLowerCase()` and abort the whole report build.
+    if (typeof reason !== "string" || reason.length === 0) {
+        return null;
+    }
+    const lower = reason.toLowerCase();
+    if (API_ERROR_PATTERN.test(lower)) {
+        return { confidence: "high", mode: "api-error", source: "keyword" };
+    }
+    if (OUTDATED_PATTERN.test(lower)) {
+        return { confidence: "high", mode: "outdated-docs", source: "keyword" };
+    }
+    if (MISSING_PATTERN.test(lower)) {
+        return { confidence: "high", mode: "missing-docs", source: "keyword" };
+    }
+    if (INCORRECT_PATTERN.test(lower)) {
+        return { confidence: "medium", mode: "incorrect-docs", source: "keyword" };
+    }
+    if (POOR_STRUCTURE_PATTERN.test(lower)) {
+        return { confidence: "medium", mode: "poor-structure", source: "keyword" };
+    }
+    return null;
+}
 /**
  * Classify by ceiling-decomposition structural signals — preserved
  * verbatim from the pre-Plan-03-03 implementation. The function itself

package/dist/pipeline/map-request-to-config.js CHANGED Viewed

@@ -37,6 +37,7 @@ export function mapRequestToConfig(request, rootDir) {
         mode,
         variant,
         debug: mapDebug(request.debug),
+        models: request.models,
         areas: request.areas,
         tasks: request.tasks,
         changedDocs: request.changedDocs,
@@ -46,6 +47,7 @@ export function mapRequestToConfig(request, rootDir) {
         compareEnabled: request.compare ?? false,
         compareThreshold: request.compareThreshold,
         compareBaseline: request.compareBaseline,
+        compareBaselineReportId: request.compareBaselineReportId,
         gapAnalysisEnabled: request.gapAnalysis ?? true,
         publishEnabled: request.publish ?? publishDefault,
         publishTag: request.publishTag,

package/dist/pipeline/normalize-mode.d.ts CHANGED Viewed

@@ -35,7 +35,7 @@ export type LiteracyVariantName = (typeof LiteracyVariant)[keyof typeof Literacy
 export type LiteracyEvalSubMode = typeof LiteracyVariant.STANDARD | typeof LiteracyVariant.AGENTIC;
 export interface NormalizedMode {
     mode: EvalMode;
-    variant?: string;
+    variant?: LiteracyVariantName;
 }
 /**
  * Normalize a raw CLI mode string to a canonical mode + optional variant.

package/dist/pipeline/normalize-mode.js CHANGED Viewed

@@ -55,6 +55,8 @@ const ALL_ACCEPTED = [
 export function normalizeMode(input) {
     if (LEGACY_LITERACY_VARIANTS.has(input)) {
         console.warn(`⚠ Deprecated: --mode ${input} is a legacy alias. Use --mode literacy --variant ${input} instead.`);
+        // The membership check above narrows `input` to LITERACY_VARIANTS — the
+        // cast is to the closed type, not a widening.
         return { mode: "literacy", variant: input };
     }
     if (CANONICAL_MODES.has(input)) {