npm - @sanity/ailf - Versions diffs - 7.0.1 → 7.1.0 - Mend

@sanity/ailf 7.0.1 → 7.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)

package/config/rubrics.ts +12 -13
package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
package/dist/_vendor/ailf-core/schemas/report.js +2 -0
package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
package/dist/_vendor/ailf-core/schemas/team.js +63 -0
package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
package/dist/_vendor/ailf-core/types/team.js +1 -0
package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
package/dist/_vendor/ailf-shared/event-types.js +23 -0
package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
package/dist/_vendor/ailf-shared/index.d.ts +4 -2
package/dist/_vendor/ailf-shared/index.js +4 -2
package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
package/dist/_vendor/ailf-shared/member-roles.js +16 -0
package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
package/dist/adapters/task-sources/repo-task-source.js +2 -1
package/dist/commands/pipeline-action.d.ts +4 -3
package/dist/commands/pipeline-action.js +7 -5
package/dist/commands/run.js +2 -2
package/dist/config/rubrics.ts +12 -13
package/dist/job-store.d.ts +18 -0
package/dist/job-store.js +34 -0
package/dist/orchestration/build-app-context.js +8 -1
package/dist/orchestration/pipeline-orchestrator.js +46 -1
package/dist/orchestration/steps/compare-step.d.ts +7 -0
package/dist/orchestration/steps/compare-step.js +59 -23
package/dist/orchestration/steps/fetch-docs-step.js +3 -0
package/dist/orchestration/steps/finalize-run-step.js +2 -0
package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
package/dist/orchestration/steps/generate-configs-step.js +47 -13
package/dist/orchestration/steps/grader-consistency-step.js +11 -0
package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
package/dist/orchestration/steps/publish-report-step.js +19 -3
package/dist/pipeline/cache-hit-restore.d.ts +14 -1
package/dist/pipeline/cache-hit-restore.js +17 -0
package/dist/pipeline/calculate-scores.js +57 -21
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
package/dist/pipeline/compiler/provider-assembler.js +16 -3
package/dist/pipeline/failure-modes.d.ts +20 -10
package/dist/pipeline/failure-modes.js +84 -15
package/dist/pipeline/map-request-to-config.js +2 -0
package/dist/pipeline/normalize-mode.d.ts +1 -1
package/dist/pipeline/normalize-mode.js +2 -0
package/dist/pipeline/run-context.d.ts +16 -1
package/dist/pipeline/run-context.js +12 -1
package/dist/pipeline/validate.d.ts +8 -4
package/dist/pipeline/validate.js +8 -18
package/dist/report-store.d.ts +14 -1
package/dist/report-store.js +32 -0
package/dist/sanity/client.js +2 -2
package/package.json +1 -1

package/dist/orchestration/steps/compare-step.js CHANGED Viewed

@@ -4,6 +4,13 @@
  * This step is already pure (no execSync, no env vars) — the logic is
  * inlined directly from the former pipeline/steps/compare-step.ts.
  * This is an optional step — failure doesn't stop the pipeline.
+ *
+ * Baseline resolution order (highest priority first):
+ *  1. `compareBaselineReportId` — fetch the named report doc
+ *     and use its `summary` (a ReportSummary, which is a
+ *     superset of ComparableSummary) as the baseline.
+ *  2. `compareBaseline` — local filesystem path (CLI ergonomics).
+ *  3. Latest baseline in `results/baselines/`.
  */
 import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
 import { join, resolve } from "path";
@@ -29,39 +36,68 @@ export class CompareStep {
         }
         // Load experiment (current run)
         const experiment = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
-        // Resolve baseline
-        let resolvedBaselinePath;
-        if (ctx.config.compareBaseline) {
-            resolvedBaselinePath = resolve(ctx.config.compareBaseline);
-        }
-        else {
-            const baselinesDir = resolve(rootDir, "results", "baselines");
-            if (!existsSync(baselinesDir)) {
+        // Resolve baseline. Pinned report id wins over local FS, which wins
+        // over auto-discovery of the most recent file in `results/baselines/`.
+        let baseline;
+        const pinnedReportId = ctx.config.compareBaselineReportId;
+        if (pinnedReportId) {
+            if (!ctx.reportStore) {
                 return {
-                    reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
+                    reason: "compareBaselineReportId set but no reportStore is configured. " +
+                        "Check Sanity credentials in .ailf/config.yaml.",
                     status: "skipped",
                 };
             }
-            const files = readdirSync(baselinesDir)
-                .filter((f) => f.endsWith(".json"))
-                .sort()
-                .reverse();
-            if (files.length === 0) {
+            const result = await ctx.reportStore.loadBaselineFromReport(pinnedReportId);
+            if (result.kind === "error") {
+                return {
+                    durationMs: Date.now() - start,
+                    error: `Failed to load baseline report ${pinnedReportId}: ${result.message}`,
+                    status: "failed",
+                };
+            }
+            if (result.kind === "not_found") {
                 return {
-                    reason: "No baseline files found. Run 'pnpm baseline:save' first.",
+                    reason: `Baseline report ${pinnedReportId} not found.`,
                     status: "skipped",
                 };
             }
-            resolvedBaselinePath = join(baselinesDir, files[0]);
+            baseline = result.baseline;
         }
-        if (!existsSync(resolvedBaselinePath)) {
-            return {
-                durationMs: Date.now() - start,
-                error: `Baseline file not found: ${resolvedBaselinePath}`,
-                status: "failed",
-            };
+        else {
+            let resolvedBaselinePath;
+            if (ctx.config.compareBaseline) {
+                resolvedBaselinePath = resolve(ctx.config.compareBaseline);
+            }
+            else {
+                const baselinesDir = resolve(rootDir, "results", "baselines");
+                if (!existsSync(baselinesDir)) {
+                    return {
+                        reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
+                        status: "skipped",
+                    };
+                }
+                const files = readdirSync(baselinesDir)
+                    .filter((f) => f.endsWith(".json"))
+                    .sort()
+                    .reverse();
+                if (files.length === 0) {
+                    return {
+                        reason: "No baseline files found. Run 'pnpm baseline:save' first.",
+                        status: "skipped",
+                    };
+                }
+                resolvedBaselinePath = join(baselinesDir, files[0]);
+            }
+            if (!existsSync(resolvedBaselinePath)) {
+                return {
+                    durationMs: Date.now() - start,
+                    error: `Baseline file not found: ${resolvedBaselinePath}`,
+                    status: "failed",
+                };
+            }
+            baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
         }
-        const baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
         // Run comparison
         const options = ctx.config.compareThreshold
             ? { noiseThreshold: ctx.config.compareThreshold }

package/dist/orchestration/steps/fetch-docs-step.js CHANGED Viewed

@@ -37,6 +37,9 @@ export class FetchDocsStep {
             ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
             ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
             ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
+            ...(ctx.config.changedDocs?.length
+                ? { changedDocs: ctx.config.changedDocs }
+                : {}),
         };
         const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
         // Bridge: narrow to literacy tasks for canonical doc access

package/dist/orchestration/steps/finalize-run-step.js CHANGED Viewed

@@ -84,6 +84,8 @@ export class FinalizeRunStep {
             rootDir: ctx.config.rootDir,
             source: resolvedSource,
             taskIds: ctx.config.tasks,
+            variant: ctx.config.variant,
+            requestedModelIds: ctx.config.models,
         });
         // W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
         // `AccumulatingArtifactWriter`, which keeps a map of every ref any

package/dist/orchestration/steps/generate-configs-step.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@
  * When the variant is "full", the handler is called twice (baseline + agentic)
  * and three YAML files are written. Other modes produce one YAML file.
  */
-import { type AppContext, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type ModelsConfig, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class GenerateConfigsStep implements PipelineStep {
     readonly name = "generate-configs";
     /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
@@ -42,3 +42,34 @@ export declare class GenerateConfigsStep implements PipelineStep {
     cacheInputs(ctx: AppContext): string[];
     cacheContext(ctx: AppContext): string[];
 }
+/**
+ * Merge multiple compile results into one.
+ *
+ * Note: `providers` and `prompts` are taken from the first result only.
+ * This is correct for single-mode compilation where all tasks share the
+ * same provider set. Cross-mode merging with per-task provider overrides
+ * would need deduplication here.
+ */
+/**
+ * Apply `PipelineRequest.models` to the loaded model cohort (W0281).
+ *
+ * Returns one of three outcomes:
+ *   - `unfiltered` — caller didn't pin any models; pass through.
+ *   - `filtered`   — at least one requested ID matched the cohort; unknown
+ *                    IDs are reported via a structured warning so callers
+ *                    can detect typos.
+ *   - `no-match`   — every requested ID is unknown. Caller wired this
+ *                    step into a failure path so the rejection reason
+ *                    surfaces on the job's `error` field, not silently.
+ */
+export type FilterModelsResult = {
+    kind: "unfiltered";
+    models: ModelsConfig;
+} | {
+    kind: "filtered";
+    models: ModelsConfig;
+} | {
+    kind: "no-match";
+    reason: string;
+};
+export declare function filterModelsByRequest(loaded: ModelsConfig, requested: string[] | undefined, logger: import("@sanity/ailf-core").Logger): FilterModelsResult;

package/dist/orchestration/steps/generate-configs-step.js CHANGED Viewed

@@ -67,12 +67,32 @@ export class GenerateConfigsStep {
                 };
             }
             // Load models
-            const { loadModelsAndProviders } = await import("../../pipeline/compiler/provider-assembler.js");
+            const { loadModelsAndProviders, loadModelsYaml } = await import("../../pipeline/compiler/provider-assembler.js");
             const overrides = configToSourceOverrides(ctx.config);
             const resolvedSource = ctx.config.source
                 ? loadSource(ctx.config.source, overrides)
                 : undefined;
-            const { models, providers } = loadModelsAndProviders(ctx.config.rootDir, resolvedSource, ctx.config.searchMode, ctx.config.allowedOrigins);
+            // W0281: when the caller pinned a subset of models via
+            // `PipelineRequest.models`, filter the cohort BEFORE provider
+            // assembly. Filtering only the returned `models` field would silently
+            // defeat the filter — promptfoo decides which LLMs to call from the
+            // providers array, which is assembled from the unfiltered set unless
+            // we hand the assembler a pre-filtered ModelsConfig. Unknown IDs are
+            // surfaced via a structured warning AND a failed step result (whose
+            // message lands on `ailf.job.error`) so callers can detect typos
+            // instead of silently running the full default cohort.
+            const rawModels = loadModelsYaml(ctx.config.rootDir);
+            const filtered = filterModelsByRequest(rawModels, ctx.config.models, ctx.logger);
+            if (filtered.kind === "no-match") {
+                return {
+                    durationMs: Date.now() - start,
+                    error: filtered.reason,
+                    status: "failed",
+                };
+            }
+            const loaded = loadModelsAndProviders(ctx.config.rootDir, resolvedSource, ctx.config.searchMode, ctx.config.allowedOrigins, filtered.models);
+            const models = loaded.models;
+            const providers = loaded.providers;
             // Literacy mode: variant expansion (baseline + agentic → 3 YAML files)
             if (mode === "literacy") {
                 return this.compileLiteracyVariants(ctx, handler, tasks, models, providers, start);
@@ -239,6 +259,9 @@ export class GenerateConfigsStep {
             ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
             ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
             ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
+            ...(ctx.config.changedDocs?.length
+                ? { changedDocs: ctx.config.changedDocs }
+                : {}),
         };
         const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
         // Mode filter — the adapter may return a mixed-mode set (e.g. a user's
@@ -345,17 +368,28 @@ export class GenerateConfigsStep {
         return buildCacheContext(ctx.config);
     }
 }
-// ---------------------------------------------------------------------------
-// Helpers
-// ---------------------------------------------------------------------------
-/**
- * Merge multiple compile results into one.
- *
- * Note: `providers` and `prompts` are taken from the first result only.
- * This is correct for single-mode compilation where all tasks share the
- * same provider set. Cross-mode merging with per-task provider overrides
- * would need deduplication here.
- */
+export function filterModelsByRequest(loaded, requested, logger) {
+    if (!requested || requested.length === 0) {
+        return { kind: "unfiltered", models: loaded };
+    }
+    const availableIds = new Set(loaded.models.map((m) => m.id));
+    const requestedSet = new Set(requested);
+    const kept = loaded.models.filter((m) => requestedSet.has(m.id));
+    const unknown = requested.filter((id) => !availableIds.has(id));
+    if (kept.length === 0) {
+        const reason = `[generate-configs] PipelineRequest.models rejected — none of ` +
+            `[${requested.join(", ")}] match config/models.ts. ` +
+            `Available IDs: ${[...availableIds].join(", ") || "(none configured)"}.`;
+        logger.warn(reason);
+        return { kind: "no-match", reason };
+    }
+    if (unknown.length > 0) {
+        logger.warn(`[generate-configs] PipelineRequest.models partial match — ignoring ` +
+            `unknown ID(s) [${unknown.join(", ")}]; ` +
+            `running ${kept.length}/${requested.length} requested.`);
+    }
+    return { kind: "filtered", models: { ...loaded, models: kept } };
+}
 function mergeCompileResults(results) {
     const tests = results.flatMap((r) => r.tests);
     const warnings = results.flatMap((r) => r.warnings);

package/dist/orchestration/steps/grader-consistency-step.js CHANGED Viewed

@@ -18,7 +18,18 @@ export class GraderConsistencyStep {
     }
     async execute(ctx) {
         const start = Date.now();
+        // Default-on-omit is 5 (matches consistency-analysis-friendly defaults).
+        // The dashboard sends 1 by default for cost reasons (see W0283 / new-eval
+        // audit S1-E). When the resolved value is <2, the analysis can't compute
+        // variance — skip instead of failing so the job doesn't carry a
+        // misleading `error.step: "grader-consistency"`.
         const replications = ctx.config.graderReplications ?? 5;
+        if (replications < 2) {
+            return {
+                reason: `graderReplications=${replications} (<2) — consistency analysis requires at least 2 replications`,
+                status: "skipped",
+            };
+        }
         const primaryResultsRun = ctx.config.mode === "literacy"
             ? ctx.config.variant === LiteracyVariant.FULL
                 ? LiteracyVariant.STANDARD

package/dist/orchestration/steps/publish-report-step.d.ts CHANGED Viewed

@@ -10,7 +10,8 @@
  * - P5: Local-first (pipeline never fails because of a store write)
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
  */
-import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type ProvenanceInput } from "../../pipeline/provenance.js";
 export declare class PublishReportStep implements PipelineStep {
     private readonly pipelineStart;
     private readonly options;
@@ -24,3 +25,13 @@ export declare class PublishReportStep implements PipelineStep {
     check(): ValidationIssue[];
     execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
 }
+/**
+ * Assemble provenance input from the score summary and pipeline context.
+ *
+ * Exported for unit testing — direct consumers should still call
+ * `buildProvenance` (which calls this transitively via the publish step).
+ */
+export declare function buildProvenanceInput(summary: ScoreSummary, ctx: AppContext, options: {
+    evalFingerprint?: string;
+    promptfooUrls?: PromptfooUrlEntry[];
+}, autoScope?: ReportAutoScope): ProvenanceInput;

package/dist/orchestration/steps/publish-report-step.js CHANGED Viewed

@@ -194,21 +194,35 @@ export class PublishReportStep {
 // ---------------------------------------------------------------------------
 /**
  * Assemble provenance input from the score summary and pipeline context.
+ *
+ * Exported for unit testing — direct consumers should still call
+ * `buildProvenance` (which calls this transitively via the publish step).
  */
-function buildProvenanceInput(summary, ctx, options, autoScope) {
+export function buildProvenanceInput(summary, ctx, options, autoScope) {
     const areas = summary.scores.map((s) => s.feature);
     const mode = ctx.config.mode;
     // Read document IDs from config
     const sanityDocumentIds = ctx.config.sanityDocumentArgs;
     // Read task filter from config
     const taskIds = ctx.config.tasks;
-    // Build source from summary metadata or config
+    // Build source from summary metadata or config. Resolution order:
+    //   1. summary.source — written by calculate-scores after a successful
+    //      `loadSource` round-trip.
+    //   2. ctx.config.source — the caller-requested source name. Preserves
+    //      the user's intent when `loadSource` failed silently upstream
+    //      (calculate-scores-step:104-108 swallows the throw, leaving
+    //      summary.source undefined). Without this fallback, the report
+    //      reads "production" regardless of what the dashboard sent.
+    //   3. "production" — last-resort built-in default.
+    if (summary.source?.name === undefined && ctx.config.source) {
+        ctx.logger.warn(`[publish-report] summary.source is missing; falling back to ctx.config.source="${ctx.config.source}" for provenance.source.name`);
+    }
     const source = {
         baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
         dataset: summary.source?.dataset ?? ctx.config.datasetOverride ?? "next",
         documentIds: [],
         llmsTxt: (summary.source?.baseUrl ?? "https://www.sanity.io/docs") + "/llms.txt",
-        name: summary.source?.name ?? "production",
+        name: summary.source?.name ?? ctx.config.source ?? "production",
         perspective: summary.source?.perspective ??
             ctx.config.perspectiveOverride ??
             undefined,
@@ -235,6 +249,8 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
         source,
         sourceReportId: ctx.config.sourceReportId,
         taskIds,
+        variant: ctx.config.variant,
+        requestedModelIds: ctx.config.models,
     };
 }
 /**

package/dist/pipeline/cache-hit-restore.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@
  * @see docs/decisions/D0040-artifact-ref-source-run-id.md
  * @see docs/design-docs/cache-hit-artifact-restoration.md
  */
-import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
+import { type ArtifactManifest, type RunId } from "../_vendor/ailf-core/index.d.ts";
 /**
  * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
  * that doesn't already carry one.
@@ -29,6 +29,19 @@ import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
  * `sourceRunId` equals the runId encoded in its `path` (= where the bytes
  * physically live).
  *
+ * **Post-hoc artifacts are dropped.** Refs whose descriptor has
+ * `writePolicy: "post-hoc"` (e.g. `diagnosis`) are skipped: the cached
+ * report's slot points at the *previous* run's path, but the post-hoc
+ * producer fires again on the new run and emits a fresh ref anchored at
+ * the new runId. Injecting the cached cross-run ref into the accumulator
+ * makes `FinalizeRunStep` embed the stale path into the on-GCS
+ * `runs/<newRunId>/manifest.json`; the post-hoc emit then only patches the
+ * Sanity report doc, leaving the GCS manifest stale. Dropping the ref
+ * here keeps the GCS manifest consistent with the cache-miss shape (no
+ * post-hoc slot until the post-hoc emit lands), and the reader-side
+ * fallback resolves diagnosis via the Sanity doc, which the post-hoc
+ * patch keeps correct.
+ *
  * Pure function; safe to call without side effects.
  */
 export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {

package/dist/pipeline/cache-hit-restore.js CHANGED Viewed

@@ -8,6 +8,7 @@
  * @see docs/decisions/D0040-artifact-ref-source-run-id.md
  * @see docs/design-docs/cache-hit-artifact-restoration.md
  */
+import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
 /**
  * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
  * that doesn't already carry one.
@@ -28,6 +29,19 @@
  * `sourceRunId` equals the runId encoded in its `path` (= where the bytes
  * physically live).
  *
+ * **Post-hoc artifacts are dropped.** Refs whose descriptor has
+ * `writePolicy: "post-hoc"` (e.g. `diagnosis`) are skipped: the cached
+ * report's slot points at the *previous* run's path, but the post-hoc
+ * producer fires again on the new run and emits a fresh ref anchored at
+ * the new runId. Injecting the cached cross-run ref into the accumulator
+ * makes `FinalizeRunStep` embed the stale path into the on-GCS
+ * `runs/<newRunId>/manifest.json`; the post-hoc emit then only patches the
+ * Sanity report doc, leaving the GCS manifest stale. Dropping the ref
+ * here keeps the GCS manifest consistent with the cache-miss shape (no
+ * post-hoc slot until the post-hoc emit lands), and the reader-side
+ * fallback resolves diagnosis via the Sanity doc, which the post-hoc
+ * patch keeps correct.
+ *
  * Pure function; safe to call without side effects.
  */
 export function remapToCacheHitRefs(source, opts) {
@@ -35,6 +49,9 @@ export function remapToCacheHitRefs(source, opts) {
     for (const [type, ref] of Object.entries(source)) {
         if (!ref)
             continue;
+        const descriptor = ARTIFACT_REGISTRY[type];
+        if (descriptor?.writePolicy === "post-hoc")
+            continue;
         const typed = ref;
         out[type] = {
             ...typed,

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -32,7 +32,7 @@ import { join } from "path";
 import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
 import { calculateCost } from "../agent-observer/pricing.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
-import { GraderJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
+import { GraderEmittedJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
 import { validateFailureMode } from "./failure-modes.js";
 import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
 import { checkResultsExist } from "./checks.js";
@@ -184,34 +184,70 @@ export function extractGraderJudgments(resultsPath, telemetry) {
                 continue;
             }
             const score = parseRubricScore(comp);
-            // Extract the reason text — the grader's reasoning. Plan 03-01
-            // (D0045 trust boundary): the inline `JSON.parse + as`-cast at
-            // this site is replaced with `GraderJudgmentSchema.safeParse`
-            // so that grader output flows through a validated schema before
-            // it enters the scoring pipeline. On parse failure we fall to
-            // an `unclassified`-shape Phase 1 judgment built from the raw
-            // reason string — NEVER fall back to the legacy parser (Pitfall
-            // 4: strict and legacy schemas are deliberate siblings, not a
-            // fall-through chain).
+            // Extract the reason text — the grader's reasoning. W0273 splits
+            // the parse boundary into a wire shape (`GraderEmittedJudgmentSchema`
+            // — only fields the LLM controls) and a storage shape
+            // (`GraderJudgmentSchema` — full strict surface). The pipeline
+            // parses against the wire shape, then synthesizes the pipeline-owned
+            // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
+            // hallucinationCheckedAgainst) plus the result-context fields
+            // (taskId, modelId, dimension) to build the full storage shape.
+            //
+            // On parse failure we fall to an `unclassified`-shape Phase 1
+            // judgment built from the raw reason string — NEVER fall back to
+            // the legacy parser (Pitfall 4: strict and legacy schemas are
+            // deliberate siblings, not a fall-through chain).
             const reasonRaw = comp.reason ?? "";
             let parsedJudgment = null;
             let reason = reasonRaw;
             if (reasonRaw) {
                 try {
                     const candidate = JSON.parse(reasonRaw);
-                    // The strict schema asserts the full GraderJudgment surface.
-                    // safeParse handles non-object inputs (number, array, etc.)
-                    // by failing — we don't pre-narrow here.
+                    // The wire schema asserts only the LLM-emit subset. safeParse
+                    // handles non-object inputs (number, array, etc.) by failing —
+                    // we don't pre-narrow here.
                     const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
-                    const result = GraderJudgmentSchema.safeParse({
-                        ...candidateObj,
-                        taskId,
-                        modelId,
-                        dimension: kind,
-                    });
+                    const result = GraderEmittedJudgmentSchema.safeParse(candidateObj);
                     if (result.success) {
-                        parsedJudgment = result.data;
-                        reason = result.data.reason;
+                        const emitted = result.data;
+                        parsedJudgment = {
+                            // Result-context fields — pipeline-supplied:
+                            taskId,
+                            modelId,
+                            dimension: kind,
+                            // Wire-emitted fields — LLM-controlled:
+                            score: emitted.score,
+                            reason: emitted.reason,
+                            failureMode: emitted.failureMode,
+                            subJudgments: emitted.subJudgments,
+                            docCitations: emitted.docCitations,
+                            confidence: emitted.confidence,
+                            ...(emitted.outputFailure && {
+                                outputFailure: emitted.outputFailure,
+                            }),
+                            // Pipeline-owned fields — synthesized:
+                            judgmentId: generateJudgmentId({
+                                taskId,
+                                modelId,
+                                dimension: kind,
+                                ...(telemetry?.runId ? { runId: telemetry.runId } : {}),
+                            }),
+                            // hallucinationCheckedAgainst is filled in later by
+                            // populateHallucinationFields (gap-analysis-step.ts) — it
+                            // needs the run.documentManifest union that isn't visible
+                            // here. Empty array is the documented pre-fill placeholder.
+                            hallucinationCheckedAgainst: [],
+                            metadata: {
+                                // graderModel is threaded via the existing
+                                // telemetry.reliability channel. When upstream wires the
+                                // real grader-provider alias into reliability.graderModel,
+                                // it propagates here automatically; today it's "unknown"
+                                // (matching the pre-W0273 synthesized-fallback default).
+                                graderModel: telemetry?.reliability.graderModel ?? "unknown",
+                                graderJudgmentsVersion,
+                            },
+                        };
+                        reason = emitted.reason;
                     }
                     else {
                         // Parse failure — drop to failureMode='unclassified' below.

package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts CHANGED Viewed

@@ -6,7 +6,7 @@
  */
 import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
 import type { PromptfooAssertion } from "../../assertion-mapper.js";
-import type { LiteracyCompileOptions } from "./types.js";
+import type { LiteracyCompileOptions, RubricResolutionInput } from "./types.js";
 export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
 /**
  * Build baseline assertions matching the legacy expand-tasks behavior.
@@ -14,5 +14,10 @@ export declare function resolveAssertions(task: LiteracyTaskDefinition, options:
  * - "full": all assertions carried over
  * - "abbreviated": only first llm-rubric with shortened prompt
  * - "none": no assertions
+ *
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
+ * mode's synthetic rubric. Without it the abbreviated emission would fail
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
+ * subJudgments, docCitations, confidence).
  */
-export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none"): PromptfooAssertion[];
+export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none", rubricConfig?: RubricResolutionInput): PromptfooAssertion[];

package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js CHANGED Viewed

@@ -45,8 +45,10 @@ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalRefere
         return null;
     const template = rubricConfig.templates["doc-coverage"];
     const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
-    const rubricValue = `${template.header}\n${scaleText}\n\n` +
-        `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
+    // W0273 — use the centralized wire-shape footer so the grader emission
+    // parses against GraderEmittedJudgmentSchema. The pre-W0273 short
+    // {score, reason} footer caused 100% parse failures starting 2026-05-11.
+    const rubricValue = `${template.header}\n${scaleText}\n\n` + `${rubricConfig.footer}`;
     // doc-coverage benefits from the same authoritative reference — the grader
     // needs the doc content to judge whether the candidate actually used what
     // was documented.
@@ -92,8 +94,13 @@ function buildDocCoverageRubricPrompt(rubric, reference) {
  * - "full": all assertions carried over
  * - "abbreviated": only first llm-rubric with shortened prompt
  * - "none": no assertions
+ *
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
+ * mode's synthetic rubric. Without it the abbreviated emission would fail
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
+ * subJudgments, docCitations, confidence).
  */
-export function buildBaselineAssertions(goldAssertions, rubricMode) {
+export function buildBaselineAssertions(goldAssertions, rubricMode, rubricConfig) {
     const mode = rubricMode ?? "full";
     if (mode === "none")
         return [];
@@ -106,10 +113,12 @@ export function buildBaselineAssertions(goldAssertions, rubricMode) {
         if (a.type === "llm-rubric") {
             if (!foundFirst) {
                 foundFirst = true;
+                const footer = rubricConfig?.footer ??
+                    'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}';
                 abbreviated.push({
                     type: "llm-rubric",
                     value: "Score task completion from 0 to 100 (same criteria as above).\n" +
-                        'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
+                        footer,
                     ...(a.provider ? { provider: a.provider } : {}),
                 });
             }

package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js CHANGED Viewed

@@ -134,7 +134,7 @@ function buildTestCases(task, evalMode, options, warnings) {
     if (evalMode !== "agentic") {
         const baselineEnabled = task.baseline?.enabled !== false;
         if (baselineEnabled) {
-            const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric);
+            const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric, options?.rubricConfig);
             tests.push({
                 description: `${taskTitle} (baseline)`,
                 vars: {

package/dist/pipeline/compiler/provider-assembler.d.ts CHANGED Viewed

@@ -50,8 +50,15 @@ export interface ModelsAndProviders {
  * Returns provider arrays keyed by literacy variant name (baseline,
  * agentic, observed). These are consumed by the YAML writer to produce
  * the per-variant promptfoo config files.
+ *
+ * `loaded` (optional) lets callers pre-load and pre-filter the
+ * `ModelsConfig` so a caller-side filter (e.g. W0281's
+ * `filterModelsByRequest`) actually takes effect on the assembled
+ * providers — building providers from the unfiltered set would silently
+ * defeat the filter, since promptfoo decides which LLMs to call from the
+ * providers array, not the returned `models` field.
  */
-export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
+export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[], loaded?: ModelsConfig): ModelsAndProviders;
 /**
  * Resolve `maxToolRounds` for an agentic variant (W0134).
  *
@@ -60,3 +67,10 @@ export declare function loadModelsAndProviders(rootDir: string, source?: Resolve
  * > hard fallback (5).
  */
 export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
+/**
+ * Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
+ * that need to pre-filter the model set before provider assembly (e.g.
+ * `PipelineRequest.models`) can hand the filtered config back to
+ * `loadModelsAndProviders` via its optional `loaded` parameter.
+ */
+export declare function loadModelsYaml(rootDir: string): ModelsConfig;