@sanity/ailf 7.0.1 → 7.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/config/rubrics.ts +12 -13
  2. package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
  3. package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
  5. package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
  6. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  8. package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
  9. package/dist/_vendor/ailf-core/schemas/report.js +2 -0
  10. package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
  11. package/dist/_vendor/ailf-core/schemas/team.js +63 -0
  12. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
  13. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  14. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
  15. package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
  16. package/dist/_vendor/ailf-core/types/team.js +1 -0
  17. package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
  18. package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
  19. package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
  20. package/dist/_vendor/ailf-shared/event-types.js +23 -0
  21. package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
  22. package/dist/_vendor/ailf-shared/index.d.ts +4 -2
  23. package/dist/_vendor/ailf-shared/index.js +4 -2
  24. package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
  25. package/dist/_vendor/ailf-shared/member-roles.js +16 -0
  26. package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
  27. package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
  28. package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
  29. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
  30. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
  31. package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
  32. package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
  33. package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
  34. package/dist/adapters/task-sources/repo-task-source.js +2 -1
  35. package/dist/commands/pipeline-action.d.ts +4 -3
  36. package/dist/commands/pipeline-action.js +7 -5
  37. package/dist/commands/run.js +2 -2
  38. package/dist/config/rubrics.ts +12 -13
  39. package/dist/job-store.d.ts +18 -0
  40. package/dist/job-store.js +34 -0
  41. package/dist/orchestration/build-app-context.js +8 -1
  42. package/dist/orchestration/pipeline-orchestrator.js +46 -1
  43. package/dist/orchestration/steps/compare-step.d.ts +7 -0
  44. package/dist/orchestration/steps/compare-step.js +59 -23
  45. package/dist/orchestration/steps/fetch-docs-step.js +3 -0
  46. package/dist/orchestration/steps/finalize-run-step.js +2 -0
  47. package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
  48. package/dist/orchestration/steps/generate-configs-step.js +47 -13
  49. package/dist/orchestration/steps/grader-consistency-step.js +11 -0
  50. package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
  51. package/dist/orchestration/steps/publish-report-step.js +19 -3
  52. package/dist/pipeline/cache-hit-restore.d.ts +14 -1
  53. package/dist/pipeline/cache-hit-restore.js +17 -0
  54. package/dist/pipeline/calculate-scores.js +57 -21
  55. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
  56. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
  57. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
  58. package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
  59. package/dist/pipeline/compiler/provider-assembler.js +16 -3
  60. package/dist/pipeline/failure-modes.d.ts +20 -10
  61. package/dist/pipeline/failure-modes.js +84 -15
  62. package/dist/pipeline/map-request-to-config.js +2 -0
  63. package/dist/pipeline/normalize-mode.d.ts +1 -1
  64. package/dist/pipeline/normalize-mode.js +2 -0
  65. package/dist/pipeline/run-context.d.ts +16 -1
  66. package/dist/pipeline/run-context.js +12 -1
  67. package/dist/pipeline/validate.d.ts +8 -4
  68. package/dist/pipeline/validate.js +8 -18
  69. package/dist/report-store.d.ts +14 -1
  70. package/dist/report-store.js +32 -0
  71. package/dist/sanity/client.js +2 -2
  72. package/package.json +1 -1
@@ -4,6 +4,13 @@
4
4
  * This step is already pure (no execSync, no env vars) — the logic is
5
5
  * inlined directly from the former pipeline/steps/compare-step.ts.
6
6
  * This is an optional step — failure doesn't stop the pipeline.
7
+ *
8
+ * Baseline resolution order (highest priority first):
9
+ * 1. `compareBaselineReportId` — fetch the named report doc
10
+ * and use its `summary` (a ReportSummary, which is a
11
+ * superset of ComparableSummary) as the baseline.
12
+ * 2. `compareBaseline` — local filesystem path (CLI ergonomics).
13
+ * 3. Latest baseline in `results/baselines/`.
7
14
  */
8
15
  import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
9
16
  import { join, resolve } from "path";
@@ -29,39 +36,68 @@ export class CompareStep {
29
36
  }
30
37
  // Load experiment (current run)
31
38
  const experiment = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
32
- // Resolve baseline
33
- let resolvedBaselinePath;
34
- if (ctx.config.compareBaseline) {
35
- resolvedBaselinePath = resolve(ctx.config.compareBaseline);
36
- }
37
- else {
38
- const baselinesDir = resolve(rootDir, "results", "baselines");
39
- if (!existsSync(baselinesDir)) {
39
+ // Resolve baseline. Pinned report id wins over local FS, which wins
40
+ // over auto-discovery of the most recent file in `results/baselines/`.
41
+ let baseline;
42
+ const pinnedReportId = ctx.config.compareBaselineReportId;
43
+ if (pinnedReportId) {
44
+ if (!ctx.reportStore) {
40
45
  return {
41
- reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
46
+ reason: "compareBaselineReportId set but no reportStore is configured. " +
47
+ "Check Sanity credentials in .ailf/config.yaml.",
42
48
  status: "skipped",
43
49
  };
44
50
  }
45
- const files = readdirSync(baselinesDir)
46
- .filter((f) => f.endsWith(".json"))
47
- .sort()
48
- .reverse();
49
- if (files.length === 0) {
51
+ const result = await ctx.reportStore.loadBaselineFromReport(pinnedReportId);
52
+ if (result.kind === "error") {
53
+ return {
54
+ durationMs: Date.now() - start,
55
+ error: `Failed to load baseline report ${pinnedReportId}: ${result.message}`,
56
+ status: "failed",
57
+ };
58
+ }
59
+ if (result.kind === "not_found") {
50
60
  return {
51
- reason: "No baseline files found. Run 'pnpm baseline:save' first.",
61
+ reason: `Baseline report ${pinnedReportId} not found.`,
52
62
  status: "skipped",
53
63
  };
54
64
  }
55
- resolvedBaselinePath = join(baselinesDir, files[0]);
65
+ baseline = result.baseline;
56
66
  }
57
- if (!existsSync(resolvedBaselinePath)) {
58
- return {
59
- durationMs: Date.now() - start,
60
- error: `Baseline file not found: ${resolvedBaselinePath}`,
61
- status: "failed",
62
- };
67
+ else {
68
+ let resolvedBaselinePath;
69
+ if (ctx.config.compareBaseline) {
70
+ resolvedBaselinePath = resolve(ctx.config.compareBaseline);
71
+ }
72
+ else {
73
+ const baselinesDir = resolve(rootDir, "results", "baselines");
74
+ if (!existsSync(baselinesDir)) {
75
+ return {
76
+ reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
77
+ status: "skipped",
78
+ };
79
+ }
80
+ const files = readdirSync(baselinesDir)
81
+ .filter((f) => f.endsWith(".json"))
82
+ .sort()
83
+ .reverse();
84
+ if (files.length === 0) {
85
+ return {
86
+ reason: "No baseline files found. Run 'pnpm baseline:save' first.",
87
+ status: "skipped",
88
+ };
89
+ }
90
+ resolvedBaselinePath = join(baselinesDir, files[0]);
91
+ }
92
+ if (!existsSync(resolvedBaselinePath)) {
93
+ return {
94
+ durationMs: Date.now() - start,
95
+ error: `Baseline file not found: ${resolvedBaselinePath}`,
96
+ status: "failed",
97
+ };
98
+ }
99
+ baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
63
100
  }
64
- const baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
65
101
  // Run comparison
66
102
  const options = ctx.config.compareThreshold
67
103
  ? { noiseThreshold: ctx.config.compareThreshold }
@@ -37,6 +37,9 @@ export class FetchDocsStep {
37
37
  ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
38
38
  ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
39
39
  ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
40
+ ...(ctx.config.changedDocs?.length
41
+ ? { changedDocs: ctx.config.changedDocs }
42
+ : {}),
40
43
  };
41
44
  const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
42
45
  // Bridge: narrow to literacy tasks for canonical doc access
@@ -84,6 +84,8 @@ export class FinalizeRunStep {
84
84
  rootDir: ctx.config.rootDir,
85
85
  source: resolvedSource,
86
86
  taskIds: ctx.config.tasks,
87
+ variant: ctx.config.variant,
88
+ requestedModelIds: ctx.config.models,
87
89
  });
88
90
  // W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
89
91
  // `AccumulatingArtifactWriter`, which keeps a map of every ref any
@@ -8,7 +8,7 @@
8
8
  * When the variant is "full", the handler is called twice (baseline + agentic)
9
9
  * and three YAML files are written. Other modes produce one YAML file.
10
10
  */
11
- import { type AppContext, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
11
+ import { type AppContext, type ModelsConfig, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
12
12
  export declare class GenerateConfigsStep implements PipelineStep {
13
13
  readonly name = "generate-configs";
14
14
  /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
@@ -42,3 +42,34 @@ export declare class GenerateConfigsStep implements PipelineStep {
42
42
  cacheInputs(ctx: AppContext): string[];
43
43
  cacheContext(ctx: AppContext): string[];
44
44
  }
45
+ /**
46
+ * Merge multiple compile results into one.
47
+ *
48
+ * Note: `providers` and `prompts` are taken from the first result only.
49
+ * This is correct for single-mode compilation where all tasks share the
50
+ * same provider set. Cross-mode merging with per-task provider overrides
51
+ * would need deduplication here.
52
+ */
53
+ /**
54
+ * Apply `PipelineRequest.models` to the loaded model cohort (W0281).
55
+ *
56
+ * Returns one of three outcomes:
57
+ * - `unfiltered` — caller didn't pin any models; pass through.
58
+ * - `filtered` — at least one requested ID matched the cohort; unknown
59
+ * IDs are reported via a structured warning so callers
60
+ * can detect typos.
61
+ * - `no-match` — every requested ID is unknown. The caller routes this
62
+ * outcome into a failure path so the rejection reason
63
+ * surfaces on the job's `error` field instead of being dropped silently.
64
+ */
65
+ export type FilterModelsResult = {
66
+ kind: "unfiltered";
67
+ models: ModelsConfig;
68
+ } | {
69
+ kind: "filtered";
70
+ models: ModelsConfig;
71
+ } | {
72
+ kind: "no-match";
73
+ reason: string;
74
+ };
75
+ export declare function filterModelsByRequest(loaded: ModelsConfig, requested: string[] | undefined, logger: import("@sanity/ailf-core").Logger): FilterModelsResult;
@@ -67,12 +67,32 @@ export class GenerateConfigsStep {
67
67
  };
68
68
  }
69
69
  // Load models
70
- const { loadModelsAndProviders } = await import("../../pipeline/compiler/provider-assembler.js");
70
+ const { loadModelsAndProviders, loadModelsYaml } = await import("../../pipeline/compiler/provider-assembler.js");
71
71
  const overrides = configToSourceOverrides(ctx.config);
72
72
  const resolvedSource = ctx.config.source
73
73
  ? loadSource(ctx.config.source, overrides)
74
74
  : undefined;
75
- const { models, providers } = loadModelsAndProviders(ctx.config.rootDir, resolvedSource, ctx.config.searchMode, ctx.config.allowedOrigins);
75
+ // W0281: when the caller pinned a subset of models via
76
+ // `PipelineRequest.models`, filter the cohort BEFORE provider
77
+ // assembly. Filtering only the returned `models` field would silently
78
+ // defeat the filter — promptfoo decides which LLMs to call from the
79
+ // providers array, which is assembled from the unfiltered set unless
80
+ // we hand the assembler a pre-filtered ModelsConfig. Unknown IDs are
81
+ // surfaced via a structured warning AND a failed step result (whose
82
+ // message lands on `ailf.job.error`) so callers can detect typos
83
+ // instead of silently running the full default cohort.
84
+ const rawModels = loadModelsYaml(ctx.config.rootDir);
85
+ const filtered = filterModelsByRequest(rawModels, ctx.config.models, ctx.logger);
86
+ if (filtered.kind === "no-match") {
87
+ return {
88
+ durationMs: Date.now() - start,
89
+ error: filtered.reason,
90
+ status: "failed",
91
+ };
92
+ }
93
+ const loaded = loadModelsAndProviders(ctx.config.rootDir, resolvedSource, ctx.config.searchMode, ctx.config.allowedOrigins, filtered.models);
94
+ const models = loaded.models;
95
+ const providers = loaded.providers;
76
96
  // Literacy mode: variant expansion (baseline + agentic → 3 YAML files)
77
97
  if (mode === "literacy") {
78
98
  return this.compileLiteracyVariants(ctx, handler, tasks, models, providers, start);
@@ -239,6 +259,9 @@ export class GenerateConfigsStep {
239
259
  ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
240
260
  ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
241
261
  ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
262
+ ...(ctx.config.changedDocs?.length
263
+ ? { changedDocs: ctx.config.changedDocs }
264
+ : {}),
242
265
  };
243
266
  const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
244
267
  // Mode filter — the adapter may return a mixed-mode set (e.g. a user's
@@ -345,17 +368,28 @@ export class GenerateConfigsStep {
345
368
  return buildCacheContext(ctx.config);
346
369
  }
347
370
  }
348
- // ---------------------------------------------------------------------------
349
- // Helpers
350
- // ---------------------------------------------------------------------------
351
- /**
352
- * Merge multiple compile results into one.
353
- *
354
- * Note: `providers` and `prompts` are taken from the first result only.
355
- * This is correct for single-mode compilation where all tasks share the
356
- * same provider set. Cross-mode merging with per-task provider overrides
357
- * would need deduplication here.
358
- */
371
+ export function filterModelsByRequest(loaded, requested, logger) {
372
+ if (!requested || requested.length === 0) {
373
+ return { kind: "unfiltered", models: loaded };
374
+ }
375
+ const availableIds = new Set(loaded.models.map((m) => m.id));
376
+ const requestedSet = new Set(requested);
377
+ const kept = loaded.models.filter((m) => requestedSet.has(m.id));
378
+ const unknown = requested.filter((id) => !availableIds.has(id));
379
+ if (kept.length === 0) {
380
+ const reason = `[generate-configs] PipelineRequest.models rejected — none of ` +
381
+ `[${requested.join(", ")}] match config/models.ts. ` +
382
+ `Available IDs: ${[...availableIds].join(", ") || "(none configured)"}.`;
383
+ logger.warn(reason);
384
+ return { kind: "no-match", reason };
385
+ }
386
+ if (unknown.length > 0) {
387
+ logger.warn(`[generate-configs] PipelineRequest.models partial match — ignoring ` +
388
+ `unknown ID(s) [${unknown.join(", ")}]; ` +
389
+ `running ${kept.length}/${requested.length} requested.`);
390
+ }
391
+ return { kind: "filtered", models: { ...loaded, models: kept } };
392
+ }
359
393
  function mergeCompileResults(results) {
360
394
  const tests = results.flatMap((r) => r.tests);
361
395
  const warnings = results.flatMap((r) => r.warnings);
@@ -18,7 +18,18 @@ export class GraderConsistencyStep {
18
18
  }
19
19
  async execute(ctx) {
20
20
  const start = Date.now();
21
+ // Default-on-omit is 5 (matches consistency-analysis-friendly defaults).
22
+ // The dashboard sends 1 by default for cost reasons (see W0283 / new-eval
23
+ // audit S1-E). When the resolved value is <2, the analysis can't compute
24
+ // variance — skip instead of failing so the job doesn't carry a
25
+ // misleading `error.step: "grader-consistency"`.
21
26
  const replications = ctx.config.graderReplications ?? 5;
27
+ if (replications < 2) {
28
+ return {
29
+ reason: `graderReplications=${replications} (<2) — consistency analysis requires at least 2 replications`,
30
+ status: "skipped",
31
+ };
32
+ }
22
33
  const primaryResultsRun = ctx.config.mode === "literacy"
23
34
  ? ctx.config.variant === LiteracyVariant.FULL
24
35
  ? LiteracyVariant.STANDARD
@@ -10,7 +10,8 @@
10
10
  * - P5: Local-first (pipeline never fails because of a store write)
11
11
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
12
12
  */
13
- import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
13
+ import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
14
+ import { type ProvenanceInput } from "../../pipeline/provenance.js";
14
15
  export declare class PublishReportStep implements PipelineStep {
15
16
  private readonly pipelineStart;
16
17
  private readonly options;
@@ -24,3 +25,13 @@ export declare class PublishReportStep implements PipelineStep {
24
25
  check(): ValidationIssue[];
25
26
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
26
27
  }
28
+ /**
29
+ * Assemble provenance input from the score summary and pipeline context.
30
+ *
31
+ * Exported for unit testing — direct consumers should still call
32
+ * `buildProvenance` (which calls this transitively via the publish step).
33
+ */
34
+ export declare function buildProvenanceInput(summary: ScoreSummary, ctx: AppContext, options: {
35
+ evalFingerprint?: string;
36
+ promptfooUrls?: PromptfooUrlEntry[];
37
+ }, autoScope?: ReportAutoScope): ProvenanceInput;
@@ -194,21 +194,35 @@ export class PublishReportStep {
194
194
  // ---------------------------------------------------------------------------
195
195
  /**
196
196
  * Assemble provenance input from the score summary and pipeline context.
197
+ *
198
+ * Exported for unit testing — direct consumers should still call
199
+ * `buildProvenance` (which calls this transitively via the publish step).
197
200
  */
198
- function buildProvenanceInput(summary, ctx, options, autoScope) {
201
+ export function buildProvenanceInput(summary, ctx, options, autoScope) {
199
202
  const areas = summary.scores.map((s) => s.feature);
200
203
  const mode = ctx.config.mode;
201
204
  // Read document IDs from config
202
205
  const sanityDocumentIds = ctx.config.sanityDocumentArgs;
203
206
  // Read task filter from config
204
207
  const taskIds = ctx.config.tasks;
205
- // Build source from summary metadata or config
208
+ // Build source from summary metadata or config. Resolution order:
209
+ // 1. summary.source — written by calculate-scores after a successful
210
+ // `loadSource` round-trip.
211
+ // 2. ctx.config.source — the caller-requested source name. Preserves
212
+ // the user's intent when `loadSource` failed silently upstream
213
+ // (calculate-scores-step:104-108 swallows the throw, leaving
214
+ // summary.source undefined). Without this fallback, the report
215
+ // reads "production" regardless of what the dashboard sent.
216
+ // 3. "production" — last-resort built-in default.
217
+ if (summary.source?.name === undefined && ctx.config.source) {
218
+ ctx.logger.warn(`[publish-report] summary.source is missing; falling back to ctx.config.source="${ctx.config.source}" for provenance.source.name`);
219
+ }
206
220
  const source = {
207
221
  baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
208
222
  dataset: summary.source?.dataset ?? ctx.config.datasetOverride ?? "next",
209
223
  documentIds: [],
210
224
  llmsTxt: (summary.source?.baseUrl ?? "https://www.sanity.io/docs") + "/llms.txt",
211
- name: summary.source?.name ?? "production",
225
+ name: summary.source?.name ?? ctx.config.source ?? "production",
212
226
  perspective: summary.source?.perspective ??
213
227
  ctx.config.perspectiveOverride ??
214
228
  undefined,
@@ -235,6 +249,8 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
235
249
  source,
236
250
  sourceReportId: ctx.config.sourceReportId,
237
251
  taskIds,
252
+ variant: ctx.config.variant,
253
+ requestedModelIds: ctx.config.models,
238
254
  };
239
255
  }
240
256
  /**
@@ -8,7 +8,7 @@
8
8
  * @see docs/decisions/D0040-artifact-ref-source-run-id.md
9
9
  * @see docs/design-docs/cache-hit-artifact-restoration.md
10
10
  */
11
- import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
11
+ import { type ArtifactManifest, type RunId } from "../_vendor/ailf-core/index.d.ts";
12
12
  /**
13
13
  * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
14
14
  * that doesn't already carry one.
@@ -29,6 +29,19 @@ import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
29
29
  * `sourceRunId` equals the runId encoded in its `path` (= where the bytes
30
30
  * physically live).
31
31
  *
32
+ * **Post-hoc artifacts are dropped.** Refs whose descriptor has
33
+ * `writePolicy: "post-hoc"` (e.g. `diagnosis`) are skipped: the cached
34
+ * report's slot points at the *previous* run's path, but the post-hoc
35
+ * producer fires again on the new run and emits a fresh ref anchored at
36
+ * the new runId. Injecting the cached cross-run ref into the accumulator
37
+ * makes `FinalizeRunStep` embed the stale path into the on-GCS
38
+ * `runs/<newRunId>/manifest.json`; the post-hoc emit then only patches the
39
+ * Sanity report doc, leaving the GCS manifest stale. Dropping the ref
40
+ * here keeps the GCS manifest consistent with the cache-miss shape (no
41
+ * post-hoc slot until the post-hoc emit lands), and the reader-side
42
+ * fallback resolves diagnosis via the Sanity doc, which the post-hoc
43
+ * patch keeps correct.
44
+ *
32
45
  * Pure function; safe to call without side effects.
33
46
  */
34
47
  export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
@@ -8,6 +8,7 @@
8
8
  * @see docs/decisions/D0040-artifact-ref-source-run-id.md
9
9
  * @see docs/design-docs/cache-hit-artifact-restoration.md
10
10
  */
11
+ import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
11
12
  /**
12
13
  * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
13
14
  * that doesn't already carry one.
@@ -28,6 +29,19 @@
28
29
  * `sourceRunId` equals the runId encoded in its `path` (= where the bytes
29
30
  * physically live).
30
31
  *
32
+ * **Post-hoc artifacts are dropped.** Refs whose descriptor has
33
+ * `writePolicy: "post-hoc"` (e.g. `diagnosis`) are skipped: the cached
34
+ * report's slot points at the *previous* run's path, but the post-hoc
35
+ * producer fires again on the new run and emits a fresh ref anchored at
36
+ * the new runId. Injecting the cached cross-run ref into the accumulator
37
+ * makes `FinalizeRunStep` embed the stale path into the on-GCS
38
+ * `runs/<newRunId>/manifest.json`; the post-hoc emit then only patches the
39
+ * Sanity report doc, leaving the GCS manifest stale. Dropping the ref
40
+ * here keeps the GCS manifest consistent with the cache-miss shape (no
41
+ * post-hoc slot until the post-hoc emit lands), and the reader-side
42
+ * fallback resolves diagnosis via the Sanity doc, which the post-hoc
43
+ * patch keeps correct.
44
+ *
31
45
  * Pure function; safe to call without side effects.
32
46
  */
33
47
  export function remapToCacheHitRefs(source, opts) {
@@ -35,6 +49,9 @@ export function remapToCacheHitRefs(source, opts) {
35
49
  for (const [type, ref] of Object.entries(source)) {
36
50
  if (!ref)
37
51
  continue;
52
+ const descriptor = ARTIFACT_REGISTRY[type];
53
+ if (descriptor?.writePolicy === "post-hoc")
54
+ continue;
38
55
  const typed = ref;
39
56
  out[type] = {
40
57
  ...typed,
@@ -32,7 +32,7 @@ import { join } from "path";
32
32
  import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
33
33
  import { calculateCost } from "../agent-observer/pricing.js";
34
34
  import { ConsoleLogger } from "../adapters/loggers/index.js";
35
- import { GraderJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
35
+ import { GraderEmittedJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
36
36
  import { validateFailureMode } from "./failure-modes.js";
37
37
  import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
38
38
  import { checkResultsExist } from "./checks.js";
@@ -184,34 +184,70 @@ export function extractGraderJudgments(resultsPath, telemetry) {
184
184
  continue;
185
185
  }
186
186
  const score = parseRubricScore(comp);
187
- // Extract the reason text — the grader's reasoning. Plan 03-01
188
- // (D0045 trust boundary): the inline `JSON.parse + as`-cast at
189
- // this site is replaced with `GraderJudgmentSchema.safeParse`
190
- // so that grader output flows through a validated schema before
191
- // it enters the scoring pipeline. On parse failure we fall to
192
- // an `unclassified`-shape Phase 1 judgment built from the raw
193
- // reason string — NEVER fall back to the legacy parser (Pitfall
194
- // 4: strict and legacy schemas are deliberate siblings, not a
195
- // fall-through chain).
187
+ // Extract the reason text — the grader's reasoning. W0273 splits
188
+ // the parse boundary into a wire shape (`GraderEmittedJudgmentSchema` —
189
+ // only fields the LLM controls) and a storage shape
190
+ // (`GraderJudgmentSchema` — full strict surface). The pipeline
191
+ // parses against the wire shape, then synthesizes the pipeline-owned
192
+ // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
193
+ // hallucinationCheckedAgainst) plus the result-context fields
194
+ // (taskId, modelId, dimension) to build the full storage shape.
195
+ //
196
+ // On parse failure we fall to an `unclassified`-shape Phase 1
197
+ // judgment built from the raw reason string — NEVER fall back to
198
+ // the legacy parser (Pitfall 4: strict and legacy schemas are
199
+ // deliberate siblings, not a fall-through chain).
196
200
  const reasonRaw = comp.reason ?? "";
197
201
  let parsedJudgment = null;
198
202
  let reason = reasonRaw;
199
203
  if (reasonRaw) {
200
204
  try {
201
205
  const candidate = JSON.parse(reasonRaw);
202
- // The strict schema asserts the full GraderJudgment surface.
203
- // safeParse handles non-object inputs (number, array, etc.)
204
- // by failing — we don't pre-narrow here.
206
+ // The wire schema asserts only the LLM-emit subset. safeParse
207
+ // handles non-object inputs (number, array, etc.) by failing —
208
+ // we don't pre-narrow here.
205
209
  const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
206
- const result = GraderJudgmentSchema.safeParse({
207
- ...candidateObj,
208
- taskId,
209
- modelId,
210
- dimension: kind,
211
- });
210
+ const result = GraderEmittedJudgmentSchema.safeParse(candidateObj);
212
211
  if (result.success) {
213
- parsedJudgment = result.data;
214
- reason = result.data.reason;
212
+ const emitted = result.data;
213
+ parsedJudgment = {
214
+ // Result-context fields — pipeline-supplied:
215
+ taskId,
216
+ modelId,
217
+ dimension: kind,
218
+ // Wire-emitted fields — LLM-controlled:
219
+ score: emitted.score,
220
+ reason: emitted.reason,
221
+ failureMode: emitted.failureMode,
222
+ subJudgments: emitted.subJudgments,
223
+ docCitations: emitted.docCitations,
224
+ confidence: emitted.confidence,
225
+ ...(emitted.outputFailure && {
226
+ outputFailure: emitted.outputFailure,
227
+ }),
228
+ // Pipeline-owned fields — synthesized:
229
+ judgmentId: generateJudgmentId({
230
+ taskId,
231
+ modelId,
232
+ dimension: kind,
233
+ ...(telemetry?.runId ? { runId: telemetry.runId } : {}),
234
+ }),
235
+ // hallucinationCheckedAgainst is filled in later by
236
+ // populateHallucinationFields (gap-analysis-step.ts) — it
237
+ // needs the run.documentManifest union that isn't visible
238
+ // here. Empty array is the documented pre-fill placeholder.
239
+ hallucinationCheckedAgainst: [],
240
+ metadata: {
241
+ // graderModel is threaded via the existing
242
+ // telemetry.reliability channel. When upstream wires the
243
+ // real grader-provider alias into reliability.graderModel,
244
+ // it propagates here automatically; today it's "unknown"
245
+ // (matching the pre-W0273 synthesized-fallback default).
246
+ graderModel: telemetry?.reliability.graderModel ?? "unknown",
247
+ graderJudgmentsVersion,
248
+ },
249
+ };
250
+ reason = emitted.reason;
215
251
  }
216
252
  else {
217
253
  // Parse failure — drop to failureMode='unclassified' below.
@@ -6,7 +6,7 @@
6
6
  */
7
7
  import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
8
  import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
- import type { LiteracyCompileOptions } from "./types.js";
9
+ import type { LiteracyCompileOptions, RubricResolutionInput } from "./types.js";
10
10
  export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
11
11
  /**
12
12
  * Build baseline assertions matching the legacy expand-tasks behavior.
@@ -14,5 +14,10 @@ export declare function resolveAssertions(task: LiteracyTaskDefinition, options:
14
14
  * - "full": all assertions carried over
15
15
  * - "abbreviated": only first llm-rubric with shortened prompt
16
16
  * - "none": no assertions
17
+ *
18
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
19
+ * mode's synthetic rubric. Without it the abbreviated emission would fail
20
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
21
+ * subJudgments, docCitations, confidence).
17
22
  */
18
- export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none"): PromptfooAssertion[];
23
+ export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none", rubricConfig?: RubricResolutionInput): PromptfooAssertion[];
@@ -45,8 +45,10 @@ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalRefere
45
45
  return null;
46
46
  const template = rubricConfig.templates["doc-coverage"];
47
47
  const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
48
- const rubricValue = `${template.header}\n${scaleText}\n\n` +
49
- `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
48
+ // W0273: use the centralized wire-shape footer so the grader emission
49
+ // parses against GraderEmittedJudgmentSchema. The pre-W0273 short
50
+ // {score, reason} footer caused 100% parse failures starting 2026-05-11.
51
+ const rubricValue = `${template.header}\n${scaleText}\n\n` + `${rubricConfig.footer}`;
50
52
  // doc-coverage benefits from the same authoritative reference — the grader
51
53
  // needs the doc content to judge whether the candidate actually used what
52
54
  // was documented.
@@ -92,8 +94,13 @@ function buildDocCoverageRubricPrompt(rubric, reference) {
92
94
  * - "full": all assertions carried over
93
95
  * - "abbreviated": only first llm-rubric with shortened prompt
94
96
  * - "none": no assertions
97
+ *
98
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
99
+ * mode's synthetic rubric. Without it the legacy `{score, reason}` footer is used, and that emission would fail
100
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
101
+ * subJudgments, docCitations, confidence).
95
102
  */
96
- export function buildBaselineAssertions(goldAssertions, rubricMode) {
103
+ export function buildBaselineAssertions(goldAssertions, rubricMode, rubricConfig) {
97
104
  const mode = rubricMode ?? "full";
98
105
  if (mode === "none")
99
106
  return [];
@@ -106,10 +113,12 @@ export function buildBaselineAssertions(goldAssertions, rubricMode) {
106
113
  if (a.type === "llm-rubric") {
107
114
  if (!foundFirst) {
108
115
  foundFirst = true;
116
+ const footer = rubricConfig?.footer ??
117
+ 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}';
109
118
  abbreviated.push({
110
119
  type: "llm-rubric",
111
120
  value: "Score task completion from 0 to 100 (same criteria as above).\n" +
112
- 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
121
+ footer,
113
122
  ...(a.provider ? { provider: a.provider } : {}),
114
123
  });
115
124
  }
@@ -134,7 +134,7 @@ function buildTestCases(task, evalMode, options, warnings) {
134
134
  if (evalMode !== "agentic") {
135
135
  const baselineEnabled = task.baseline?.enabled !== false;
136
136
  if (baselineEnabled) {
137
- const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric);
137
+ const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric, options?.rubricConfig);
138
138
  tests.push({
139
139
  description: `${taskTitle} (baseline)`,
140
140
  vars: {
@@ -50,8 +50,15 @@ export interface ModelsAndProviders {
50
50
  * Returns provider arrays keyed by literacy variant name (baseline,
51
51
  * agentic, observed). These are consumed by the YAML writer to produce
52
52
  * the per-variant promptfoo config files.
53
+ *
54
+ * `loaded` (optional) lets callers pre-load and pre-filter the
55
+ * `ModelsConfig` so a caller-side filter (e.g. W0281's
56
+ * `filterModelsByRequest`) actually takes effect on the assembled
57
+ * providers — building providers from the unfiltered set would silently
58
+ * defeat the filter, since promptfoo decides which LLMs to call from the
59
+ * providers array, not the returned `models` field.
53
60
  */
54
- export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
61
+ export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[], loaded?: ModelsConfig): ModelsAndProviders;
55
62
  /**
56
63
  * Resolve `maxToolRounds` for an agentic variant (W0134).
57
64
  *
@@ -60,3 +67,10 @@ export declare function loadModelsAndProviders(rootDir: string, source?: Resolve
60
67
  * > hard fallback (5).
61
68
  */
62
69
  export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
70
+ /**
71
+ * Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
72
+ * that need to pre-filter the model set before provider assembly (e.g.
73
+ * `PipelineRequest.models`) can hand the filtered config back to
74
+ * `loadModelsAndProviders` via its optional `loaded` parameter.
75
+ */
76
+ export declare function loadModelsYaml(rootDir: string): ModelsConfig;