@sanity/ailf 7.0.1 → 7.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/config/rubrics.ts +12 -13
  2. package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
  3. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
  4. package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
  5. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
  6. package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
  7. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
  8. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  9. package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
  10. package/dist/_vendor/ailf-core/schemas/report.js +2 -0
  11. package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
  12. package/dist/_vendor/ailf-core/schemas/team.js +63 -0
  13. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
  14. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  15. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
  16. package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
  17. package/dist/_vendor/ailf-core/types/team.js +1 -0
  18. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
  19. package/dist/_vendor/ailf-shared/document-ref.js +23 -1
  20. package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
  21. package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
  22. package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
  23. package/dist/_vendor/ailf-shared/event-types.js +23 -0
  24. package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
  25. package/dist/_vendor/ailf-shared/index.d.ts +5 -3
  26. package/dist/_vendor/ailf-shared/index.js +5 -2
  27. package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
  28. package/dist/_vendor/ailf-shared/member-roles.js +16 -0
  29. package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
  30. package/dist/_vendor/ailf-shared/owner-teams.js +26 -6
  31. package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
  32. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
  33. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
  34. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
  35. package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
  36. package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
  37. package/dist/adapters/task-sources/content-lake-task-source.js +14 -8
  38. package/dist/adapters/task-sources/repo-task-source.js +2 -1
  39. package/dist/commands/pipeline-action.d.ts +4 -3
  40. package/dist/commands/pipeline-action.js +7 -5
  41. package/dist/commands/run.js +2 -2
  42. package/dist/config/rubrics.ts +12 -13
  43. package/dist/job-store.d.ts +18 -0
  44. package/dist/job-store.js +34 -0
  45. package/dist/orchestration/build-app-context.js +8 -1
  46. package/dist/orchestration/pipeline-orchestrator.js +46 -1
  47. package/dist/orchestration/steps/compare-step.d.ts +7 -0
  48. package/dist/orchestration/steps/compare-step.js +59 -23
  49. package/dist/orchestration/steps/fetch-docs-step.js +3 -0
  50. package/dist/orchestration/steps/finalize-run-step.js +2 -0
  51. package/dist/orchestration/steps/gap-analysis-step.js +9 -8
  52. package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
  53. package/dist/orchestration/steps/generate-configs-step.js +47 -13
  54. package/dist/orchestration/steps/grader-consistency-step.js +11 -0
  55. package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
  56. package/dist/orchestration/steps/publish-report-step.js +36 -8
  57. package/dist/pipeline/cache-hit-restore.d.ts +14 -1
  58. package/dist/pipeline/cache-hit-restore.js +17 -0
  59. package/dist/pipeline/calculate-scores.d.ts +13 -1
  60. package/dist/pipeline/calculate-scores.js +123 -29
  61. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
  62. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
  63. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
  64. package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
  65. package/dist/pipeline/compiler/provider-assembler.js +16 -3
  66. package/dist/pipeline/failure-modes.d.ts +20 -10
  67. package/dist/pipeline/failure-modes.js +84 -15
  68. package/dist/pipeline/map-request-to-config.js +2 -0
  69. package/dist/pipeline/normalize-mode.d.ts +1 -1
  70. package/dist/pipeline/normalize-mode.js +2 -0
  71. package/dist/pipeline/run-context.d.ts +16 -1
  72. package/dist/pipeline/run-context.js +12 -1
  73. package/dist/pipeline/validate.d.ts +8 -4
  74. package/dist/pipeline/validate.js +8 -18
  75. package/dist/report-store.d.ts +14 -1
  76. package/dist/report-store.js +32 -0
  77. package/dist/sanity/client.js +2 -2
  78. package/dist/sanity/queries.d.ts +1 -1
  79. package/dist/sanity/queries.js +1 -0
  80. package/dist/sources.js +40 -2
  81. package/package.json +1 -1
@@ -187,6 +187,13 @@ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudg
187
187
  * @param manifestSlugs - All slugs in the run's document manifest.
188
188
  */
189
189
  export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
190
+ /**
191
+ * Per-variant scoring profiles passed to {@link extractStoredTestResults}.
192
+ * Each profile maps dimension id → weight. Variants whose dimensions don't
193
+ * intersect the supplied keys yield `compositeScore: undefined` rather than
194
+ * a misleading 0.
195
+ */
196
+ export type StoredTestResultProfiles = Partial<Record<"gold" | "baseline", Record<string, number>>>;
190
197
  /**
191
198
  * Extract per-test results with model output from evaluation results.
192
199
  *
@@ -194,9 +201,14 @@ export declare function populateHallucinationFields(judgments: GraderJudgment[],
194
201
  * shape including response.output (truncated), latency, and cost.
195
202
  * One StoredTestResult per test × model combination.
196
203
  *
204
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
205
+ * the weighted mean of its dimension scores using the profile matching its
206
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
207
+ * behavior preserved.
208
+ *
197
209
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
198
210
  */
199
- export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
211
+ export declare function extractStoredTestResults(resultsPath: string, profiles?: StoredTestResultProfiles): StoredTestResult[];
200
212
  /**
201
213
  * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
202
214
  * resolver-health summary. Returns `undefined` when the run had no
@@ -32,7 +32,7 @@ import { join } from "path";
32
32
  import { classifyRubric, detectFeatureArea, extractUrlMetadata, generateJudgmentId, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
33
33
  import { calculateCost } from "../agent-observer/pricing.js";
34
34
  import { ConsoleLogger } from "../adapters/loggers/index.js";
35
- import { GraderJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
35
+ import { GraderEmittedJudgmentSchema, graderJudgmentsVersion, } from "../adapters/grader-outputs/promptfoo-grader-output.js";
36
36
  import { validateFailureMode } from "./failure-modes.js";
37
37
  import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
38
38
  import { checkResultsExist } from "./checks.js";
@@ -184,34 +184,70 @@ export function extractGraderJudgments(resultsPath, telemetry) {
184
184
  continue;
185
185
  }
186
186
  const score = parseRubricScore(comp);
187
- // Extract the reason text — the grader's reasoning. Plan 03-01
188
- // (D0045 trust boundary): the inline `JSON.parse + as`-cast at
189
- // this site is replaced with `GraderJudgmentSchema.safeParse`
190
- // so that grader output flows through a validated schema before
191
- // it enters the scoring pipeline. On parse failure we fall to
192
- // an `unclassified`-shape Phase 1 judgment built from the raw
193
- // reason string — NEVER fall back to the legacy parser (Pitfall
194
- // 4: strict and legacy schemas are deliberate siblings, not a
195
- // fall-through chain).
187
+ // Extract the reason text — the grader's reasoning. W0273 splits
188
+ // the parse boundary into a wire shape (`GraderEmittedJudgmentSchema` —
189
+ // only fields the LLM controls) and a storage shape
190
+ // (`GraderJudgmentSchema` — full strict surface). The pipeline
191
+ // parses against the wire shape, then synthesizes the pipeline-owned
192
+ // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
193
+ // hallucinationCheckedAgainst) plus the result-context fields
194
+ // (taskId, modelId, dimension) to build the full storage shape.
195
+ //
196
+ // On parse failure we fall to an `unclassified`-shape Phase 1
197
+ // judgment built from the raw reason string — NEVER fall back to
198
+ // the legacy parser (Pitfall 4: strict and legacy schemas are
199
+ // deliberate siblings, not a fall-through chain).
196
200
  const reasonRaw = comp.reason ?? "";
197
201
  let parsedJudgment = null;
198
202
  let reason = reasonRaw;
199
203
  if (reasonRaw) {
200
204
  try {
201
205
  const candidate = JSON.parse(reasonRaw);
202
- // The strict schema asserts the full GraderJudgment surface.
203
- // safeParse handles non-object inputs (number, array, etc.)
204
- // by failing — we don't pre-narrow here.
206
+ // The wire schema asserts only the LLM-emit subset. safeParse
207
+ // handles non-object inputs (number, array, etc.) by failing —
208
+ // we don't pre-narrow here.
205
209
  const candidateObj = candidate && typeof candidate === "object" ? candidate : {};
206
- const result = GraderJudgmentSchema.safeParse({
207
- ...candidateObj,
208
- taskId,
209
- modelId,
210
- dimension: kind,
211
- });
210
+ const result = GraderEmittedJudgmentSchema.safeParse(candidateObj);
212
211
  if (result.success) {
213
- parsedJudgment = result.data;
214
- reason = result.data.reason;
212
+ const emitted = result.data;
213
+ parsedJudgment = {
214
+ // Result-context fields — pipeline-supplied:
215
+ taskId,
216
+ modelId,
217
+ dimension: kind,
218
+ // Wire-emitted fields — LLM-controlled:
219
+ score: emitted.score,
220
+ reason: emitted.reason,
221
+ failureMode: emitted.failureMode,
222
+ subJudgments: emitted.subJudgments,
223
+ docCitations: emitted.docCitations,
224
+ confidence: emitted.confidence,
225
+ ...(emitted.outputFailure && {
226
+ outputFailure: emitted.outputFailure,
227
+ }),
228
+ // Pipeline-owned fields — synthesized:
229
+ judgmentId: generateJudgmentId({
230
+ taskId,
231
+ modelId,
232
+ dimension: kind,
233
+ ...(telemetry?.runId ? { runId: telemetry.runId } : {}),
234
+ }),
235
+ // hallucinationCheckedAgainst is filled in later by
236
+ // populateHallucinationFields (gap-analysis-step.ts) — it
237
+ // needs the run.documentManifest union that isn't visible
238
+ // here. Empty array is the documented pre-fill placeholder.
239
+ hallucinationCheckedAgainst: [],
240
+ metadata: {
241
+ // graderModel is threaded via the existing
242
+ // telemetry.reliability channel. When upstream wires the
243
+ // real grader-provider alias into reliability.graderModel,
244
+ // it propagates here automatically; today it's "unknown"
245
+ // (matching the pre-W0273 synthesized-fallback default).
246
+ graderModel: telemetry?.reliability.graderModel ?? "unknown",
247
+ graderJudgmentsVersion,
248
+ },
249
+ };
250
+ reason = emitted.reason;
215
251
  }
216
252
  else {
217
253
  // Parse failure — drop to failureMode='unclassified' below.
@@ -433,6 +469,26 @@ export function populateHallucinationFields(judgments, taskDocSlugs, manifestSlu
433
469
  * `responseOutputTruncated` still flips for the extreme tail.
434
470
  */
435
471
  const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
472
+ /**
473
+ * Weighted mean of dimension scores. Mirrors the dashboard's read-side
474
+ * fallback in `apps/dashboard/src/data/projections/test-entries.ts` so writer
475
+ * and reader stay aligned. Returns `undefined` when no dimension matches the
476
+ * profile (caller decides whether that signals misconfiguration).
477
+ */
478
+ function computeStoredCompositeScore(dimensions, weights) {
479
+ let weighted = 0;
480
+ let totalWeight = 0;
481
+ for (const dim of dimensions) {
482
+ const w = weights[dim.dimension];
483
+ if (w === undefined)
484
+ continue;
485
+ weighted += dim.score * w;
486
+ totalWeight += w;
487
+ }
488
+ if (totalWeight === 0)
489
+ return undefined;
490
+ return Math.round(weighted / totalWeight);
491
+ }
436
492
  /**
437
493
  * Extract per-test results with model output from evaluation results.
438
494
  *
@@ -440,9 +496,14 @@ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
440
496
  * shape including response.output (truncated), latency, and cost.
441
497
  * One StoredTestResult per test × model combination.
442
498
  *
499
+ * When `profiles` is provided, each entry's `compositeScore` is computed as
500
+ * the weighted mean of its dimension scores using the profile matching its
501
+ * detected `variant`. Without profiles, `compositeScore` is omitted — legacy
502
+ * behavior preserved.
503
+ *
443
504
  * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
444
505
  */
445
- export function extractStoredTestResults(resultsPath) {
506
+ export function extractStoredTestResults(resultsPath, profiles) {
446
507
  const results = readAndNormalizeResults(resultsPath);
447
508
  const testResults = [];
448
509
  for (const result of results) {
@@ -487,8 +548,13 @@ export function extractStoredTestResults(resultsPath) {
487
548
  dimensions.push({ dimension, reason, score });
488
549
  }
489
550
  const tokenUsage = result.response?.tokenUsage;
551
+ const profileForVariant = profiles?.[variant];
552
+ const compositeScore = profileForVariant
553
+ ? computeStoredCompositeScore(dimensions, profileForVariant)
554
+ : undefined;
490
555
  testResults.push({
491
556
  area,
557
+ ...(compositeScore !== undefined && { compositeScore }),
492
558
  cost: result.cost || undefined,
493
559
  dimensions,
494
560
  latencyMs: result.latencyMs,
@@ -1441,7 +1507,12 @@ export async function calculateAndWriteScores(options) {
1441
1507
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1442
1508
  }
1443
1509
  // Extract and persist per-test results (D0029: model output + metadata)
1444
- const testResults = extractStoredTestResults(baselineResultsPath);
1510
+ // Agent-harness produces a single profile shared across detected variants
1511
+ // (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
1512
+ const testResults = extractStoredTestResults(baselineResultsPath, {
1513
+ gold: agentProfile,
1514
+ baseline: agentProfile,
1515
+ });
1445
1516
  if (testResults.length > 0) {
1446
1517
  writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
1447
1518
  log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1498,7 +1569,13 @@ export async function calculateAndWriteScores(options) {
1498
1569
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
1499
1570
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1500
1571
  }
1501
- const testResults = extractStoredTestResults(baselineResultsPath);
1572
+ // Knowledge-probe deletes vars.docs in the compiler, so every entry's
1573
+ // detected variant is "baseline" — supply the probe profile under both
1574
+ // keys so the composite is populated regardless of detection.
1575
+ const testResults = extractStoredTestResults(baselineResultsPath, {
1576
+ gold: probeProfile,
1577
+ baseline: probeProfile,
1578
+ });
1502
1579
  if (testResults.length > 0) {
1503
1580
  writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
1504
1581
  log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
@@ -1512,9 +1589,15 @@ export async function calculateAndWriteScores(options) {
1512
1589
  // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
1513
1590
  const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
1514
1591
  const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
1592
+ // Hoisted so the post-scoring extractStoredTestResults call against the
1593
+ // agentic results file can attach the matching profile (W0291).
1594
+ const agenticProfile = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
1595
+ ? resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC)
1596
+ : undefined;
1515
1597
  log.debug("Loaded scoring profiles", {
1516
1598
  gold: goldProfile,
1517
1599
  baseline: baselineProfileWeights,
1600
+ ...(agenticProfile && { agentic: agenticProfile }),
1518
1601
  });
1519
1602
  const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
1520
1603
  log.debug("Baseline scores calculated", {
@@ -1541,7 +1624,8 @@ export async function calculateAndWriteScores(options) {
1541
1624
  let evaluationMode;
1542
1625
  if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1543
1626
  log.info(`\nReading agentic results from: ${agenticResultsPath}`);
1544
- const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
1627
+ // Non-null assertion safe — the outer guard hoisting agenticProfile uses
1628
+ // the same condition; if we entered this block, the profile was resolved.
1545
1629
  const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
1546
1630
  log.debug("Agentic scores calculated", {
1547
1631
  featureCount: Object.keys(agenticScores).length,
@@ -1645,11 +1729,21 @@ export async function calculateAndWriteScores(options) {
1645
1729
  });
1646
1730
  }
1647
1731
  }
1648
- // Extract and persist per-test results (D0029: model output + metadata)
1649
- const testResults = extractStoredTestResults(baselineResultsPath);
1650
- // In full mode, also extract test results from agentic results
1732
+ // Extract and persist per-test results (D0029: model output + metadata).
1733
+ // Literacy gold (with-docs) entries score against the default profile;
1734
+ // baseline (without-docs) entries score against the output-only profile.
1735
+ const testResults = extractStoredTestResults(baselineResultsPath, {
1736
+ gold: goldProfile,
1737
+ baseline: baselineProfileWeights,
1738
+ });
1739
+ // In full mode, also extract test results from agentic results — the
1740
+ // agentic file's gold entries score against the agentic profile while
1741
+ // baseline entries (if any leak through) still use the literacy baseline.
1651
1742
  if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1652
- const agenticTestResults = extractStoredTestResults(agenticResultsPath);
1743
+ const agenticTestResults = extractStoredTestResults(agenticResultsPath, {
1744
+ gold: agenticProfile,
1745
+ baseline: baselineProfileWeights,
1746
+ });
1653
1747
  testResults.push(...agenticTestResults);
1654
1748
  }
1655
1749
  if (testResults.length > 0) {
@@ -6,7 +6,7 @@
6
6
  */
7
7
  import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
8
  import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
- import type { LiteracyCompileOptions } from "./types.js";
9
+ import type { LiteracyCompileOptions, RubricResolutionInput } from "./types.js";
10
10
  export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
11
11
  /**
12
12
  * Build baseline assertions matching the legacy expand-tasks behavior.
@@ -14,5 +14,10 @@ export declare function resolveAssertions(task: LiteracyTaskDefinition, options:
14
14
  * - "full": all assertions carried over
15
15
  * - "abbreviated": only first llm-rubric with shortened prompt
16
16
  * - "none": no assertions
17
+ *
18
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
19
+ * mode's synthetic rubric. Without it the abbreviated emission would fail
20
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
21
+ * subJudgments, docCitations, confidence).
17
22
  */
18
- export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none"): PromptfooAssertion[];
23
+ export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none", rubricConfig?: RubricResolutionInput): PromptfooAssertion[];
@@ -45,8 +45,10 @@ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalRefere
45
45
  return null;
46
46
  const template = rubricConfig.templates["doc-coverage"];
47
47
  const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
48
- const rubricValue = `${template.header}\n${scaleText}\n\n` +
49
- `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
48
+ // W0273 — use the centralized wire-shape footer so the grader emission
49
+ // parses against GraderEmittedJudgmentSchema. The pre-W0273 short
50
+ // {score, reason} footer caused 100% parse failures starting 2026-05-11.
51
+ const rubricValue = `${template.header}\n${scaleText}\n\n` + `${rubricConfig.footer}`;
50
52
  // doc-coverage benefits from the same authoritative reference — the grader
51
53
  // needs the doc content to judge whether the candidate actually used what
52
54
  // was documented.
@@ -92,8 +94,13 @@ function buildDocCoverageRubricPrompt(rubric, reference) {
92
94
  * - "full": all assertions carried over
93
95
  * - "abbreviated": only first llm-rubric with shortened prompt
94
96
  * - "none": no assertions
97
+ *
98
+ * `rubricConfig` supplies the W0273 wire-shape footer for the abbreviated
99
+ * mode's synthetic rubric. Without it the abbreviated emission would fail
100
+ * `GraderEmittedJudgmentSchema.safeParse` (missing failureMode,
101
+ * subJudgments, docCitations, confidence).
95
102
  */
96
- export function buildBaselineAssertions(goldAssertions, rubricMode) {
103
+ export function buildBaselineAssertions(goldAssertions, rubricMode, rubricConfig) {
97
104
  const mode = rubricMode ?? "full";
98
105
  if (mode === "none")
99
106
  return [];
@@ -106,10 +113,12 @@ export function buildBaselineAssertions(goldAssertions, rubricMode) {
106
113
  if (a.type === "llm-rubric") {
107
114
  if (!foundFirst) {
108
115
  foundFirst = true;
116
+ const footer = rubricConfig?.footer ??
117
+ 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}';
109
118
  abbreviated.push({
110
119
  type: "llm-rubric",
111
120
  value: "Score task completion from 0 to 100 (same criteria as above).\n" +
112
- 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
121
+ footer,
113
122
  ...(a.provider ? { provider: a.provider } : {}),
114
123
  });
115
124
  }
@@ -134,7 +134,7 @@ function buildTestCases(task, evalMode, options, warnings) {
134
134
  if (evalMode !== "agentic") {
135
135
  const baselineEnabled = task.baseline?.enabled !== false;
136
136
  if (baselineEnabled) {
137
- const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric);
137
+ const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric, options?.rubricConfig);
138
138
  tests.push({
139
139
  description: `${taskTitle} (baseline)`,
140
140
  vars: {
@@ -50,8 +50,15 @@ export interface ModelsAndProviders {
50
50
  * Returns provider arrays keyed by literacy variant name (baseline,
51
51
  * agentic, observed). These are consumed by the YAML writer to produce
52
52
  * the per-variant promptfoo config files.
53
+ *
54
+ * `loaded` (optional) lets callers pre-load and pre-filter the
55
+ * `ModelsConfig` so a caller-side filter (e.g. W0281's
56
+ * `filterModelsByRequest`) actually takes effect on the assembled
57
+ * providers — building providers from the unfiltered set would silently
58
+ * defeat the filter, since promptfoo decides which LLMs to call from the
59
+ * providers array, not the returned `models` field.
53
60
  */
54
- export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
61
+ export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[], loaded?: ModelsConfig): ModelsAndProviders;
55
62
  /**
56
63
  * Resolve `maxToolRounds` for an agentic variant (W0134).
57
64
  *
@@ -60,3 +67,10 @@ export declare function loadModelsAndProviders(rootDir: string, source?: Resolve
60
67
  * > hard fallback (5).
61
68
  */
62
69
  export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
70
+ /**
71
+ * Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
72
+ * that need to pre-filter the model set before provider assembly (e.g.
73
+ * `PipelineRequest.models`) can hand the filtered config back to
74
+ * `loadModelsAndProviders` via its optional `loaded` parameter.
75
+ */
76
+ export declare function loadModelsYaml(rootDir: string): ModelsConfig;
@@ -64,9 +64,16 @@ function applyReplaySwap(providers) {
64
64
  * Returns provider arrays keyed by literacy variant name (baseline,
65
65
  * agentic, observed). These are consumed by the YAML writer to produce
66
66
  * the per-variant promptfoo config files.
67
+ *
68
+ * `loaded` (optional) lets callers pre-load and pre-filter the
69
+ * `ModelsConfig` so a caller-side filter (e.g. W0281's
70
+ * `filterModelsByRequest`) actually takes effect on the assembled
71
+ * providers — building providers from the unfiltered set would silently
72
+ * defeat the filter, since promptfoo decides which LLMs to call from the
73
+ * providers array, not the returned `models` field.
67
74
  */
68
- export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins) {
69
- const models = loadModelsYaml(rootDir);
75
+ export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins, loaded) {
76
+ const models = loaded ?? loadModelsYaml(rootDir);
70
77
  return {
71
78
  models,
72
79
  providers: {
@@ -203,6 +210,12 @@ export function resolveMaxToolRounds(models, model, variant) {
203
210
  // ---------------------------------------------------------------------------
204
211
  // Helpers
205
212
  // ---------------------------------------------------------------------------
206
- function loadModelsYaml(rootDir) {
213
+ /**
214
+ * Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
215
+ * that need to pre-filter the model set before provider assembly (e.g.
216
+ * `PipelineRequest.models`) can hand the filtered config back to
217
+ * `loadModelsAndProviders` via its optional `loaded` parameter.
218
+ */
219
+ export function loadModelsYaml(rootDir) {
207
220
  return loadConfigFile("models", rootDir).data;
208
221
  }
@@ -1,24 +1,34 @@
1
1
  /**
2
2
  * pipeline/failure-modes.ts
3
3
  *
4
- * Ceiling-cross-check failure-mode validator + report assembly.
4
+ * Ceiling-cross-check failure-mode validator + report assembly + keyword
5
+ * fallback classifier.
5
6
  *
6
7
  * The grader emits `failureMode` directly under the per-dimension taxonomy
7
- * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
8
- * grader's emission as the source of truth and uses the surviving ceiling
9
- * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
10
- * cross-checks the emitted mode against structural score signals and emits
11
- * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
8
+ * (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
9
+ * is available to the pipeline. In practice (W0273 discovery), Promptfoo's
10
+ * `llm-rubric` post-processor extracts `score` + `reason` from the grader's
11
+ * JSON envelope and discards the rest of the structured surface including
12
+ * `failureMode`. The wire-shape footer instructs the LLM correctly but the
13
+ * structured fields never reach `extractGraderJudgments`, so every emission
14
+ * arrives as the synthesized `failureMode: "unclassified"` placeholder.
12
15
  *
13
- * The legacy keyword-pattern classifier (and its five regex pattern
14
- * constants) was deleted in Plan 03-03 — its production coverage was ~1%
15
- * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
16
- * is explicitly out of scope.
16
+ * To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
17
+ * a keyword-pattern classifier is run as a FALLBACK when the grader's
18
+ * emitted mode is `"unclassified"` and the score is below the classification
19
+ * threshold. Plan 03-03 deleted this classifier in favor of grader-emission
20
+ * source-of-truth; W0273 reinstates it because the grader-emission path is
21
+ * blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
22
+ * (capturing the grader's full structured response) is tracked separately.
23
+ *
24
+ * `classifyByCeiling` continues to serve as the confidence cross-check.
17
25
  *
18
26
  * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
19
27
  * failureMode under the per-dimension taxonomy
20
28
  * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
21
29
  * shape and `ceiling-cross-check` derivation tag
30
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
31
+ * cause (Promptfoo strips structured fields)
22
32
  */
23
33
  import type { Confidence } from "../_vendor/ailf-core/index.d.ts";
24
34
  import type { FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
@@ -1,24 +1,34 @@
1
1
  /**
2
2
  * pipeline/failure-modes.ts
3
3
  *
4
- * Ceiling-cross-check failure-mode validator + report assembly.
4
+ * Ceiling-cross-check failure-mode validator + report assembly + keyword
5
+ * fallback classifier.
5
6
  *
6
7
  * The grader emits `failureMode` directly under the per-dimension taxonomy
7
- * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
8
- * grader's emission as the source of truth and uses the surviving ceiling
9
- * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
10
- * cross-checks the emitted mode against structural score signals and emits
11
- * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
8
+ * (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
9
+ * is available to the pipeline. In practice (W0273 discovery), Promptfoo's
10
+ * `llm-rubric` post-processor extracts `score` + `reason` from the grader's
11
+ * JSON envelope and discards the rest of the structured surface including
12
+ * `failureMode`. The wire-shape footer instructs the LLM correctly but the
13
+ * structured fields never reach `extractGraderJudgments`, so every emission
14
+ * arrives as the synthesized `failureMode: "unclassified"` placeholder.
12
15
  *
13
- * The legacy keyword-pattern classifier (and its five regex pattern
14
- * constants) was deleted in Plan 03-03 — its production coverage was ~1%
15
- * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
16
- * is explicitly out of scope.
16
+ * To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
17
+ * a keyword-pattern classifier is run as a FALLBACK when the grader's
18
+ * emitted mode is `"unclassified"` and the score is below the classification
19
+ * threshold. Plan 03-03 deleted this classifier in favor of grader-emission
20
+ * source-of-truth; W0273 reinstates it because the grader-emission path is
21
+ * blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
22
+ * (capturing the grader's full structured response) is tracked separately.
23
+ *
24
+ * `classifyByCeiling` continues to serve as the confidence cross-check.
17
25
  *
18
26
  * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
19
27
  * failureMode under the per-dimension taxonomy
20
28
  * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
21
29
  * shape and `ceiling-cross-check` derivation tag
30
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
31
+ * cause (Promptfoo strips structured fields)
22
32
  */
23
33
  import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
24
34
  // ---------------------------------------------------------------------------
@@ -27,6 +37,20 @@ import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/in
27
37
  /** Only classify judgments with scores below this threshold */
28
38
  const CLASSIFICATION_THRESHOLD = 60;
29
39
  // ---------------------------------------------------------------------------
40
+ // Keyword patterns (W0273 fallback)
41
+ //
42
+ // Verbatim from the pre-Plan-03-03 implementation. Used only when the
43
+ // grader's emitted `failureMode` is `"unclassified"` — the grader's
44
+ // emission still wins whenever it actually reaches the pipeline.
45
+ // ---------------------------------------------------------------------------
46
+ /** API error pattern — checked FIRST to prevent timeout errors containing
47
+ * "deprecated" from being misclassified as outdated-docs. */
48
+ const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|429|503|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
49
+ const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
50
+ const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
51
+ const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
52
+ const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
53
+ // ---------------------------------------------------------------------------
30
54
  // Public API
31
55
  // ---------------------------------------------------------------------------
32
56
  /**
@@ -69,13 +93,25 @@ export function buildFailureModeReport(judgments, scores) {
69
93
  // grader's actual taxonomy choice rather than a collapsed
70
94
  // `"unclassified"` bucket.
71
95
  const emittedMode = readEmittedMode(judgment);
96
+ // W0273 fallback — when the grader's emitted mode is "unclassified"
97
+ // (the synthesized-unparsed-judgment placeholder; in practice this
98
+ // is every judgment today because Promptfoo's llm-rubric strips the
99
+ // grader's structured response), try keyword classification against
100
+ // the reason prose. Gated on score < CLASSIFICATION_THRESHOLD so
101
+ // passing judgments don't get spurious classifications.
102
+ const keywordFallback = emittedMode === "unclassified" &&
103
+ judgment.score < CLASSIFICATION_THRESHOLD
104
+ ? classifyByKeyword(judgment.reason)
105
+ : null;
72
106
  // Cross-check the grader's emission against ceiling decomposition.
73
107
  const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
74
- const classification = {
75
- confidence: stamp.level,
76
- mode: emittedMode,
77
- source: "ceiling",
78
- };
108
+ const classification = keywordFallback
109
+ ? keywordFallback
110
+ : {
111
+ confidence: stamp.level,
112
+ mode: emittedMode,
113
+ source: "ceiling",
114
+ };
79
115
  classifiedJudgments.push({ classification, judgment });
80
116
  summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
81
117
  // Per-area tracking
@@ -282,6 +318,39 @@ function readEmittedMode(judgment) {
282
318
  }
283
319
  return emitted;
284
320
  }
321
+ /**
322
+ * Classify the failure mode of a low-scoring grader judgment by matching
323
+ * keyword patterns against the reason prose. Returns `null` when no
324
+ * pattern matches. Patterns checked in priority order (API errors first
325
+ * so timeout messages containing "deprecated" don't get misclassified
326
+ * as outdated-docs).
327
+ *
328
+ * W0273 — reinstated as a fallback when the grader's emitted failureMode
329
+ * is "unclassified". Plan 03-03 deleted this code in favor of grader-
330
+ * emission source-of-truth; the deletion is reversed here because
331
+ * Promptfoo's llm-rubric post-processor strips the grader's structured
332
+ * response (only score + reason survive into `comp.*`), so the
333
+ * grader-emission path produces 0% classification on every run.
334
+ */
335
+ function classifyByKeyword(reason) {
336
+ const lower = reason.toLowerCase();
337
+ if (API_ERROR_PATTERN.test(lower)) {
338
+ return { confidence: "high", mode: "api-error", source: "keyword" };
339
+ }
340
+ if (OUTDATED_PATTERN.test(lower)) {
341
+ return { confidence: "high", mode: "outdated-docs", source: "keyword" };
342
+ }
343
+ if (MISSING_PATTERN.test(lower)) {
344
+ return { confidence: "high", mode: "missing-docs", source: "keyword" };
345
+ }
346
+ if (INCORRECT_PATTERN.test(lower)) {
347
+ return { confidence: "medium", mode: "incorrect-docs", source: "keyword" };
348
+ }
349
+ if (POOR_STRUCTURE_PATTERN.test(lower)) {
350
+ return { confidence: "medium", mode: "poor-structure", source: "keyword" };
351
+ }
352
+ return null;
353
+ }
285
354
  /**
286
355
  * Classify by ceiling-decomposition structural signals — preserved
287
356
  * verbatim from the pre-Plan-03-03 implementation. The function itself
@@ -37,6 +37,7 @@ export function mapRequestToConfig(request, rootDir) {
37
37
  mode,
38
38
  variant,
39
39
  debug: mapDebug(request.debug),
40
+ models: request.models,
40
41
  areas: request.areas,
41
42
  tasks: request.tasks,
42
43
  changedDocs: request.changedDocs,
@@ -46,6 +47,7 @@ export function mapRequestToConfig(request, rootDir) {
46
47
  compareEnabled: request.compare ?? false,
47
48
  compareThreshold: request.compareThreshold,
48
49
  compareBaseline: request.compareBaseline,
50
+ compareBaselineReportId: request.compareBaselineReportId,
49
51
  gapAnalysisEnabled: request.gapAnalysis ?? true,
50
52
  publishEnabled: request.publish ?? publishDefault,
51
53
  publishTag: request.publishTag,
@@ -35,7 +35,7 @@ export type LiteracyVariantName = (typeof LiteracyVariant)[keyof typeof Literacy
35
35
  export type LiteracyEvalSubMode = typeof LiteracyVariant.STANDARD | typeof LiteracyVariant.AGENTIC;
36
36
  export interface NormalizedMode {
37
37
  mode: EvalMode;
38
- variant?: string;
38
+ variant?: LiteracyVariantName;
39
39
  }
40
40
  /**
41
41
  * Normalize a raw CLI mode string to a canonical mode + optional variant.
@@ -55,6 +55,8 @@ const ALL_ACCEPTED = [
55
55
  export function normalizeMode(input) {
56
56
  if (LEGACY_LITERACY_VARIANTS.has(input)) {
57
57
  console.warn(`⚠ Deprecated: --mode ${input} is a legacy alias. Use --mode literacy --variant ${input} instead.`);
58
+ // The membership check above narrows `input` to LITERACY_VARIANTS — the
59
+ // cast is to the closed type, not a widening.
58
60
  return { mode: "literacy", variant: input };
59
61
  }
60
62
  if (CANONICAL_MODES.has(input)) {